From 0222def584bf383f1eb8435bc108ea5f540b28be Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Fri, 12 Feb 2021 23:09:35 +0300 Subject: [PATCH 1/3] Fix HiveOrcHadoopFsRelationSuite --- .../spark/sql/RandomDataGenerator.scala | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 4badcbaa89aa4..945efff3adce6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -182,25 +182,26 @@ object RandomDataGenerator { "1970-01-01", // the epoch date "9999-12-31" // the last supported date according to SQL standard ) + def getRandomDate(rand: Random): java.sql.Date = { + val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand)) + // The generated `date` is based on the hybrid calendar Julian + Gregorian since + // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used + // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to + // a local date in Proleptic Gregorian calendar to satisfy this requirement. Some + // years are leap years in Julian calendar but not in Proleptic Gregorian calendar. + // As the consequence of that, 29 February of such years might not exist in Proleptic + // Gregorian calendar. When this happens, we shift the date by one day. 
+ Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY)) + } if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) { randomNumeric[LocalDate]( rand, - (rand: Random) => LocalDate.ofEpochDay(uniformDaysRand(rand)), + (rand: Random) => getRandomDate(rand).toLocalDate, specialDates.map(LocalDate.parse)) } else { randomNumeric[java.sql.Date]( rand, - (rand: Random) => { - val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand)) - // The generated `date` is based on the hybrid calendar Julian + Gregorian since - // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used - // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to - // a local date in Proleptic Gregorian calendar to satisfy this requirement. Some - // years are leap years in Julian calendar but not in Proleptic Gregorian calendar. - // As the consequence of that, 29 February of such years might not exist in Proleptic - // Gregorian calendar. When this happens, we shift the date by one day. - Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY)) - }, + getRandomDate, specialDates.map(java.sql.Date.valueOf)) } case TimestampType => @@ -222,10 +223,22 @@ object RandomDataGenerator { "1970-01-01 00:00:00", // the epoch timestamp "9999-12-31 23:59:59" // the last supported timestamp according to SQL standard ) + def getRandomTimestamp(rand: Random): java.sql.Timestamp = { + // DateTimeUtils.toJavaTimestamp takes microsecond. + val ts = DateTimeUtils.toJavaTimestamp(uniformMicrosRand(rand)) + // The generated `ts` is based on the hybrid calendar Julian + Gregorian since + // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used + // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to + // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement. 
Some + // years are leap years in Julian calendar but not in Proleptic Gregorian calendar. + // As the consequence of that, 29 February of such years might not exist in Proleptic + // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day. + Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY)) + } if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) { randomNumeric[Instant]( rand, - (rand: Random) => DateTimeUtils.microsToInstant(uniformMicrosRand(rand)), + (rand: Random) => getRandomTimestamp(rand).toInstant, specialTs.map { s => val ldt = LocalDateTime.parse(s.replace(" ", "T")) ldt.atZone(ZoneId.systemDefault()).toInstant @@ -233,18 +246,7 @@ object RandomDataGenerator { } else { randomNumeric[java.sql.Timestamp]( rand, - (rand: Random) => { - // DateTimeUtils.toJavaTimestamp takes microsecond. - val ts = DateTimeUtils.toJavaTimestamp(uniformMicrosRand(rand)) - // The generated `ts` is based on the hybrid calendar Julian + Gregorian since - // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used - // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to - // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement. Some - // years are leap years in Julian calendar but not in Proleptic Gregorian calendar. - // As the consequence of that, 29 February of such years might not exist in Proleptic - // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day. 
- Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY)) - }, + getRandomTimestamp, specialTs.map(java.sql.Timestamp.valueOf)) } case CalendarIntervalType => Some(() => { From 3ab34228ce8d76ec9290ed16aa98cca479c3ac81 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 15 Feb 2021 17:14:28 +0300 Subject: [PATCH 2/3] Generate only valid Julian dates for ORC --- .../spark/sql/RandomDataGenerator.scala | 21 ++++++++++++++++--- .../sql/sources/HadoopFsRelationTest.scala | 13 +++++++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 945efff3adce6..1879d8ea3888f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -149,12 +149,15 @@ object RandomDataGenerator { * @param dataType the type to generate values for * @param nullable whether null values should be generated * @param rand an optional random number generator + * @param validJulianDatetime whether to generate dates and timestamps that are valid + * in the Julian calendar. * @return a function which can be called to generate random values. 
*/ def forType( dataType: DataType, nullable: Boolean = true, - rand: Random = new Random): Option[() => Any] = { + rand: Random = new Random, + validJulianDatetime: Boolean = false): Option[() => Any] = { val valueGenerator: Option[() => Any] = dataType match { case StringType => Some(() => rand.nextString(rand.nextInt(MAX_STR_LEN))) case BinaryType => Some(() => { @@ -196,7 +199,13 @@ object RandomDataGenerator { if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) { randomNumeric[LocalDate]( rand, - (rand: Random) => getRandomDate(rand).toLocalDate, + (rand: Random) => { + if (validJulianDatetime) { + getRandomDate(rand).toLocalDate + } else { + LocalDate.ofEpochDay(uniformDaysRand(rand)) + } + }, specialDates.map(LocalDate.parse)) } else { randomNumeric[java.sql.Date]( @@ -238,7 +247,13 @@ object RandomDataGenerator { if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) { randomNumeric[Instant]( rand, - (rand: Random) => getRandomTimestamp(rand).toInstant, + (rand: Random) => { + if (validJulianDatetime) { + getRandomTimestamp(rand).toInstant + } else { + DateTimeUtils.microsToInstant(uniformMicrosRand(rand)) + } + }, specialTs.map { s => val ldt = LocalDateTime.parse(s.replace(" ", "T")) ldt.atZone(ZoneId.systemDefault()).toInstant diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala index b65a00457c72c..9befd910fab31 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.sources import java.io.File +import java.util.Locale import scala.util.Random @@ -160,7 +161,17 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes val dataGenerator = RandomDataGenerator.forType( dataType = dataType, nullable = true, - new Random(seed) + new 
Random(seed), + // TODO(SPARK-34440): Allow saving/loading datetime in ORC w/o rebasing + // The ORC datasource always performs datetime rebasing that can lead to + // shifting of the original dates/timestamps. For instance, 1582-10-06 is a valid + // date in the Proleptic Gregorian calendar but it does not exist in the Julian + // calendar. The ORC datasource shifts the date to the next valid date 1582-10-15 + // during rebasing of this date to the Julian calendar. Since the test compares + // the original date before saving and the date loaded back from the ORC files, + // we set `validJulianDatetime` to `true` to generate only Proleptic Gregorian + // dates that exist in the Julian calendar and will not be changed during rebase. + validJulianDatetime = dataSourceName.toLowerCase(Locale.ROOT).contains("orc") ).getOrElse { fail(s"Failed to create data generator for schema $dataType") } From 2b24fc46138f64bb89552c6b7186eb7ccf05be83 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 15 Feb 2021 17:36:40 +0300 Subject: [PATCH 3/3] Generate via days/micros --- .../org/apache/spark/sql/RandomDataGenerator.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 1879d8ea3888f..0fc040ffb747b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -200,11 +200,12 @@ object RandomDataGenerator { randomNumeric[LocalDate]( rand, (rand: Random) => { - if (validJulianDatetime) { - getRandomDate(rand).toLocalDate + val days = if (validJulianDatetime) { + DateTimeUtils.fromJavaDate(getRandomDate(rand)) } else { - LocalDate.ofEpochDay(uniformDaysRand(rand)) + uniformDaysRand(rand) } + LocalDate.ofEpochDay(days) }, specialDates.map(LocalDate.parse)) } else { @@ -248,11 +249,12 @@
object RandomDataGenerator { randomNumeric[Instant]( rand, (rand: Random) => { - if (validJulianDatetime) { - getRandomTimestamp(rand).toInstant + val micros = if (validJulianDatetime) { + DateTimeUtils.fromJavaTimestamp(getRandomTimestamp(rand)) } else { - DateTimeUtils.microsToInstant(uniformMicrosRand(rand)) + uniformMicrosRand(rand) } + DateTimeUtils.microsToInstant(micros) }, specialTs.map { s => val ldt = LocalDateTime.parse(s.replace(" ", "T"))