From 0031ab3dff8211b95b7dd499f2eb911ca5769801 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Tue, 16 Feb 2021 11:53:26 +0900 Subject: [PATCH] [SPARK-34424][SQL][TESTS] Fix failures of HiveOrcHadoopFsRelationSuite Modify `RandomDataGenerator.forType()` to allow generation of dates/timestamps that are valid in both Julian and Proleptic Gregorian calendars. Currently, the function can produce a date (for example `1582-10-06`) which is valid in the Proleptic Gregorian calendar. Though it cannot be saved to ORC files AS IS since ORC format (ORC libs in fact) assumes Julian calendar. So, Spark shifts `1582-10-06` to the next valid date `1582-10-15` while saving it to ORC files. And as a consequence of that, the test fails because it compares original date `1582-10-06` and the date `1582-10-15` loaded back from the ORC files. In this PR, I propose to generate valid dates/timestamps in both calendars for ORC datasource till SPARK-34440 is resolved. The changes fix failures of `HiveOrcHadoopFsRelationSuite`. For instance, the test "test all data types" fails with the seed **610710213676**: ``` == Results == !== Correct Answer - 20 == == Spark Answer - 20 == struct struct ... ![9,1582-10-06] [9,1582-10-15] ``` No By running the modified test suite: ``` $ build/sbt -Phive -Phive-thriftserver "test:testOnly *HiveOrcHadoopFsRelationSuite" ``` Closes #31552 from MaxGekk/fix-HiveOrcHadoopFsRelationSuite. Authored-by: Max Gekk Signed-off-by: HyukjinKwon (cherry picked from commit 03161055de0c132070354407160553363175c4d7) Signed-off-by: Max Gekk --- .../spark/sql/RandomDataGenerator.scala | 73 ++++++++++++------- .../sql/sources/HadoopFsRelationTest.scala | 13 +++- 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 9fa27c7df3832..0fc040ffb747b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -149,12 +149,15 @@ object RandomDataGenerator { * @param dataType the type to generate values for * @param nullable whether null values should be generated * @param rand an optional random number generator + * @param validJulianDatetime whether to generate dates and timestamps that are valid + * in the Julian calendar. * @return a function which can be called to generate random values. */ def forType( dataType: DataType, nullable: Boolean = true, - rand: Random = new Random): Option[() => Any] = { + rand: Random = new Random, + validJulianDatetime: Boolean = false): Option[() => Any] = { val valueGenerator: Option[() => Any] = dataType match { case StringType => Some(() => rand.nextString(rand.nextInt(MAX_STR_LEN))) case BinaryType => Some(() => { @@ -182,29 +185,37 @@ object RandomDataGenerator { "1970-01-01", // the epoch date "9999-12-31" // the last supported date according to SQL standard ) + def getRandomDate(rand: Random): java.sql.Date = { + val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand)) + // The generated `date` is based on the hybrid calendar Julian + Gregorian since + // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used + // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to + // a local date in Proleptic Gregorian calendar to satisfy this requirement. Some + // years are leap years in Julian calendar but not in Proleptic Gregorian calendar. + // As the consequence of that, 29 February of such years might not exist in Proleptic + // Gregorian calendar. When this happens, we shift the date by one day. + Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY)) + } if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) { randomNumeric[LocalDate]( rand, - (rand: Random) => LocalDate.ofEpochDay(uniformDaysRand(rand)), + (rand: Random) => { + val days = if (validJulianDatetime) { + DateTimeUtils.fromJavaDate(getRandomDate(rand)) + } else { + uniformDaysRand(rand) + } + LocalDate.ofEpochDay(days) + }, specialDates.map(LocalDate.parse)) } else { randomNumeric[java.sql.Date]( rand, - (rand: Random) => { - val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand)) - // The generated `date` is based on the hybrid calendar Julian + Gregorian since - // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used - // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to - // a local date in Proleptic Gregorian calendar to satisfy this requirement. Some - // years are leap years in Julian calendar but not in Proleptic Gregorian calendar. - // As the consequence of that, 29 February of such years might not exist in Proleptic - // Gregorian calendar. When this happens, we shift the date by one day. - Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY)) - }, + getRandomDate, specialDates.map(java.sql.Date.valueOf)) } case TimestampType => - def uniformMicorsRand(rand: Random): Long = { + def uniformMicrosRand(rand: Random): Long = { var milliseconds = rand.nextLong() % 253402329599999L // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT // for "0001-01-01 00:00:00.000000". We need to find a @@ -222,10 +233,29 @@ object RandomDataGenerator { "1970-01-01 00:00:00", // the epoch timestamp "9999-12-31 23:59:59" // the last supported timestamp according to SQL standard ) + def getRandomTimestamp(rand: Random): java.sql.Timestamp = { + // DateTimeUtils.toJavaTimestamp takes microsecond. + val ts = DateTimeUtils.toJavaTimestamp(uniformMicrosRand(rand)) + // The generated `ts` is based on the hybrid calendar Julian + Gregorian since + // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used + // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to + // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement. Some + // years are leap years in Julian calendar but not in Proleptic Gregorian calendar. + // As the consequence of that, 29 February of such years might not exist in Proleptic + // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day. + Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY)) + } if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) { randomNumeric[Instant]( rand, - (rand: Random) => DateTimeUtils.microsToInstant(uniformMicorsRand(rand)), + (rand: Random) => { + val micros = if (validJulianDatetime) { + DateTimeUtils.fromJavaTimestamp(getRandomTimestamp(rand)) + } else { + uniformMicrosRand(rand) + } + DateTimeUtils.microsToInstant(micros) + }, specialTs.map { s => val ldt = LocalDateTime.parse(s.replace(" ", "T")) ldt.atZone(ZoneId.systemDefault()).toInstant @@ -233,18 +263,7 @@ object RandomDataGenerator { } else { randomNumeric[java.sql.Timestamp]( rand, - (rand: Random) => { - // DateTimeUtils.toJavaTimestamp takes microsecond. - val ts = DateTimeUtils.toJavaTimestamp(uniformMicorsRand(rand)) - // The generated `ts` is based on the hybrid calendar Julian + Gregorian since - // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used - // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to - // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement. Some - // years are leap years in Julian calendar but not in Proleptic Gregorian calendar. - // As the consequence of that, 29 February of such years might not exist in Proleptic - // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day. - Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY)) - }, + getRandomTimestamp, specialTs.map(java.sql.Timestamp.valueOf)) } case CalendarIntervalType => Some(() => { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala index b65a00457c72c..9befd910fab31 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.sources import java.io.File +import java.util.Locale import scala.util.Random @@ -160,7 +161,17 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes val dataGenerator = RandomDataGenerator.forType( dataType = dataType, nullable = true, - new Random(seed) + new Random(seed), + // TODO(SPARK-34440): Allow saving/loading datetime in ORC w/o rebasing + // The ORC datasource always performs datetime rebasing that can lead to + // shifting of the original dates/timestamps. For instance, 1582-10-06 is valid + // date in the Proleptic Gregorian calendar but it does not exist in the Julian + // calendar. The ORC datasource shifts the date to the next valid date 1582-10-15 + // during rebasing of this date to the Julian calendar. Since the test compares + // the original date before saving and the date loaded back from the ORC files, + // we set `validJulianDatetime` to `true` to generate only Proleptic Gregorian + // dates that exist in the Julian calendar and will be not changed during rebase. + validJulianDatetime = dataSourceName.toLowerCase(Locale.ROOT).contains("orc") ).getOrElse { fail(s"Failed to create data generator for schema $dataType") }