From e53da4ab6a94d9e12119c90f1a6c653f7761ba00 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 22 Jul 2021 18:29:26 +0200 Subject: [PATCH 1/2] ARROW-11243: [C++] Recognize time types in CSV files * Allow reading CSV columns as time32 and time64 * Automatically infer "hh:mm" and "hh:mm:ss" as time32[s] --- cpp/src/arrow/csv/column_builder_test.cc | 18 +++++ cpp/src/arrow/csv/converter.cc | 17 +++-- cpp/src/arrow/csv/converter_test.cc | 62 +++++++++++++++ cpp/src/arrow/csv/inference_internal.h | 5 ++ cpp/src/arrow/util/value_parsing.h | 38 +++++++++- cpp/src/arrow/util/value_parsing_test.cc | 97 ++++++++++++++++++++++++ docs/source/cpp/csv.rst | 2 + docs/source/python/csv.rst | 3 +- python/pyarrow/tests/test_csv.py | 27 +++++++ 9 files changed, 259 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/csv/column_builder_test.cc b/cpp/src/arrow/csv/column_builder_test.cc index 9fa995350e2c..42e17cad7e60 100644 --- a/cpp/src/arrow/csv/column_builder_test.cc +++ b/cpp/src/arrow/csv/column_builder_test.cc @@ -400,6 +400,24 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkDate) { ArrayFromJSON(date32(), "[null]")}); } +TEST_F(InferringColumnBuilderTest, SingleChunkTime) { + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + + CheckInferred(tg, {{"", "01:23:45", "NA"}}, options, + {ArrayFromJSON(time32(TimeUnit::SECOND), "[null, 5025, null]")}); +} + +TEST_F(InferringColumnBuilderTest, MultipleChunkTime) { + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + auto type = time32(TimeUnit::SECOND); + + CheckInferred(tg, {{""}, {"01:23:45"}, {"NA"}}, options, + {ArrayFromJSON(type, "[null]"), ArrayFromJSON(type, "[5025]"), + ArrayFromJSON(type, "[null]")}); +} + TEST_F(InferringColumnBuilderTest, SingleChunkTimestamp) { auto options = ConvertOptions::Defaults(); auto tg = TaskGroup::MakeSerial(); diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index cb72b22b405c..b1cde12a28e2 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -130,7 +130,7 @@ struct ValueDecoder { protected: Trie null_trie_; - std::shared_ptr type_; + const std::shared_ptr type_; const ConvertOptions& options_; }; @@ -191,24 +191,29 @@ struct BinaryValueDecoder : public ValueDecoder { }; // -// Value decoder for integers and floats +// Value decoder for integers, floats and temporals // template struct NumericValueDecoder : public ValueDecoder { using value_type = typename T::c_type; - using ValueDecoder::ValueDecoder; + explicit NumericValueDecoder(const std::shared_ptr& type, + const ConvertOptions& options) + : ValueDecoder(type, options), concrete_type_(checked_cast(*type)) {} Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { // XXX should quoted values be allowed at all? TrimWhiteSpace(&data, &size); - if (ARROW_PREDICT_FALSE( - !internal::ParseValue(reinterpret_cast(data), size, out))) { + if (ARROW_PREDICT_FALSE(!internal::ParseValue( + concrete_type_, reinterpret_cast(data), size, out))) { return GenericConversionError(type_, data, size); } return Status::OK(); } + + protected: + const T& concrete_type_; }; // @@ -569,6 +574,8 @@ Result> Converter::Make(const std::shared_ptr)) CONVERTER_CASE(Type::BINARY, (PrimitiveConverter>)) diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc index 4bed649d558c..f59d184f7498 100644 --- a/cpp/src/arrow/csv/converter_test.cc +++ b/cpp/src/arrow/csv/converter_test.cc @@ -429,6 +429,11 @@ TEST(Date32Conversion, Nulls) { {{false, true}}); } +TEST(Date32Conversion, Errors) { + AssertConversionError(date32(), {"1945-06-31\n"}, {0}); + AssertConversionError(date32(), {"2020-13-01\n"}, {0}); +} + TEST(Date64Conversion, Basics) { AssertConversion(date64(), {"1945-05-08\n", "2020-03-15\n"}, {{-777945600000LL, 1584230400000LL}}); @@ -439,6 +444,63 @@ TEST(Date64Conversion, Nulls) { {{0, 1584230400000LL}}, {{false, true}}); } +TEST(Date64Conversion, Errors) { + AssertConversionError(date64(), {"1945-06-31\n"}, {0}); + AssertConversionError(date64(), {"2020-13-01\n"}, {0}); +} + +TEST(Time32Conversion, Seconds) { + const auto type = time32(TimeUnit::SECOND); + + AssertConversion(type, {"00:00\n", "00:00:00\n"}, {{0, 0}}); + AssertConversion(type, {"01:23:45\n", "23:45:43\n"}, + {{5025, 85543}}); + AssertConversion(type, {"N/A\n", "23:59:59\n"}, {{0, 86399}}, + {{false, true}}); + + AssertConversionError(type, {"24:00\n"}, {0}); + AssertConversionError(type, {"23:59:60\n"}, {0}); +} + +TEST(Time32Conversion, Millis) { + const auto type = time32(TimeUnit::MILLI); + + AssertConversion(type, {"00:00\n", "00:00:00\n"}, {{0, 0}}); + AssertConversion(type, {"01:23:45.1\n", "23:45:43.789\n"}, + {{5025100, 85543789}}); + AssertConversion(type, {"N/A\n", "23:59:59.999\n"}, + {{0, 86399999}}, {{false, true}}); + + AssertConversionError(type, {"24:00\n"}, {0}); + AssertConversionError(type, {"23:59:60\n"}, {0}); +} + +TEST(Time64Conversion, Micros) { + const auto type = time64(TimeUnit::MICRO); + + AssertConversion(type, {"00:00\n", "00:00:00\n"}, {{0LL, 0LL}}); + AssertConversion(type, {"01:23:45.1\n", "23:45:43.456789\n"}, + {{5025100000LL, 85543456789LL}}); + AssertConversion(type, {"N/A\n", "23:59:59.999999\n"}, + {{0, 86399999999LL}}, {{false, true}}); + + AssertConversionError(type, {"24:00\n"}, {0}); + AssertConversionError(type, {"23:59:60\n"}, {0}); +} + +TEST(Time64Conversion, Nanos) { + const auto type = time64(TimeUnit::NANO); + + AssertConversion(type, {"00:00\n", "00:00:00\n"}, {{0LL, 0LL}}); + AssertConversion(type, {"01:23:45.1\n", "23:45:43.123456789\n"}, + {{5025100000000LL, 85543123456789LL}}); + AssertConversion(type, {"N/A\n", "23:59:59.999999999\n"}, + {{0, 86399999999999LL}}, {{false, true}}); + + AssertConversionError(type, {"24:00\n"}, {0}); + AssertConversionError(type, {"23:59:60\n"}, {0}); +} + TEST(TimestampConversion, Basics) { auto type = timestamp(TimeUnit::SECOND); diff --git a/cpp/src/arrow/csv/inference_internal.h b/cpp/src/arrow/csv/inference_internal.h index 42486a1ebafb..1fd6d41b5cc9 100644 --- a/cpp/src/arrow/csv/inference_internal.h +++ b/cpp/src/arrow/csv/inference_internal.h @@ -32,6 +32,7 @@ enum class InferKind { Boolean, Real, Date, + Time, Timestamp, TimestampNS, TextDict, @@ -60,6 +61,8 @@ class InferStatus { case InferKind::Boolean: return SetKind(InferKind::Date); case InferKind::Date: + return SetKind(InferKind::Time); + case InferKind::Time: return SetKind(InferKind::Timestamp); case InferKind::Timestamp: return SetKind(InferKind::TimestampNS); @@ -114,6 +117,8 @@ class InferStatus { return make_converter(boolean()); case InferKind::Date: return make_converter(date32()); + case InferKind::Time: + return make_converter(time32(TimeUnit::SECOND)); case InferKind::Timestamp: return make_converter(timestamp(TimeUnit::SECOND)); case InferKind::TimestampNS: diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index 00295d1b51f3..02e6fa42e01a 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -719,7 +719,9 @@ struct StringConverter> { static bool Convert(const DATE_TYPE& type, const char* s, size_t length, value_type* out) { - if (length != 10) return false; + if (ARROW_PREDICT_FALSE(length != 10)) { + return false; + } duration_type since_epoch; if (ARROW_PREDICT_FALSE(!detail::ParseYYYY_MM_DD(s, &since_epoch))) { @@ -735,12 +737,36 @@ template struct StringConverter> { using value_type = typename TIME_TYPE::c_type; + // We allow the following formats for all units: + // - "hh:mm" + // - "hh:mm:ss" + // + // We allow the following formats for unit == MILLI, MICRO, or NANO: + // - "hh:mm:ss.s{1,3}" + // + // We allow the following formats for unit == MICRO, or NANO: + // - "hh:mm:ss.s{4,6}" + // + // We allow the following formats for unit == NANO: + // - "hh:mm:ss.s{7,9}" + static bool Convert(const TIME_TYPE& type, const char* s, size_t length, value_type* out) { - if (length < 8) return false; - auto unit = type.unit(); - + const auto unit = type.unit(); std::chrono::seconds since_midnight; + + if (length == 5) { + if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s, &since_midnight))) { + return false; + } + *out = + static_cast(util::CastSecondsToUnit(unit, since_midnight.count())); + return true; + } + + if (ARROW_PREDICT_FALSE(length < 8)) { + return false; + } if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s, &since_midnight))) { return false; } @@ -751,6 +777,10 @@ struct StringConverter> { return true; } + if (ARROW_PREDICT_FALSE(s[8] != '.')) { + return false; + } + uint32_t subseconds_count = 0; if (ARROW_PREDICT_FALSE( !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) { diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index e790a10acf1f..b5dc5619dede 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -264,6 +264,103 @@ TEST(StringConversion, ToDate64) { AssertConversion("0001-01-01", -62135596800000LL); } +template +void AssertInvalidTimes(const T& type) { + // Invalid time format + AssertConversionFails(type, ""); + AssertConversionFails(type, "00"); + AssertConversionFails(type, "00:"); + AssertConversionFails(type, "00:00:"); + AssertConversionFails(type, "00:00:00:"); + AssertConversionFails(type, "000000"); + AssertConversionFails(type, "000000.000"); + + // Invalid time value + AssertConversionFails(type, "24:00:00"); + AssertConversionFails(type, "00:60:00"); + AssertConversionFails(type, "00:00:60"); +} + +TEST(StringConversion, ToTime32) { + { + Time32Type type{TimeUnit::SECOND}; + + AssertConversion(type, "00:00", 0); + AssertConversion(type, "01:23", 4980); + AssertConversion(type, "23:59", 86340); + + AssertConversion(type, "00:00:00", 0); + AssertConversion(type, "01:23:45", 5025); + AssertConversion(type, "23:45:43", 85543); + AssertConversion(type, "23:59:59", 86399); + + AssertInvalidTimes(type); + // No subseconds allowed + AssertConversionFails(type, "00:00:00.123"); + } + { + Time32Type type{TimeUnit::MILLI}; + + AssertConversion(type, "00:00", 0); + AssertConversion(type, "01:23", 4980000); + AssertConversion(type, "23:59", 86340000); + + AssertConversion(type, "00:00:00", 0); + AssertConversion(type, "01:23:45", 5025000); + AssertConversion(type, "23:45:43", 85543000); + AssertConversion(type, "23:59:59", 86399000); + + AssertConversion(type, "00:00:00.123", 123); + AssertConversion(type, "01:23:45.000", 5025000); + AssertConversion(type, "01:23:45.1", 5025100); + AssertConversion(type, "01:23:45.123", 5025123); + AssertConversion(type, "01:23:45.999", 5025999); + + AssertInvalidTimes(type); + // Invalid subseconds + AssertConversionFails(type, "00:00:00.1234"); + } +} + +TEST(StringConversion, ToTime64) { + { + Time64Type type{TimeUnit::MICRO}; + + AssertConversion(type, "00:00:00", 0LL); + AssertConversion(type, "01:23:45", 5025000000LL); + AssertConversion(type, "23:45:43", 85543000000LL); + AssertConversion(type, "23:59:59", 86399000000LL); + + AssertConversion(type, "00:00:00.123456", 123456LL); + AssertConversion(type, "01:23:45.000000", 5025000000LL); + AssertConversion(type, "01:23:45.1", 5025100000LL); + AssertConversion(type, "01:23:45.123", 5025123000LL); + AssertConversion(type, "01:23:45.999999", 5025999999LL); + + AssertInvalidTimes(type); + // Invalid subseconds + AssertConversionFails(type, "00:00:00.1234567"); + } + { + Time64Type type{TimeUnit::NANO}; + + AssertConversion(type, "00:00:00", 0LL); + AssertConversion(type, "01:23:45", 5025000000000LL); + AssertConversion(type, "23:45:43", 85543000000000LL); + AssertConversion(type, "23:59:59", 86399000000000LL); + + AssertConversion(type, "00:00:00.123456789", 123456789LL); + AssertConversion(type, "01:23:45.000000000", 5025000000000LL); + AssertConversion(type, "01:23:45.1", 5025100000000LL); + AssertConversion(type, "01:23:45.1234", 5025123400000LL); + AssertConversion(type, "01:23:45.999999999", 5025999999999LL); + + AssertInvalidTimes(type); + // Invalid subseconds + AssertConversionFails(type, "00:00:00.1234567891"); + } +} + TEST(StringConversion, ToTimestampDate_ISO8601) { { TimestampType type{TimeUnit::SECOND}; diff --git a/docs/source/cpp/csv.rst b/docs/source/cpp/csv.rst index 2f92708e79f4..5ca17a1653b5 100644 --- a/docs/source/cpp/csv.rst +++ b/docs/source/cpp/csv.rst @@ -151,6 +151,7 @@ column. Type inference considers the following data types, in order: * Int64 * Boolean * Date32 +* Time32 (with seconds unit) * Timestamp (with seconds unit) * Timestamp (with nanoseconds unit) * Float64 @@ -169,6 +170,7 @@ can be chosen from the following list: * Decimal128 * Boolean * Date32 and Date64 +* Time32 and Time64 * Timestamp * Binary and Large Binary * String and Large String (with optional UTF8 input validation) diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst index 9c00027b0411..1724c63f417f 100644 --- a/docs/source/python/csv.rst +++ b/docs/source/python/csv.rst @@ -29,7 +29,8 @@ The features currently offered are the following: such as ``my_data.csv.gz``) * fetching column names from the first row in the CSV file * column-wise type inference and conversion to one of ``null``, ``int64``, - ``float64``, ``date32``, ``timestamp[s]``, ``timestamp[ns]``, ``string`` or ``binary`` data + ``float64``, ``date32``, ``time32[s]``, ``timestamp[s]``, ``timestamp[ns]``, + ``string`` or ``binary`` data * opportunistic dictionary encoding of ``string`` and ``binary`` columns (disabled by default) * detecting various spellings of null values such as ``NaN`` or ``#N/A`` diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 0c05c28290ea..2f3ef4fca9e7 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -894,6 +894,33 @@ def test_dates(self): 'b': [datetime(1970, 1, 2), datetime(1971, 1, 2)], } + def test_times(self): + # Times are inferred as time32[s] by default + from datetime import time + + rows = b"a,b\n12:34:56,12:34:56.789\n23:59:59,23:59:59.999\n" + table = self.read_bytes(rows) + # Column 'b' has subseconds, so cannot be inferred as time32[s] + schema = pa.schema([('a', pa.time32('s')), + ('b', pa.string())]) + assert table.schema == schema + assert table.to_pydict() == { + 'a': [time(12, 34, 56), time(23, 59, 59)], + 'b': ["12:34:56.789", "23:59:59.999"], + } + + # Can ask for time types explicitly + opts = ConvertOptions() + opts.column_types = {'a': pa.time64('us'), 'b': pa.time32('ms')} + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.time64('us')), + ('b', pa.time32('ms'))]) + assert table.schema == schema + assert table.to_pydict() == { + 'a': [time(12, 34, 56), time(23, 59, 59)], + 'b': [time(12, 34, 56, 789000), time(23, 59, 59, 999000)], + } + def test_auto_dict_encode(self): opts = ConvertOptions(auto_dict_encode=True) rows = "a,b\nab,1\ncdé,2\ncdé,3\nab,4".encode() From b103dd8d46ff7d46ae3d055ffae708f591ce9546 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 27 Jul 2021 10:03:37 +0200 Subject: [PATCH 2/2] Add inference tests with mixed types --- cpp/src/arrow/csv/column_builder_test.cc | 40 ++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/cpp/src/arrow/csv/column_builder_test.cc b/cpp/src/arrow/csv/column_builder_test.cc index 42e17cad7e60..7577c883e8ce 100644 --- a/cpp/src/arrow/csv/column_builder_test.cc +++ b/cpp/src/arrow/csv/column_builder_test.cc @@ -471,6 +471,46 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampNS) { options, expected); } +TEST_F(InferringColumnBuilderTest, SingleChunkIntegerAndTime) { + // Fallback to utf-8 + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + + CheckInferred(tg, {{"", "99", "01:23:45", "NA"}}, options, + {ArrayFromJSON(utf8(), R"(["", "99", "01:23:45", "NA"])")}); +} + +TEST_F(InferringColumnBuilderTest, MultipleChunkIntegerAndTime) { + // Fallback to utf-8 + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + auto type = utf8(); + + CheckInferred(tg, {{""}, {"99"}, {"01:23:45", "NA"}}, options, + {ArrayFromJSON(type, R"([""])"), ArrayFromJSON(type, R"(["99"])"), + ArrayFromJSON(type, R"(["01:23:45", "NA"])")}); +} + +TEST_F(InferringColumnBuilderTest, SingleChunkDateAndTime) { + // Fallback to utf-8 + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + + CheckInferred(tg, {{"", "01:23:45", "1998-04-05"}}, options, + {ArrayFromJSON(utf8(), R"(["", "01:23:45", "1998-04-05"])")}); +} + +TEST_F(InferringColumnBuilderTest, MultipleChunkDateAndTime) { + // Fallback to utf-8 + auto options = ConvertOptions::Defaults(); + auto tg = TaskGroup::MakeSerial(); + auto type = utf8(); + + CheckInferred(tg, {{""}, {"01:23:45"}, {"1998-04-05"}}, options, + {ArrayFromJSON(type, R"([""])"), ArrayFromJSON(type, R"(["01:23:45"])"), + ArrayFromJSON(type, R"(["1998-04-05"])")}); +} + TEST_F(InferringColumnBuilderTest, SingleChunkString) { auto options = ConvertOptions::Defaults(); auto tg = TaskGroup::MakeSerial();