Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions cpp/src/arrow/csv/column_builder_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,24 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkDate) {
ArrayFromJSON(date32(), "[null]")});
}

TEST_F(InferringColumnBuilderTest, SingleChunkTime) {
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();

CheckInferred(tg, {{"", "01:23:45", "NA"}}, options,
Comment thread
bkietz marked this conversation as resolved.
Outdated
{ArrayFromJSON(time32(TimeUnit::SECOND), "[null, 5025, null]")});
}

TEST_F(InferringColumnBuilderTest, MultipleChunkTime) {
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();
auto type = time32(TimeUnit::SECOND);

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should there also be an inferring test that results in time64?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, there is no inference to time64. Should there be one (for times with nanosecond precision perhaps)?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see, I was looking at the wrong case statement, you're right. Maybe there should be one to catch any times with fractional seconds, though I don't feel strongly about it since you can explicitly declare the type/schema you want.


CheckInferred(tg, {{""}, {"01:23:45"}, {"NA"}}, options,
{ArrayFromJSON(type, "[null]"), ArrayFromJSON(type, "[5025]"),
ArrayFromJSON(type, "[null]")});
}

TEST_F(InferringColumnBuilderTest, SingleChunkTimestamp) {
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();
Expand Down Expand Up @@ -453,6 +471,46 @@ TEST_F(InferringColumnBuilderTest, MultipleChunkTimestampNS) {
options, expected);
}

TEST_F(InferringColumnBuilderTest, SingleChunkIntegerAndTime) {
// Fallback to utf-8
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();

CheckInferred(tg, {{"", "99", "01:23:45", "NA"}}, options,
{ArrayFromJSON(utf8(), R"(["", "99", "01:23:45", "NA"])")});
}

TEST_F(InferringColumnBuilderTest, MultipleChunkIntegerAndTime) {
// Fallback to utf-8
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();
auto type = utf8();

CheckInferred(tg, {{""}, {"99"}, {"01:23:45", "NA"}}, options,
{ArrayFromJSON(type, R"([""])"), ArrayFromJSON(type, R"(["99"])"),
ArrayFromJSON(type, R"(["01:23:45", "NA"])")});
}

TEST_F(InferringColumnBuilderTest, SingleChunkDateAndTime) {
// Fallback to utf-8
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();

CheckInferred(tg, {{"", "01:23:45", "1998-04-05"}}, options,
{ArrayFromJSON(utf8(), R"(["", "01:23:45", "1998-04-05"])")});
}

TEST_F(InferringColumnBuilderTest, MultipleChunkDateAndTime) {
// Fallback to utf-8
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();
auto type = utf8();

CheckInferred(tg, {{""}, {"01:23:45"}, {"1998-04-05"}}, options,
{ArrayFromJSON(type, R"([""])"), ArrayFromJSON(type, R"(["01:23:45"])"),
ArrayFromJSON(type, R"(["1998-04-05"])")});
}

TEST_F(InferringColumnBuilderTest, SingleChunkString) {
auto options = ConvertOptions::Defaults();
auto tg = TaskGroup::MakeSerial();
Expand Down
17 changes: 12 additions & 5 deletions cpp/src/arrow/csv/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ struct ValueDecoder {

protected:
Trie null_trie_;
std::shared_ptr<DataType> type_;
const std::shared_ptr<DataType> type_;
const ConvertOptions& options_;
};

Expand Down Expand Up @@ -191,24 +191,29 @@ struct BinaryValueDecoder : public ValueDecoder {
};

//
// Value decoder for integers and floats
// Value decoder for integers, floats and temporals
//

template <typename T>
struct NumericValueDecoder : public ValueDecoder {
using value_type = typename T::c_type;

using ValueDecoder::ValueDecoder;
explicit NumericValueDecoder(const std::shared_ptr<DataType>& type,
const ConvertOptions& options)
: ValueDecoder(type, options), concrete_type_(checked_cast<const T&>(*type)) {}

Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
// XXX should quoted values be allowed at all?
TrimWhiteSpace(&data, &size);
if (ARROW_PREDICT_FALSE(
!internal::ParseValue<T>(reinterpret_cast<const char*>(data), size, out))) {
if (ARROW_PREDICT_FALSE(!internal::ParseValue<T>(
concrete_type_, reinterpret_cast<const char*>(data), size, out))) {
return GenericConversionError(type_, data, size);
}
return Status::OK();
}

protected:
const T& concrete_type_;
};

//
Expand Down Expand Up @@ -569,6 +574,8 @@ Result<std::shared_ptr<Converter>> Converter::Make(const std::shared_ptr<DataTyp
NUMERIC_CONVERTER_CASE(Type::DOUBLE, DoubleType)
NUMERIC_CONVERTER_CASE(Type::DATE32, Date32Type)
NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type)
NUMERIC_CONVERTER_CASE(Type::TIME32, Time32Type)
NUMERIC_CONVERTER_CASE(Type::TIME64, Time64Type)
CONVERTER_CASE(Type::BOOL, (PrimitiveConverter<BooleanType, BooleanValueDecoder>))
CONVERTER_CASE(Type::BINARY,
(PrimitiveConverter<BinaryType, BinaryValueDecoder<false>>))
Expand Down
62 changes: 62 additions & 0 deletions cpp/src/arrow/csv/converter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,11 @@ TEST(Date32Conversion, Nulls) {
{{false, true}});
}

TEST(Date32Conversion, Errors) {
AssertConversionError(date32(), {"1945-06-31\n"}, {0});
AssertConversionError(date32(), {"2020-13-01\n"}, {0});
}

TEST(Date64Conversion, Basics) {
AssertConversion<Date64Type, int64_t>(date64(), {"1945-05-08\n", "2020-03-15\n"},
{{-777945600000LL, 1584230400000LL}});
Expand All @@ -439,6 +444,63 @@ TEST(Date64Conversion, Nulls) {
{{0, 1584230400000LL}}, {{false, true}});
}

TEST(Date64Conversion, Errors) {
AssertConversionError(date64(), {"1945-06-31\n"}, {0});
AssertConversionError(date64(), {"2020-13-01\n"}, {0});
}

TEST(Time32Conversion, Seconds) {
const auto type = time32(TimeUnit::SECOND);

AssertConversion<Time32Type, int32_t>(type, {"00:00\n", "00:00:00\n"}, {{0, 0}});
AssertConversion<Time32Type, int32_t>(type, {"01:23:45\n", "23:45:43\n"},
{{5025, 85543}});
AssertConversion<Time32Type, int32_t>(type, {"N/A\n", "23:59:59\n"}, {{0, 86399}},
{{false, true}});

AssertConversionError(type, {"24:00\n"}, {0});
AssertConversionError(type, {"23:59:60\n"}, {0});
}

TEST(Time32Conversion, Millis) {
const auto type = time32(TimeUnit::MILLI);

AssertConversion<Time32Type, int32_t>(type, {"00:00\n", "00:00:00\n"}, {{0, 0}});
AssertConversion<Time32Type, int32_t>(type, {"01:23:45.1\n", "23:45:43.789\n"},
{{5025100, 85543789}});
AssertConversion<Time32Type, int32_t>(type, {"N/A\n", "23:59:59.999\n"},
{{0, 86399999}}, {{false, true}});

AssertConversionError(type, {"24:00\n"}, {0});
AssertConversionError(type, {"23:59:60\n"}, {0});
}

TEST(Time64Conversion, Micros) {
const auto type = time64(TimeUnit::MICRO);

AssertConversion<Time64Type, int64_t>(type, {"00:00\n", "00:00:00\n"}, {{0LL, 0LL}});
AssertConversion<Time64Type, int64_t>(type, {"01:23:45.1\n", "23:45:43.456789\n"},
{{5025100000LL, 85543456789LL}});
AssertConversion<Time64Type, int64_t>(type, {"N/A\n", "23:59:59.999999\n"},
{{0, 86399999999LL}}, {{false, true}});

AssertConversionError(type, {"24:00\n"}, {0});
AssertConversionError(type, {"23:59:60\n"}, {0});
}

TEST(Time64Conversion, Nanos) {
const auto type = time64(TimeUnit::NANO);

AssertConversion<Time64Type, int64_t>(type, {"00:00\n", "00:00:00\n"}, {{0LL, 0LL}});
AssertConversion<Time64Type, int64_t>(type, {"01:23:45.1\n", "23:45:43.123456789\n"},
{{5025100000000LL, 85543123456789LL}});
AssertConversion<Time64Type, int64_t>(type, {"N/A\n", "23:59:59.999999999\n"},
{{0, 86399999999999LL}}, {{false, true}});

AssertConversionError(type, {"24:00\n"}, {0});
AssertConversionError(type, {"23:59:60\n"}, {0});
}

TEST(TimestampConversion, Basics) {
auto type = timestamp(TimeUnit::SECOND);

Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/csv/inference_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ enum class InferKind {
Boolean,
Real,
Date,
Time,
Timestamp,
TimestampNS,
TextDict,
Expand Down Expand Up @@ -60,6 +61,8 @@ class InferStatus {
case InferKind::Boolean:
return SetKind(InferKind::Date);
case InferKind::Date:
return SetKind(InferKind::Time);
case InferKind::Time:
return SetKind(InferKind::Timestamp);
case InferKind::Timestamp:
return SetKind(InferKind::TimestampNS);
Expand Down Expand Up @@ -114,6 +117,8 @@ class InferStatus {
return make_converter(boolean());
case InferKind::Date:
return make_converter(date32());
case InferKind::Time:
return make_converter(time32(TimeUnit::SECOND));
case InferKind::Timestamp:
return make_converter(timestamp(TimeUnit::SECOND));
case InferKind::TimestampNS:
Expand Down
38 changes: 34 additions & 4 deletions cpp/src/arrow/util/value_parsing.h
Original file line number Diff line number Diff line change
Expand Up @@ -719,7 +719,9 @@ struct StringConverter<DATE_TYPE, enable_if_date<DATE_TYPE>> {

static bool Convert(const DATE_TYPE& type, const char* s, size_t length,
value_type* out) {
if (length != 10) return false;
if (ARROW_PREDICT_FALSE(length != 10)) {
return false;
}

duration_type since_epoch;
if (ARROW_PREDICT_FALSE(!detail::ParseYYYY_MM_DD(s, &since_epoch))) {
Expand All @@ -735,12 +737,36 @@ template <typename TIME_TYPE>
struct StringConverter<TIME_TYPE, enable_if_time<TIME_TYPE>> {
using value_type = typename TIME_TYPE::c_type;

// We allow the following formats for all units:
// - "hh:mm"
// - "hh:mm:ss"
//
// We allow the following formats for unit == MILLI, MICRO, or NANO:
// - "hh:mm:ss.s{1,3}"
//
// We allow the following formats for unit == MICRO, or NANO:
// - "hh:mm:ss.s{4,6}"
//
// We allow the following formats for unit == NANO:
// - "hh:mm:ss.s{7,9}"

static bool Convert(const TIME_TYPE& type, const char* s, size_t length,
value_type* out) {
if (length < 8) return false;
auto unit = type.unit();

const auto unit = type.unit();
std::chrono::seconds since_midnight;

if (length == 5) {
if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s, &since_midnight))) {
return false;
}
*out =
static_cast<value_type>(util::CastSecondsToUnit(unit, since_midnight.count()));
return true;
}

if (ARROW_PREDICT_FALSE(length < 8)) {
return false;
}
if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s, &since_midnight))) {
return false;
}
Expand All @@ -751,6 +777,10 @@ struct StringConverter<TIME_TYPE, enable_if_time<TIME_TYPE>> {
return true;
}

if (ARROW_PREDICT_FALSE(s[8] != '.')) {
return false;
}

uint32_t subseconds_count = 0;
if (ARROW_PREDICT_FALSE(
!detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
Expand Down
97 changes: 97 additions & 0 deletions cpp/src/arrow/util/value_parsing_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,103 @@ TEST(StringConversion, ToDate64) {
AssertConversion<Date64Type>("0001-01-01", -62135596800000LL);
}

template <typename T>
void AssertInvalidTimes(const T& type) {
// Invalid time format
AssertConversionFails(type, "");
AssertConversionFails(type, "00");
AssertConversionFails(type, "00:");
AssertConversionFails(type, "00:00:");
AssertConversionFails(type, "00:00:00:");
AssertConversionFails(type, "000000");
AssertConversionFails(type, "000000.000");

// Invalid time value
AssertConversionFails(type, "24:00:00");
AssertConversionFails(type, "00:60:00");
AssertConversionFails(type, "00:00:60");
}

TEST(StringConversion, ToTime32) {
{
Time32Type type{TimeUnit::SECOND};

AssertConversion(type, "00:00", 0);
AssertConversion(type, "01:23", 4980);
AssertConversion(type, "23:59", 86340);

AssertConversion(type, "00:00:00", 0);
AssertConversion(type, "01:23:45", 5025);
AssertConversion(type, "23:45:43", 85543);
AssertConversion(type, "23:59:59", 86399);

AssertInvalidTimes(type);
// No subseconds allowed
AssertConversionFails(type, "00:00:00.123");
}
{
Time32Type type{TimeUnit::MILLI};

AssertConversion(type, "00:00", 0);
AssertConversion(type, "01:23", 4980000);
AssertConversion(type, "23:59", 86340000);

AssertConversion(type, "00:00:00", 0);
AssertConversion(type, "01:23:45", 5025000);
AssertConversion(type, "23:45:43", 85543000);
AssertConversion(type, "23:59:59", 86399000);

AssertConversion(type, "00:00:00.123", 123);
AssertConversion(type, "01:23:45.000", 5025000);
AssertConversion(type, "01:23:45.1", 5025100);
AssertConversion(type, "01:23:45.123", 5025123);
AssertConversion(type, "01:23:45.999", 5025999);

AssertInvalidTimes(type);
// Invalid subseconds
AssertConversionFails(type, "00:00:00.1234");
}
}

TEST(StringConversion, ToTime64) {
{
Time64Type type{TimeUnit::MICRO};

AssertConversion(type, "00:00:00", 0LL);
AssertConversion(type, "01:23:45", 5025000000LL);
AssertConversion(type, "23:45:43", 85543000000LL);
AssertConversion(type, "23:59:59", 86399000000LL);

AssertConversion(type, "00:00:00.123456", 123456LL);
AssertConversion(type, "01:23:45.000000", 5025000000LL);
AssertConversion(type, "01:23:45.1", 5025100000LL);
AssertConversion(type, "01:23:45.123", 5025123000LL);
AssertConversion(type, "01:23:45.999999", 5025999999LL);

AssertInvalidTimes(type);
// Invalid subseconds
AssertConversionFails(type, "00:00:00.1234567");
}
{
Time64Type type{TimeUnit::NANO};

AssertConversion(type, "00:00:00", 0LL);
AssertConversion(type, "01:23:45", 5025000000000LL);
AssertConversion(type, "23:45:43", 85543000000000LL);
AssertConversion(type, "23:59:59", 86399000000000LL);

AssertConversion(type, "00:00:00.123456789", 123456789LL);
AssertConversion(type, "01:23:45.000000000", 5025000000000LL);
AssertConversion(type, "01:23:45.1", 5025100000000LL);
AssertConversion(type, "01:23:45.1234", 5025123400000LL);
AssertConversion(type, "01:23:45.999999999", 5025999999999LL);

AssertInvalidTimes(type);
// Invalid subseconds
AssertConversionFails(type, "00:00:00.1234567891");
}
}

TEST(StringConversion, ToTimestampDate_ISO8601) {
{
TimestampType type{TimeUnit::SECOND};
Expand Down
Loading