From e606a952c75a9b4f7d60181c10b64fb6c070cd92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 6 Apr 2022 19:16:13 +0200 Subject: [PATCH 1/5] ARROW-16025: [Python][C++] Fix possible segmentation fault when trying to close ORCFileWriter without having a writer --- cpp/src/arrow/adapters/orc/adapter.cc | 4 +++- python/pyarrow/tests/test_orc.py | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 41bb7f25cbc1..7d3309e4e75b 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -837,7 +837,9 @@ class ORCFileWriter::Impl { } Status Close() { - writer_->close(); + if (writer_) { + writer_->close(); + } return Status::OK(); } diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index abdd8bc11f9a..7bc318c9d4a7 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -613,3 +613,11 @@ def test_column_selection(tempdir): with pytest.raises(ValueError): orc_file.read(columns=[5]) + +def test_wrong_usage_orc_writer(tempdir): + from pyarrow import orc + + path = str(tempdir / 'test.orc') + with orc.ORCWriter(path) as writer: + with pytest.raises(AttributeError): + writer.test() From f07275ec569318c611b8263b021f20fd217d6ad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 6 Apr 2022 19:36:10 +0200 Subject: [PATCH 2/5] ARROW-16025: [Python][C++] Fix minor lint issue --- python/pyarrow/tests/test_orc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 7bc318c9d4a7..d0a3629cda1d 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -614,6 +614,7 @@ def test_column_selection(tempdir): with pytest.raises(ValueError): orc_file.read(columns=[5]) + def test_wrong_usage_orc_writer(tempdir): from pyarrow import orc From 35f1d8e616dcfe772bc6d321aaeb0418a21af606 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 7 Apr 2022 10:50:23 +0200 Subject: [PATCH 3/5] ARROW-16025: [Python][C++] Add c++ unit test reproducing issssue --- cpp/src/arrow/adapters/orc/adapter_test.cc | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 8d2ad777cae1..d6f53d755b1a 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -253,6 +253,23 @@ void AssertTableWriteReadEqual(const std::shared_ptr& input_table, AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); } +void TestORCWriterNoWrite(const int64_t max_size = kDefaultSmallMemStreamSize) { + EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, + io::BufferOutputStream::Create(max_size)); + auto write_options = adapters::orc::WriteOptions(); +#ifdef ARROW_WITH_SNAPPY + write_options.compression = Compression::SNAPPY; +#else + write_options.compression = Compression::UNCOMPRESSED; +#endif + write_options.file_version = adapters::orc::FileVersion(0, 11); + write_options.compression_block_size = 32768; + write_options.row_index_stride = 5000; + EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( + buffer_output_stream.get(), write_options)); + ARROW_EXPECT_OK(writer->Close()); +} + void AssertArrayWriteReadEqual(const std::shared_ptr& input_array, const std::shared_ptr& expected_output_array, const int64_t max_size = kDefaultSmallMemStreamSize) { @@ -406,6 +423,10 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) { // Trivial +class TestORCWriterTrivialNoWrite : public ::testing::Test {}; +TEST_F(TestORCWriterTrivialNoWrite, noWrite) { + TestORCWriterNoWrite(kDefaultSmallMemStreamSize / 16); +} class TestORCWriterTrivialNoConversion : public ::testing::Test { public: TestORCWriterTrivialNoConversion() { From 86f3c2c6744f9c80ffc66f84d309e6848676b885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 7 Apr 2022 13:58:53 +0200 Subject: [PATCH 4/5] ARROW-15723: [Python] Add unit test reproducing previous segfault for null arrays on ORCWriter --- python/pyarrow/tests/test_orc.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index d0a3629cda1d..408bbe64bf91 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -622,3 +622,14 @@ def test_wrong_usage_orc_writer(tempdir): with orc.ORCWriter(path) as writer: with pytest.raises(AttributeError): writer.test() + +def test_orc_writer_with_null_arrays(tempdir): + from pyarrow import orc + import pyarrow as pa + + path = str(tempdir / 'test.orc') + a = pa.array([1, None, 3, None]) + b = pa.array([None, None, None, None]) + table = pa.table({"int64": a, "utf8": b}) + with pytest.raises(pa.ArrowNotImplementedError): + orc.write_table(table, path) From bcadad3eacdf505807c02f1823932ed285e07be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 7 Apr 2022 15:16:31 +0200 Subject: [PATCH 5/5] ARROW-15723: [Python] Fix lint and review comment --- cpp/src/arrow/adapters/orc/adapter_test.cc | 32 ++++++++++------------ python/pyarrow/tests/test_orc.py | 1 + 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index d6f53d755b1a..6b9aa4740b4f 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -253,23 +253,6 @@ void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); } -void TestORCWriterNoWrite(const int64_t max_size = kDefaultSmallMemStreamSize) { - EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, - io::BufferOutputStream::Create(max_size)); - auto write_options = adapters::orc::WriteOptions(); -#ifdef ARROW_WITH_SNAPPY - write_options.compression = Compression::SNAPPY; -#else - write_options.compression = Compression::UNCOMPRESSED; -#endif - write_options.file_version = adapters::orc::FileVersion(0, 11); - write_options.compression_block_size = 32768; - write_options.row_index_stride = 5000; - EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( - buffer_output_stream.get(), write_options)); - ARROW_EXPECT_OK(writer->Close()); -} - void AssertArrayWriteReadEqual(const std::shared_ptr& input_array, const std::shared_ptr& expected_output_array, const int64_t max_size = kDefaultSmallMemStreamSize) { @@ -425,7 +408,20 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) { class TestORCWriterTrivialNoWrite : public ::testing::Test {}; TEST_F(TestORCWriterTrivialNoWrite, noWrite) { - TestORCWriterNoWrite(kDefaultSmallMemStreamSize / 16); + EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, + io::BufferOutputStream::Create(kDefaultSmallMemStreamSize / 16)); + auto write_options = adapters::orc::WriteOptions(); +#ifdef ARROW_WITH_SNAPPY + write_options.compression = Compression::SNAPPY; +#else + write_options.compression = Compression::UNCOMPRESSED; +#endif + write_options.file_version = adapters::orc::FileVersion(0, 11); + write_options.compression_block_size = 32768; + write_options.row_index_stride = 5000; + EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open( + buffer_output_stream.get(), write_options)); + ARROW_EXPECT_OK(writer->Close()); } class TestORCWriterTrivialNoConversion : public ::testing::Test { public: diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 408bbe64bf91..866cc01452bb 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -623,6 +623,7 @@ def test_wrong_usage_orc_writer(tempdir): with pytest.raises(AttributeError): writer.test() + def test_orc_writer_with_null_arrays(tempdir): from pyarrow import orc import pyarrow as pa