apache · westonpace · Mar 6, 2021 · Mar 25, 2021 · lidavidm · Mar 19, 2021
diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
diff --git a/cpp/src/arrow/csv/reader.h b/cpp/src/arrow/csv/reader.h
@@ -65,16 +65,24 @@ class ARROW_EXPORT StreamingReader : public RecordBatchReader {
 
   /// Create a StreamingReader instance
   ///
-  /// Currently, the StreamingReader is always single-threaded (parallel
-  /// readahead is not supported).
+  /// Creating the reader involves some I/O because the first batch must be loaded
+  /// during the creation process, so the reader is returned as a future.
+  ///
+  /// Currently, the StreamingReader is not async-reentrant and does not do any fan-out
+  /// parsing (see ARROW-11889).
+  static Future<std::shared_ptr<StreamingReader>> MakeAsync(
+      io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+      const ReadOptions&, const ParseOptions&, const ConvertOptions&);
+
   static Result<std::shared_ptr<StreamingReader>> Make(
       io::IOContext io_context, std::shared_ptr<io::InputStream> input,
       const ReadOptions&, const ParseOptions&, const ConvertOptions&);
 
   ARROW_DEPRECATED("Use IOContext-based overload")
   static Result<std::shared_ptr<StreamingReader>> Make(
-      MemoryPool* pool, std::shared_ptr<io::InputStream> input, const ReadOptions&,
-      const ParseOptions&, const ConvertOptions&);
+      MemoryPool* pool, std::shared_ptr<io::InputStream> input,
+      const ReadOptions& read_options, const ParseOptions& parse_options,
+      const ConvertOptions& convert_options);
 };
 
 }  // namespace csv

diff --git a/cpp/src/arrow/csv/reader_test.cc b/cpp/src/arrow/csv/reader_test.cc
@@ -32,12 +32,40 @@
 #include "arrow/table.h"
 #include "arrow/testing/future_util.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/util/async_generator.h"
 #include "arrow/util/future.h"
 #include "arrow/util/thread_pool.h"
 
 namespace arrow {
+
+using RecordBatchGenerator = AsyncGenerator<std::shared_ptr<RecordBatch>>;
+
 namespace csv {
 
+// Adapter that exposes a StreamingReader through the TableReader interface so that
+// it can be used in tests that expect a table reader
+class StreamingReaderAsTableReader : public TableReader {
+ public:
+  explicit StreamingReaderAsTableReader(std::shared_ptr<StreamingReader> reader)
+      : reader_(std::move(reader)) {}
+  ~StreamingReaderAsTableReader() override = default;
+  // Blocks until ReadAsync has finished, then returns its table or its error
+  Result<std::shared_ptr<Table>> Read() override { return ReadAsync().result(); }
+  // Collects every batch produced by the wrapped reader into a single table
+  Future<std::shared_ptr<Table>> ReadAsync() override {
+    // The generator captures its own shared_ptr copy of the reader
+    auto reader = reader_;
+    RecordBatchGenerator rb_generator = [reader]() { return reader->ReadNextAsync(); };
+    return CollectAsyncGenerator(std::move(rb_generator))
+        .Then([](const RecordBatchVector& batches) {
+          return Table::FromRecordBatches(batches);
+        });
+  }
+
+ private:
+  std::shared_ptr<StreamingReader> reader_;
+};
+
 using TableReaderFactory =
     std::function<Result<std::shared_ptr<TableReader>>(std::shared_ptr<io::InputStream>)>;
 
@@ -152,5 +180,32 @@ TEST(AsyncReaderTests, NestedParallelism) {
   TestNestedParallelism(thread_pool, table_factory);
 }
 
+Result<TableReaderFactory> MakeStreamingFactory() {
+  return TableReaderFactory([](std::shared_ptr<io::InputStream> input)
+                                -> Result<std::shared_ptr<TableReader>> {
+    ReadOptions small_blocks = ReadOptions::Defaults();
+    small_blocks.block_size = 1 << 10;  // i.e. 1024; other options keep their defaults
+    ARROW_ASSIGN_OR_RAISE(
+        std::shared_ptr<StreamingReader> wrapped,
+        StreamingReader::Make(io::default_io_context(), input, small_blocks,
+                              ParseOptions::Defaults(), ConvertOptions::Defaults()));
+    return std::make_shared<StreamingReaderAsTableReader>(std::move(wrapped));
+  });
+}
+
+TEST(StreamingReaderTests, Stress) {
+  ASSERT_OK_AND_ASSIGN(TableReaderFactory streaming_factory, MakeStreamingFactory());
+  StressTableReader(streaming_factory);  // shared helper also used for other readers
+}
+TEST(StreamingReaderTests, StressInvalid) {
+  ASSERT_OK_AND_ASSIGN(TableReaderFactory streaming_factory, MakeStreamingFactory());
+  StressInvalidTableReader(streaming_factory);
+}
+TEST(StreamingReaderTests, NestedParallelism) {
+  ASSERT_OK_AND_ASSIGN(auto single_thread_pool, internal::ThreadPool::Make(1));
+  ASSERT_OK_AND_ASSIGN(TableReaderFactory streaming_factory, MakeStreamingFactory());
+  TestNestedParallelism(single_thread_pool, streaming_factory);
+}
+
 }  // namespace csv
 }  // namespace arrow
diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h
@@ -25,6 +25,7 @@
 #include "arrow/result.h"
 #include "arrow/status.h"
 #include "arrow/type_fwd.h"
+#include "arrow/util/future.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
@@ -207,6 +208,14 @@ class ARROW_EXPORT RecordBatchReader {
   /// \return Status
   virtual Status ReadNext(std::shared_ptr<RecordBatch>* batch) = 0;
 
+  /// Delegates to the blocking ReadNext until all other readers are converted
+  /// (ARROW-11770); this could then become pure virtual, with ReadNext wrapping it.
+  virtual Future<std::shared_ptr<RecordBatch>> ReadNextAsync() {
+    std::shared_ptr<RecordBatch> next_batch;
+    ARROW_RETURN_NOT_OK(ReadNext(&next_batch));  // propagate the error status
+    return Future<std::shared_ptr<RecordBatch>>::MakeFinished(std::move(next_batch));
+  }
+
   /// \brief Iterator interface
   Result<std::shared_ptr<RecordBatch>> Next() {
     std::shared_ptr<RecordBatch> batch;

diff --git a/cpp/src/arrow/util/async_generator.h b/cpp/src/arrow/util/async_generator.h
@@ -280,7 +280,7 @@ AsyncGenerator<V> MakeMappedGenerator(AsyncGenerator<T> source_generator,
   return MappingGenerator<T, V>(std::move(source_generator), std::move(map));
 }
 
-/// \see MakeAsyncGenerator
+/// \see MakeTransformedGenerator
 template <typename T, typename V>
 class TransformingGenerator {
   // The transforming generator state will be referenced as an async generator but will
@@ -382,8 +382,8 @@ class TransformingGenerator {
 ///
 /// This generator may queue up to 1 instance of T
 template <typename T, typename V>
-AsyncGenerator<V> MakeAsyncGenerator(AsyncGenerator<T> generator,
-                                     Transformer<T, V> transformer) {
+AsyncGenerator<V> MakeTransformedGenerator(AsyncGenerator<T> generator,
+                                           Transformer<T, V> transformer) {
   return TransformingGenerator<T, V>(generator, transformer);
 }
 

diff --git a/cpp/src/arrow/util/async_generator_test.cc b/cpp/src/arrow/util/async_generator_test.cc
@@ -496,7 +496,7 @@ TEST(TestAsyncUtil, SynchronousFinish) {
     return Future<TestInt>::MakeFinished(IterationTraits<TestInt>::End());
   };
   Transformer<TestInt, TestStr> skip_all = [](TestInt value) { return TransformSkip(); };
-  auto transformed = MakeAsyncGenerator(generator, skip_all);
+  auto transformed = MakeTransformedGenerator(generator, skip_all);
   auto future = CollectAsyncGenerator(transformed);
   ASSERT_FINISHES_OK_AND_ASSIGN(auto actual, future);
   ASSERT_EQ(std::vector<TestStr>(), actual);
@@ -561,7 +561,7 @@ TEST(TestAsyncUtil, StackOverflow) {
   };
   Transformer<TestInt, TestStr> discard =
       [](TestInt next) -> Result<TransformFlow<TestStr>> { return TransformSkip(); };
-  auto transformed = MakeAsyncGenerator(generator, discard);
+  auto transformed = MakeTransformedGenerator(generator, discard);
   auto collected_future = CollectAsyncGenerator(transformed);
   ASSERT_FINISHES_OK_AND_ASSIGN(auto collected, collected_future);
   ASSERT_EQ(0, collected.size());
@@ -796,7 +796,7 @@ TEST(TestAsyncUtil, ReadaheadFailed) {
 TEST(TestAsyncIteratorTransform, SkipSome) {
   auto original = AsyncVectorIt<TestInt>({1, 2, 3});
   auto filter = MakeFilter([](TestInt& t) { return t.value != 2; });
-  auto filtered = MakeAsyncGenerator(std::move(original), filter);
+  auto filtered = MakeTransformedGenerator(std::move(original), filter);
   AssertAsyncGeneratorMatch({"1", "3"}, std::move(filtered));
 }
 

diff --git a/cpp/src/arrow/util/future.cc b/cpp/src/arrow/util/future.cc
@@ -39,6 +39,8 @@ using internal::checked_cast;
 // should ideally not limit scalability.
 static std::mutex global_waiter_mutex;
 
+const double FutureWaiter::kInfinity = HUGE_VAL;  // out-of-line, see future.h
+
 class FutureWaiterImpl : public FutureWaiter {
  public:
   FutureWaiterImpl(Kind kind, std::vector<FutureImpl*> futures)

diff --git a/cpp/src/arrow/util/future.h b/cpp/src/arrow/util/future.h
@@ -176,7 +176,9 @@ class ARROW_EXPORT FutureWaiter {
  public:
   enum Kind : int8_t { ANY, ALL, ALL_OR_FIRST_FAILED, ITERATE };
 
-  static constexpr double kInfinity = HUGE_VAL;
+  // HUGE_VAL isn't constexpr on Windows, so kInfinity is defined in future.cc
+  // https://social.msdn.microsoft.com/Forums/vstudio/en-US/47e8b9ff-b205-4189-968e-ee3bc3e2719f/constexpr-compile-error?forum=vclanguage
+  static const double kInfinity;
 
   static std::unique_ptr<FutureWaiter> Make(Kind kind, std::vector<FutureImpl*> futures);
 

diff --git a/cpp/src/arrow/util/thread_pool.h b/cpp/src/arrow/util/thread_pool.h
@@ -104,15 +104,22 @@ class ARROW_EXPORT Executor {
   template <typename T>
   Future<T> Transfer(Future<T> future) {
     auto transferred = Future<T>::Make();
-    future.AddCallback([this, transferred](const Result<T>& result) mutable {
+    auto callback = [this, transferred](const Result<T>& result) mutable {
       auto spawn_status = Spawn([transferred, result]() mutable {
         transferred.MarkFinished(std::move(result));
       });
       if (!spawn_status.ok()) {
         transferred.MarkFinished(spawn_status);
       }
-    });
-    return transferred;
+    };
+    auto callback_factory = [&callback]() { return callback; };  // yields a copy
+    if (future.TryAddCallback(callback_factory)) {
+      return transferred;
+    }
+    // TryAddCallback returned false, so the future is already finished.  Since we
+    // aren't going to force spawn a thread we don't need to add another layer of
+    // callback and can return the original future
+    return future;
   }
 
   // Submit a callable and arguments for execution.  Return a future that