diff --git a/src/TiledArray/cuda/cuda_task_fn.h b/src/TiledArray/cuda/cuda_task_fn.h index 98db4def7e..f11df63285 100644 --- a/src/TiledArray/cuda/cuda_task_fn.h +++ b/src/TiledArray/cuda/cuda_task_fn.h @@ -131,6 +131,9 @@ struct cudaTaskFn : public TaskInterface { // std::string message = "callback on cudaTaskFn: " + address.str() + // '\n'; std::cout << message; callback->notify(); + // must flush the prebuf (if any) to make sure this cleanup task will be + // actually submitted + ThreadPool::instance()->flush_prebuf(); const auto t1 = TiledArray::now(); TiledArray::detail::cuda_taskfn_callback_duration_ns() += @@ -799,10 +802,10 @@ add_cuda_taskfn( cudaTaskFn* t) { typename cudaTaskFn::futureT res(t->result()); - // add the cuda task - world.taskq.add(static_cast(t)); // add the internal async task in cuda task as well world.taskq.add(t->async_task()); + // add the cuda task + world.taskq.add(static_cast(t)); return res; }