diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d2a5cc7565..2b3575bd2e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,9 +219,19 @@ find_package(Hydrogen 1.5.0 CONFIG REQUIRED) message(STATUS "Found Hydrogen@${HYDROGEN_VERSION}: ${Hydrogen_DIR}") set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND}) +# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA, +# LBANN gets CUDA. +set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA}) +set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA}) + +set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM}) +if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM) + set(LBANN_HAS_GPU TRUE) +endif () + # DiHydrogen and Distconv if (LBANN_WITH_DISTCONV) - find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns DistConv) + find_package(DiHydrogen 0.3.0 CONFIG REQUIRED COMPONENTS Meta Patterns DistConv) set(LBANN_HAS_DISTCONV TRUE) set(LBANN_H2_LIBS H2::H2Meta @@ -292,16 +302,6 @@ if (LBANN_WITH_ONEDNN) endif () endif () -# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA, -# LBANN gets CUDA. -set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA}) -set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA}) - -set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM}) -if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM) - set(LBANN_HAS_GPU TRUE) -endif () - # Only used if have GPU and have CPU half. if (LBANN_HAS_GPU AND LBANN_HAS_HALF) set(LBANN_HAS_GPU_FP16 ${HYDROGEN_GPU_USE_FP16}) diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp index 3f1ecb9d3f2..2cd2eb79dde 100644 --- a/include/lbann/layers/activations/relu.hpp +++ b/include/lbann/layers/activations/relu.hpp @@ -30,6 +30,10 @@ #include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/distconv.hpp" +#ifdef LBANN_HAS_DISTCONV +#include "distconv/dnn_backend/relu.hpp" +#endif + namespace lbann { #ifdef LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/learning/distconv/distconv_layers.hpp b/include/lbann/layers/learning/distconv/distconv_layers.hpp index abecb77723e..b5c31fc4544 100644 --- a/include/lbann/layers/learning/distconv/distconv_layers.hpp +++ b/include/lbann/layers/learning/distconv/distconv_layers.hpp @@ -48,25 +48,26 @@ namespace distconv{ tensor::Tensor &output); template - int apply_bias(const tensor::Tensor &bias, - tensor::Tensor &output); - + int apply_bias( + const tensor::Tensor& bias, + tensor::Tensor& output); + template int backward_wrt_input( bool transpose_A, const tensor::Tensor &output_grad, const tensor::Tensor &linearity, tensor::Tensor &input_grad); - + template int backward_wrt_weight( bool transpose, DataType dst_scale, DataType gradient_scale, - const tensor::Tensor &input, - const tensor::Tensor &output_grad, - tensor::Tensor &linearity_grad); - + const tensor::Tensor& input, + const tensor::Tensor& output_grad, + tensor::Tensor& linearity_grad); + template int backward_wrt_bias( DataType gradient_scale, @@ -78,23 +79,25 @@ namespace distconv{ Backend &m_be; }; // class definition ChannelwiseFullyConnected - template - tensor::Shape - get_fc_output_local_tensor_shape(const tensor::Tensor &input, - const int_vector &linearity_dims, - bool transpose){ + tensor::Shape get_fc_output_local_tensor_shape( + const tensor::Tensor& input, + const int_vector& linearity_dims, + bool transpose) + { //https://github.com/LLNL/DiHydrogen/blob/7f86db1f9701ac3afb5e16aefdd57563d57a1698/legacy/include/distconv/distconv.hpp#L173 - //Get the input layer local tensor shape + // Get the input layer local tensor shape auto output_local_shape = input.get_local_shape(); output_local_shape[0] = transpose? linearity_dims[1] : linearity_dims[0]; return output_local_shape; } -extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, float>; -extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, double>; + extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, + float>; + extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, + double>; } // namespace distconv #endif // LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/transform/concatenate.hpp b/include/lbann/layers/transform/concatenate.hpp index e2c26082f66..355c23d3407 100644 --- a/include/lbann/layers/transform/concatenate.hpp +++ b/include/lbann/layers/transform/concatenate.hpp @@ -468,7 +468,7 @@ fp_compute() { dc::tensor::Concatenate(this->get_activations(0), this->get_prev_activations(0), this->get_prev_activations(1), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } template @@ -477,7 +477,7 @@ bp_compute() { dc::tensor::Slice(this->get_error_signals(0), this->get_error_signals(1), this->get_prev_error_signals(0), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } #endif // LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index 9a8e4765e3b..6b2644dff01 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_POOLING_HPP_INCLUDED #include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/dim_helpers.hpp" #include "lbann/utils/dnn_enums.hpp" #ifdef LBANN_HAS_DNN_LIB @@ -69,7 +70,8 @@ class pooling_distconv_adapter : public data_type_distconv_adapter> m_pooling; }; @@ -295,12 +297,15 @@ class pooling_layer : public data_type_layer { if(this->using_gpus()) { #ifdef LBANN_HAS_DISTCONV if (this->distconv_enabled()) { - get_distconv_adapter().fp_compute(); + const auto& mode = + this->m_model->get_execution_context().get_execution_mode(); + get_distconv_adapter().fp_compute(mode == execution_mode::training); return; } #endif // LBANN_HAS_DISTCONV fp_compute_dnn(); - } else { + } + else { fp_compute_im2col(); } } @@ -801,10 +806,14 @@ setup_layer(size_t workspace_capacity) { } template -void pooling_distconv_adapter:: -fp_compute() { - m_pooling->forward(El::To(1), this->get_prev_activations(), - El::To(0), this->get_activations()); +void pooling_distconv_adapter::fp_compute( + bool const training) +{ + m_pooling->forward(El::To(1), + this->get_prev_activations(), + El::To(0), + this->get_activations(), + training); } template diff --git a/include/lbann/utils/distconv.hpp b/include/lbann/utils/distconv.hpp index f21f288c194..a3606e689cb 100644 --- a/include/lbann/utils/distconv.hpp +++ b/include/lbann/utils/distconv.hpp @@ -39,12 +39,13 @@ #define DISTCONV_DEBUG #endif +#include "distconv/dnn_backend/backend.hpp" #include "distconv/distconv.hpp" -#include "distconv/tensor/tensor_mpi_cuda.hpp" +#include "distconv/tensor/algorithms.hpp" #include "distconv/tensor/shuffle_mpi.hpp" #include "distconv/tensor/shuffle_mpi_cuda.hpp" #include "distconv/tensor/shuffle_mpi_cuda_al.hpp" -#include "distconv/tensor/algorithms.hpp" +#include "distconv/tensor/tensor_mpi_cuda.hpp" #include "distconv/util/util.hpp" #ifdef DISTCONV_HAS_P2P #include "p2p/p2p.hpp" @@ -55,6 +56,15 @@ #include "lbann/layers/learning/distconv/distconv_layers.hpp" namespace lbann { +inline auto default_hydrogen_stream() +{ +#if H2_HAS_CUDA + return hydrogen::cuda::GetDefaultStream(); +#elif H2_HAS_ROCM + return hydrogen::rocm::GetDefaultStream(); +#endif +} + class Layer; namespace dc { @@ -109,7 +119,7 @@ using MPIRootPrintStreamInfo = ::distconv::util::MPIRootPrintStreamInfo; using MPIRootPrintStreamWaning = ::distconv::util::MPIRootPrintStreamWarning; // Distconv layer classes -using Backend = ::distconv::cudnn::BackendCUDNN; +using Backend = ::distconv::BackendDNNLib; using ReLU = ::distconv::ReLU; using LeakyReLU = ::distconv::LeakyReLU; template @@ -232,7 +242,7 @@ Dist get_hydrogen_data_parallel_distribution(int num_dims); template void dump_tensor(const Tensor &t, const std::string &path) { dc::MPIPrintStreamDebug() << "Dumping tensor to " << path; - cudaDeviceSynchronize(); + h2::gpu::sync(); distconv::dump_tensor(t, path, true); } diff --git a/include/lbann/utils/dnn_lib/miopen.hpp b/include/lbann/utils/dnn_lib/miopen.hpp index a100e6c840f..fb99b23a146 100644 --- a/include/lbann/utils/dnn_lib/miopen.hpp +++ b/include/lbann/utils/dnn_lib/miopen.hpp @@ -82,7 +82,7 @@ using dnnHandle_t = miopenHandle_t; using dnnDataType_t = miopenDataType_t; using dnnTensorDescriptor_t = miopenTensorDescriptor_t; using dnnFilterDescriptor_t = miopenTensorDescriptor_t; -using dnnTensorFormat_t = int; +using dnnTensorFormat_t = miopenTensorLayout_t; using dnnDropoutDescriptor_t = miopenDropoutDescriptor_t; using dnnRNGType_t = miopenRNGType_t; using dnnRNNDescriptor_t = miopenRNNDescriptor_t; @@ -110,7 +110,7 @@ using dnnConvolutionBwdFilterAlgo_t = miopenConvBwdWeightsAlgorithm_t; constexpr dnnConvolutionMode_t DNN_CROSS_CORRELATION = miopenConvolution; constexpr dnnNanPropagation_t DNN_PROPAGATE_NAN = MIOPEN_PROPAGATE_NAN; constexpr dnnMathType_t DNN_DEFAULT_MATH = 0; -constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = 0; +constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = miopenTensorNCHW; constexpr dnnRNGType_t DNN_RNG_PSEUDO_XORWOW = MIOPEN_RNG_PSEUDO_XORWOW; constexpr dnnLRNMode_t DNN_LRN_CROSS_CHANNEL = miopenLRNCrossChannel; constexpr dnnMathType_t DNN_TENSOR_OP_MATH_ALLOW_CONVERSION = -1; // not supported with ROCm diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp index d76856f50ee..2cf1a3cfbc7 100644 --- a/src/callbacks/check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -133,7 +133,6 @@ struct CheckWeightsFunctor : DefaultErrorReporter // Get weights matrix and gradient const auto& weights_matrix = dtw.get_values(); const auto& gradient = dtw.get_optimizer()->get_gradient(); - // Iterate through weights matrix entries for (El::Int col = 0; col < weights_matrix.Width(); ++col) { for (El::Int row = 0; row < weights_matrix.Height(); ++row) { diff --git a/src/layers/data_type_distconv_adapter.cpp b/src/layers/data_type_distconv_adapter.cpp index 00763b58c8a..55684aaf81a 100644 --- a/src/layers/data_type_distconv_adapter.cpp +++ b/src/layers/data_type_distconv_adapter.cpp @@ -332,7 +332,7 @@ setup_prev_activations_i(int index) const { const dc::LocaleMPI loc(dc::get_mpi_comm(), false); t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); } else { // Create a shallow copy const auto &parent_activations = @@ -427,7 +427,7 @@ setup_activations_i(int index) const { const auto local_shape = get_activations_local_shape(index); auto t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); return t; } @@ -483,7 +483,7 @@ setup_prev_error_signals_i(int index) const { const dc::LocaleMPI loc(dc::get_mpi_comm(), false); t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); } else { // Create a shallow copy const auto &child_error_signals = @@ -550,7 +550,7 @@ setup_error_signals_i(int index) const { const auto local_shape = get_error_signals_local_shape(index); auto t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); return t; } @@ -797,9 +797,9 @@ void data_type_distconv_adapter::ensu get_original_prev_activations(), get_prev_activations()); shuffler.shuffle_forward( - get_original_prev_activations().get_const_base_ptr(), - get_prev_activations().get_base_ptr(), - hydrogen::cuda::GetDefaultStream()); + get_original_prev_activations().get_const_base_ptr(), + get_prev_activations().get_base_ptr(), + default_hydrogen_stream()); } } @@ -818,10 +818,9 @@ void data_type_distconv_adapter::copy auto &shuffler = get_activations_shuffler( get_activations(), get_original_activations()); - shuffler.shuffle_forward( - get_activations().get_const_base_ptr(), - get_original_activations().get_base_ptr(), - hydrogen::cuda::GetDefaultStream()); + shuffler.shuffle_forward(get_activations().get_const_base_ptr(), + get_original_activations().get_base_ptr(), + default_hydrogen_stream()); } } @@ -846,9 +845,9 @@ void data_type_distconv_adapter::ensu get_original_prev_error_signals(i), get_prev_error_signals(i)); shuffler.shuffle_forward( - get_original_prev_error_signals(i).get_const_base_ptr(), - get_prev_error_signals(i).get_base_ptr(), - hydrogen::cuda::GetDefaultStream()); + get_original_prev_error_signals(i).get_const_base_ptr(), + get_prev_error_signals(i).get_base_ptr(), + default_hydrogen_stream()); } } @@ -868,10 +867,9 @@ void data_type_distconv_adapter::copy auto &shuffler = get_error_signals_shuffler( get_error_signals(i), get_original_error_signals(i)); - shuffler.shuffle_forward( - get_error_signals(i).get_const_base_ptr(), - get_original_error_signals(i).get_base_ptr(), - hydrogen::cuda::GetDefaultStream()); + shuffler.shuffle_forward(get_error_signals(i).get_const_base_ptr(), + get_original_error_signals(i).get_base_ptr(), + default_hydrogen_stream()); } } diff --git a/src/layers/io/input_layer.cpp b/src/layers/io/input_layer.cpp index 68664436ff2..dd267e692b2 100644 --- a/src/layers/io/input_layer.cpp +++ b/src/layers/io/input_layer.cpp @@ -304,7 +304,11 @@ void input_distconv_adapter::setup_fp_tensors() { // only specialized for BaseAllocator. size_t buf_size = m_host_tensor->get_local_real_size() * sizeof(TensorDataType); TensorDataType *buf = nullptr; +#if H2_HAS_CUDA CHECK_CUDA(cudaMallocHost(&buf, buf_size)); +#elif H2_HAS_ROCM + CHECK_ROCM(hipHostMalloc(&buf, buf_size)); +#endif // Note buf should be deallocated. dc::tensor::View(*m_host_tensor, buf); setup_shuffler_buffers(*m_original_host_tensor, @@ -338,7 +342,7 @@ setup_activations_i(int index) const { const auto local_shape = get_activations_local_shape(index); auto t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); return t; } else { @@ -434,7 +438,7 @@ template ::fp_compute() { auto &l = dynamic_cast&>(this->layer()); - auto stream = hydrogen::cuda::GetDefaultStream(); + auto stream = default_hydrogen_stream(); // Note that the mini-batch size of the data reader is not // actually the one for the current mini-batch as the mini-batch // index is already updated by fp_compute. diff --git a/src/layers/learning/distconv/distconv_layers.cpp b/src/layers/learning/distconv/distconv_layers.cpp index 92dd171dd35..6da0a9e546c 100644 --- a/src/layers/learning/distconv/distconv_layers.cpp +++ b/src/layers/learning/distconv/distconv_layers.cpp @@ -331,7 +331,7 @@ namespace distconv{ const tensor::Tensor &output_gradient, \ tensor::Tensor &bias_gradient); -ETI(float, cudnn::BackendCUDNN) -ETI(double, cudnn::BackendCUDNN) + ETI(float, BackendDNNLib) + ETI(double, BackendDNNLib) #undef ETI } // namespace distconv diff --git a/src/layers/transform/split.cu b/src/layers/transform/split.cu index ca212564f89..b9f8ecdcb89 100644 --- a/src/layers/transform/split.cu +++ b/src/layers/transform/split.cu @@ -71,29 +71,30 @@ void split_distconv_adapter::bp_compute() { auto &error_signals = this->get_error_signals(0); switch (this->layer().get_num_children()) { case 0: - error_signals.zero(hydrogen::cuda::GetDefaultStream()); + error_signals.zero(default_hydrogen_stream()); break; case 1: dc::tensor::Copy(error_signals, this->get_prev_error_signals(0), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); break; case 2: dc::tensor::Transform(error_signals, this->get_prev_error_signals(0), this->get_prev_error_signals(1), sum_op(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); break; default: dc::tensor::Copy(error_signals, this->get_prev_error_signals(1), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); for (int i = 1; i < this->layer().get_num_children(); ++i) { const auto &prev_error = this->get_prev_error_signals(i); - dc::tensor::Transform(error_signals, prev_error, + dc::tensor::Transform(error_signals, + prev_error, accumulate_op(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } } return; diff --git a/src/layers/transform/sum.cu b/src/layers/transform/sum.cu index c9ea29b76e4..d0ee8adf1c8 100644 --- a/src/layers/transform/sum.cu +++ b/src/layers/transform/sum.cu @@ -67,11 +67,12 @@ void sum_distconv_adapter::fp_compute() { auto &activations = this->get_activations(); switch (this->layer().get_num_parents()) { case 0: - activations.zero(hydrogen::cuda::GetDefaultStream()); + activations.zero(default_hydrogen_stream()); break; case 1: - dc::tensor::Copy(activations, this->get_prev_activations(), - hydrogen::cuda::GetDefaultStream()); + dc::tensor::Copy(activations, + this->get_prev_activations(), + default_hydrogen_stream()); break; case 2: // Optimization for layers with 2 parents (e.g., @@ -82,19 +83,21 @@ void sum_distconv_adapter::fp_compute() { this->get_prev_activations(0), this->get_prev_activations(1), sum_op(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); break; default: for (int i = 0; i < this->layer().get_num_parents(); ++i) { auto &prev_activations = this->get_prev_activations(i); prev_activations.set_outermost_dimension(activations.get_shape()[-1]); if (i == 0) { - dc::tensor::Copy(activations, prev_activations, - hydrogen::cuda::GetDefaultStream()); + dc::tensor::Copy(activations, + prev_activations, + default_hydrogen_stream()); } else { - distconv::tensor::Transform(activations, prev_activations, + distconv::tensor::Transform(activations, + prev_activations, accumulate_op(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } } } diff --git a/src/utils/distconv.cpp b/src/utils/distconv.cpp index 49fc966aef8..d89f9da5052 100644 --- a/src/utils/distconv.cpp +++ b/src/utils/distconv.cpp @@ -209,7 +209,7 @@ TensorDataType *get_shuffler_src_buf(const TensorDev &tensor) { shuffler_src_buf_size = TensorShuffler::get_buf_size(tensor); MPIPrintStreamDebug() << "Allocating shared shuffler buffer of size " << shuffler_src_buf_size; - DISTCONV_CUDA_MALLOC(&shuffler_src_buf, shuffler_src_buf_size); + DISTCONV_GPU_MALLOC(&shuffler_src_buf, shuffler_src_buf_size); } // Returns the pre-allocated memory if it's large enough size_t required_size = TensorShuffler::get_buf_size(tensor); @@ -228,7 +228,7 @@ TensorDataType *get_shuffler_dst_buf(const TensorDev &tensor) { shuffler_dst_buf_size = TensorShuffler::get_buf_size(tensor); MPIPrintStreamDebug() << "Allocating shared shuffler buffer of size " << shuffler_src_buf_size; - DISTCONV_CUDA_MALLOC(&shuffler_dst_buf, shuffler_dst_buf_size); + DISTCONV_GPU_MALLOC(&shuffler_dst_buf, shuffler_dst_buf_size); } size_t required_size = TensorShuffler::get_buf_size(tensor); // Returns the pre-allocated memory if it's large enough @@ -241,11 +241,11 @@ TensorDataType *get_shuffler_dst_buf(const TensorDev &tensor) { } void delete_shuffler_buffers() { if (shuffler_src_buf) { - CHECK_CUDA(cudaFree(shuffler_src_buf)); + DISTCONV_CHECK_GPU(GPU_FREE(shuffler_src_buf)); shuffler_src_buf = nullptr; } if (shuffler_dst_buf) { - CHECK_CUDA(cudaFree(shuffler_dst_buf)); + DISTCONV_CHECK_GPU(GPU_FREE(shuffler_dst_buf)); shuffler_dst_buf = nullptr; } } @@ -290,13 +290,15 @@ void initialize(MPI_Comm comm) { #ifdef DISTCONV_HAS_P2P p2p_instance = new p2p::P2P(mpi_comm); #endif // DISTCONV_HAS_P2P - hosttransfer_comm_instance = new Al::hosttransfer_backend::comm_type( - mpi_comm, hydrogen::cuda::GetDefaultStream()); - ::distconv::cudnn::Options backend_opts; + hosttransfer_comm_instance = + new Al::hosttransfer_backend::comm_type(mpi_comm, + default_hydrogen_stream()); + ::distconv::backend::Options backend_opts; backend_opts.m_deterministic = opt_deterministic; - backend_instance = new Backend( - mpi_comm, lbann::dnn_lib::get_handle(), - hydrogen::cuda::GetDefaultStream(), backend_opts); + backend_instance = new Backend(mpi_comm, + lbann::dnn_lib::get_handle(), + default_hydrogen_stream(), + backend_opts); print_options(std::cout); initialized = true; } @@ -454,7 +456,11 @@ Dist get_hydrogen_data_parallel_distribution(int num_dims) { size_t get_workspace_capacity() { size_t available, total; +#if H2_HAS_CUDA FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total)); +#elif H2_HAS_ROCM + FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total)); +#endif size_t workspace_capacity = available; // set aside some space for shuffling, halo exchange, etc. workspace_capacity -= 1 << 28;