LBANN · benson31 · Nov 12, 2022 · Oct 14, 2022 · Oct 14, 2022 · Oct 18, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -219,9 +219,19 @@ find_package(Hydrogen 1.5.0 CONFIG REQUIRED)
 message(STATUS "Found Hydrogen@${HYDROGEN_VERSION}: ${Hydrogen_DIR}")
 set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND})
 
+# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA,
+# LBANN gets CUDA.
+set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA})
+set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA})
+
+set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM})
+if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM)
+  set(LBANN_HAS_GPU TRUE)
+endif ()
+
 # DiHydrogen and Distconv
 if (LBANN_WITH_DISTCONV)
-  find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns DistConv)
+  find_package(DiHydrogen 0.3.0 CONFIG REQUIRED COMPONENTS Meta Patterns DistConv)
   set(LBANN_HAS_DISTCONV TRUE)
   set(LBANN_H2_LIBS
     H2::H2Meta
@@ -292,16 +302,6 @@ if (LBANN_WITH_ONEDNN)
   endif ()
 endif ()
 
-# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA,
-# LBANN gets CUDA.
-set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA})
-set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA})
-
-set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM})
-if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM)
-  set(LBANN_HAS_GPU TRUE)
-endif ()
-
 # Only used if have GPU and have CPU half.
 if (LBANN_HAS_GPU AND LBANN_HAS_HALF)
   set(LBANN_HAS_GPU_FP16 ${HYDROGEN_GPU_USE_FP16})

diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp
@@ -30,6 +30,10 @@
 #include "lbann/layers/data_type_layer.hpp"
 #include "lbann/utils/distconv.hpp"
 
+#ifdef LBANN_HAS_DISTCONV
+#include "distconv/dnn_backend/relu.hpp"
+#endif
+
 namespace lbann {
 
 #ifdef LBANN_HAS_DISTCONV

diff --git a/include/lbann/layers/learning/distconv/distconv_layers.hpp b/include/lbann/layers/learning/distconv/distconv_layers.hpp
@@ -48,25 +48,26 @@ namespace distconv{
             tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output);
 
     template <typename Allocator>
-    int apply_bias(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &bias, 
-                 tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output);
-
+    int apply_bias(
+      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& bias,
+      tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& output);
+
     template <typename Allocator>
     int backward_wrt_input(
       bool transpose_A,
       const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
       const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &linearity,
       tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input_grad);
-    
+
     template <typename Allocator>
     int backward_wrt_weight(
       bool transpose,
       DataType dst_scale,
       DataType gradient_scale,
-      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input, 
-      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
-      tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &linearity_grad);
-    
+      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& input,
+      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& output_grad,
+      tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& linearity_grad);
+
     template <typename Allocator>
     int backward_wrt_bias(
       DataType gradient_scale,
@@ -78,23 +79,25 @@ namespace distconv{
     Backend &m_be;
   }; // class definition ChannelwiseFullyConnected
 
-
   template <typename DataType, typename locale, typename Allocator>
-  tensor::Shape 
-  get_fc_output_local_tensor_shape(const tensor::Tensor<DataType, locale, Allocator> &input,
-                                   const int_vector &linearity_dims,
-                                   bool transpose){
+  tensor::Shape get_fc_output_local_tensor_shape(
+    const tensor::Tensor<DataType, locale, Allocator>& input,
+    const int_vector& linearity_dims,
+    bool transpose)
+  {
 
     //https://github.com/LLNL/DiHydrogen/blob/7f86db1f9701ac3afb5e16aefdd57563d57a1698/legacy/include/distconv/distconv.hpp#L173
 
-    //Get the input layer local tensor shape 
+    // Get the input layer local tensor shape
 
     auto output_local_shape = input.get_local_shape();
     output_local_shape[0] = transpose? linearity_dims[1] : linearity_dims[0];
     return output_local_shape;
   }
-extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, float>;
-extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, double>;
+  extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib,
+                                                  float>;
+  extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib,
+                                                  double>;
 }  // namespace distconv
 
 #endif // LBANN_HAS_DISTCONV

diff --git a/include/lbann/layers/transform/concatenate.hpp b/include/lbann/layers/transform/concatenate.hpp
@@ -468,7 +468,7 @@ fp_compute() {
   dc::tensor::Concatenate(this->get_activations(0),
                           this->get_prev_activations(0),
                           this->get_prev_activations(1),
-                          hydrogen::cuda::GetDefaultStream());
+                          default_hydrogen_stream());
 }
 
 template <typename TensorDataType, data_layout Layout, El::Device Device>
@@ -477,7 +477,7 @@ bp_compute() {
   dc::tensor::Slice(this->get_error_signals(0),
                     this->get_error_signals(1),
                     this->get_prev_error_signals(0),
-                    hydrogen::cuda::GetDefaultStream());
+                    default_hydrogen_stream());
 }
 #endif // LBANN_HAS_DISTCONV
 

diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp
@@ -28,6 +28,7 @@
 #define LBANN_LAYER_POOLING_HPP_INCLUDED
 
 #include "lbann/layers/data_type_layer.hpp"
+#include "lbann/models/model.hpp"
 #include "lbann/utils/dim_helpers.hpp"
 #include "lbann/utils/dnn_enums.hpp"
 #ifdef LBANN_HAS_DNN_LIB
@@ -69,7 +70,8 @@ class pooling_distconv_adapter : public data_type_distconv_adapter<TensorDataTyp
   void setup_distributions(tensor_overlap_constraints &constraints) override;
   dc::Shape get_activations_local_shape(int index=0) const override;
   void setup_layer(size_t workspace_capacity) override;
-  void fp_compute();
+  void
+  fp_compute(bool training = true); // training=true for max back-compatibility.
   void bp_compute();
   std::unique_ptr<dc::Pooling<TensorDataType>> m_pooling;
 };
@@ -295,12 +297,15 @@ class pooling_layer : public data_type_layer<TensorDataType> {
     if(this->using_gpus()) {
 #ifdef LBANN_HAS_DISTCONV
       if (this->distconv_enabled()) {
-        get_distconv_adapter().fp_compute();
+        const auto& mode =
+          this->m_model->get_execution_context().get_execution_mode();
+        get_distconv_adapter().fp_compute(mode == execution_mode::training);
         return;
       }
 #endif // LBANN_HAS_DISTCONV
       fp_compute_dnn();
-    } else {
+    }
+    else {
       fp_compute_im2col();
     }
   }
@@ -801,10 +806,14 @@ setup_layer(size_t workspace_capacity) {
 }
 
 template <typename TensorDataType, data_layout Layout, El::Device Device>
-void pooling_distconv_adapter<TensorDataType, Layout, Device>::
-fp_compute() {
-  m_pooling->forward(El::To<TensorDataType>(1), this->get_prev_activations(),
-                     El::To<TensorDataType>(0), this->get_activations());
+void pooling_distconv_adapter<TensorDataType, Layout, Device>::fp_compute(
+  bool const training)
+{
+  m_pooling->forward(El::To<TensorDataType>(1),
+                     this->get_prev_activations(),
+                     El::To<TensorDataType>(0),
+                     this->get_activations(),
+                     training);
 }
 
 template <typename TensorDataType, data_layout Layout, El::Device Device>

diff --git a/include/lbann/utils/distconv.hpp b/include/lbann/utils/distconv.hpp
@@ -39,12 +39,13 @@
 #define DISTCONV_DEBUG
 #endif
 
+#include "distconv/dnn_backend/backend.hpp"
 #include "distconv/distconv.hpp"
-#include "distconv/tensor/tensor_mpi_cuda.hpp"
+#include "distconv/tensor/algorithms.hpp"
 #include "distconv/tensor/shuffle_mpi.hpp"
 #include "distconv/tensor/shuffle_mpi_cuda.hpp"
 #include "distconv/tensor/shuffle_mpi_cuda_al.hpp"
-#include "distconv/tensor/algorithms.hpp"
+#include "distconv/tensor/tensor_mpi_cuda.hpp"
 #include "distconv/util/util.hpp"
 #ifdef DISTCONV_HAS_P2P
 #include "p2p/p2p.hpp"
@@ -55,6 +56,15 @@
 #include "lbann/layers/learning/distconv/distconv_layers.hpp"
 namespace lbann {
 
+inline auto default_hydrogen_stream()
+{
+#if H2_HAS_CUDA
+  return hydrogen::cuda::GetDefaultStream();
+#elif H2_HAS_ROCM
+  return hydrogen::rocm::GetDefaultStream();
+#endif
+}
+
 class Layer;
 
 namespace dc {
@@ -109,7 +119,7 @@ using MPIRootPrintStreamInfo = ::distconv::util::MPIRootPrintStreamInfo;
 using MPIRootPrintStreamWaning = ::distconv::util::MPIRootPrintStreamWarning;
 
 // Distconv layer classes
-using Backend = ::distconv::cudnn::BackendCUDNN;
+using Backend = ::distconv::BackendDNNLib;
 using ReLU = ::distconv::ReLU<Backend>;
 using LeakyReLU = ::distconv::LeakyReLU<Backend>;
 template <typename TensorDataType>
@@ -232,7 +242,7 @@ Dist get_hydrogen_data_parallel_distribution(int num_dims);
 template <typename Tensor>
 void dump_tensor(const Tensor &t, const std::string &path) {
   dc::MPIPrintStreamDebug() << "Dumping tensor to " << path;
-  cudaDeviceSynchronize();
+  h2::gpu::sync();
   distconv::dump_tensor(t, path, true);
 }
 

diff --git a/include/lbann/utils/dnn_lib/miopen.hpp b/include/lbann/utils/dnn_lib/miopen.hpp
@@ -82,7 +82,7 @@ using dnnHandle_t = miopenHandle_t;
 using dnnDataType_t = miopenDataType_t;
 using dnnTensorDescriptor_t = miopenTensorDescriptor_t;
 using dnnFilterDescriptor_t = miopenTensorDescriptor_t;
-using dnnTensorFormat_t = int;
+using dnnTensorFormat_t = miopenTensorLayout_t;
 using dnnDropoutDescriptor_t = miopenDropoutDescriptor_t;
 using dnnRNGType_t = miopenRNGType_t;
 using dnnRNNDescriptor_t = miopenRNNDescriptor_t;
@@ -110,7 +110,7 @@ using dnnConvolutionBwdFilterAlgo_t = miopenConvBwdWeightsAlgorithm_t;
 constexpr dnnConvolutionMode_t DNN_CROSS_CORRELATION = miopenConvolution;
 constexpr dnnNanPropagation_t DNN_PROPAGATE_NAN = MIOPEN_PROPAGATE_NAN;
 constexpr dnnMathType_t DNN_DEFAULT_MATH = 0;
-constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = 0;
+constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = miopenTensorNCHW;
 constexpr dnnRNGType_t DNN_RNG_PSEUDO_XORWOW = MIOPEN_RNG_PSEUDO_XORWOW;
 constexpr dnnLRNMode_t DNN_LRN_CROSS_CHANNEL = miopenLRNCrossChannel;
 constexpr dnnMathType_t DNN_TENSOR_OP_MATH_ALLOW_CONVERSION = -1; // not supported with ROCm

diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp
@@ -133,7 +133,6 @@ struct CheckWeightsFunctor : DefaultErrorReporter
     // Get weights matrix and gradient
     const auto& weights_matrix = dtw.get_values();
     const auto& gradient = dtw.get_optimizer()->get_gradient();
-
     // Iterate through weights matrix entries
     for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
       for (El::Int row = 0; row < weights_matrix.Height(); ++row) {

diff --git a/src/layers/data_type_distconv_adapter.cpp b/src/layers/data_type_distconv_adapter.cpp
@@ -332,7 +332,7 @@ setup_prev_activations_i(int index) const {
     const dc::LocaleMPI loc(dc::get_mpi_comm(), false);
     t = std::make_unique<InputTensorDevType>(shape, loc, dist, local_shape);
     assert0(t->allocate());
-    t->zero(hydrogen::cuda::GetDefaultStream());
+    t->zero(default_hydrogen_stream());
   } else {
     // Create a shallow copy
     const auto &parent_activations =
@@ -427,7 +427,7 @@ setup_activations_i(int index) const {
   const auto local_shape = get_activations_local_shape(index);
   auto t = std::make_unique<OutputTensorDevType>(shape, loc, dist, local_shape);
   assert0(t->allocate());
-  t->zero(hydrogen::cuda::GetDefaultStream());
+  t->zero(default_hydrogen_stream());
   return t;
 }
 
@@ -483,7 +483,7 @@ setup_prev_error_signals_i(int index) const {
     const dc::LocaleMPI loc(dc::get_mpi_comm(), false);
     t = std::make_unique<OutputTensorDevType>(shape, loc, dist, local_shape);
     assert0(t->allocate());
-    t->zero(hydrogen::cuda::GetDefaultStream());
+    t->zero(default_hydrogen_stream());
   } else {
     // Create a shallow copy
     const auto &child_error_signals =
@@ -550,7 +550,7 @@ setup_error_signals_i(int index) const {
   const auto local_shape = get_error_signals_local_shape(index);
   auto t = std::make_unique<InputTensorDevType>(shape, loc, dist, local_shape);
   assert0(t->allocate());
-  t->zero(hydrogen::cuda::GetDefaultStream());
+  t->zero(default_hydrogen_stream());
   return t;
 }
 
@@ -797,9 +797,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::ensu
         get_original_prev_activations(),
         get_prev_activations());
     shuffler.shuffle_forward(
-        get_original_prev_activations().get_const_base_ptr(),
-        get_prev_activations().get_base_ptr(),
-        hydrogen::cuda::GetDefaultStream());
+      get_original_prev_activations().get_const_base_ptr(),
+      get_prev_activations().get_base_ptr(),
+      default_hydrogen_stream());
   }
 }
 
@@ -818,10 +818,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::copy
     auto &shuffler = get_activations_shuffler(
         get_activations(),
         get_original_activations());
-    shuffler.shuffle_forward(
-        get_activations().get_const_base_ptr(),
-        get_original_activations().get_base_ptr(),
-        hydrogen::cuda::GetDefaultStream());
+    shuffler.shuffle_forward(get_activations().get_const_base_ptr(),
+                             get_original_activations().get_base_ptr(),
+                             default_hydrogen_stream());
   }
 }
 
@@ -846,9 +845,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::ensu
         get_original_prev_error_signals(i),
         get_prev_error_signals(i));
     shuffler.shuffle_forward(
-        get_original_prev_error_signals(i).get_const_base_ptr(),
-        get_prev_error_signals(i).get_base_ptr(),
-        hydrogen::cuda::GetDefaultStream());
+      get_original_prev_error_signals(i).get_const_base_ptr(),
+      get_prev_error_signals(i).get_base_ptr(),
+      default_hydrogen_stream());
   }
 }
 
@@ -868,10 +867,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::copy
     auto &shuffler = get_error_signals_shuffler(
         get_error_signals(i),
         get_original_error_signals(i));
-    shuffler.shuffle_forward(
-        get_error_signals(i).get_const_base_ptr(),
-        get_original_error_signals(i).get_base_ptr(),
-        hydrogen::cuda::GetDefaultStream());
+    shuffler.shuffle_forward(get_error_signals(i).get_const_base_ptr(),
+                             get_original_error_signals(i).get_base_ptr(),
+                             default_hydrogen_stream());
   }
 }
 

diff --git a/src/layers/io/input_layer.cpp b/src/layers/io/input_layer.cpp
@@ -304,7 +304,11 @@ void input_distconv_adapter<TensorDataType, T_layout, Dev>::setup_fp_tensors() {
       // only specialized for BaseAllocator.
       size_t buf_size = m_host_tensor->get_local_real_size() * sizeof(TensorDataType);
       TensorDataType *buf = nullptr;
+#if H2_HAS_CUDA
       CHECK_CUDA(cudaMallocHost(&buf, buf_size));
+#elif H2_HAS_ROCM
+      CHECK_ROCM(hipHostMalloc(&buf, buf_size));
+#endif
       // Note buf should be deallocated.
       dc::tensor::View(*m_host_tensor, buf);
       setup_shuffler_buffers(*m_original_host_tensor,
@@ -338,7 +342,7 @@ setup_activations_i(int index) const {
     const auto local_shape = get_activations_local_shape(index);
     auto t = std::make_unique<TensorDevType>(shape, loc, dist, local_shape);
     assert0(t->allocate());
-    t->zero(hydrogen::cuda::GetDefaultStream());
+    t->zero(default_hydrogen_stream());
     return t;
   }
   else {
@@ -434,7 +438,7 @@ template <typename TensorDataType,
 void input_distconv_adapter<TensorDataType, T_layout, Dev>::fp_compute() {
   auto &l = dynamic_cast<input_layer<
     TensorDataType, T_layout, Dev>&>(this->layer());
-  auto stream = hydrogen::cuda::GetDefaultStream();
+  auto stream = default_hydrogen_stream();
   // Note that the mini-batch size of the data reader is not
   // actually the one for the current mini-batch as the mini-batch
   // index is already updated by fp_compute.

diff --git a/src/layers/learning/distconv/distconv_layers.cpp b/src/layers/learning/distconv/distconv_layers.cpp
@@ -331,7 +331,7 @@ namespace distconv{
     const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator> &output_gradient, \
     tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator> &bias_gradient);
 
-ETI(float, cudnn::BackendCUDNN)
-ETI(double, cudnn::BackendCUDNN)
+  ETI(float, BackendDNNLib)
+  ETI(double, BackendDNNLib)
 #undef ETI
 } // namespace distconv