From 9b380f9b227ef6e0d034f946b403e832297cc6b8 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson" <benson31@llnl.gov>
Date: Fri, 14 Oct 2022 09:17:49 -0700
Subject: [PATCH 1/6] Updates to LBANN to support ROCm-enabled DistConv

---
 CMakeLists.txt                                | 22 +++++-----
 include/lbann/layers/activations/relu.hpp     |  4 ++
 .../learning/distconv/distconv_layers.hpp     | 18 ++++-----
 .../lbann/layers/transform/concatenate.hpp    |  4 +-
 include/lbann/layers/transform/pooling.hpp    | 40 ++++++++++++++++---
 include/lbann/utils/distconv.hpp              | 14 ++++++-
 src/callbacks/check_gradients.cpp             |  3 +-
 src/layers/data_type_distconv_adapter.cpp     | 16 ++++----
 src/layers/io/input_layer.cpp                 |  8 +++-
 .../learning/distconv/distconv_layers.cpp     |  4 +-
 src/layers/transform/split.cu                 | 10 ++---
 src/layers/transform/sum.cu                   | 10 ++---
 src/utils/distconv.cpp                        | 18 +++++----
 13 files changed, 112 insertions(+), 59 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d2a5cc7565..2b3575bd2e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -219,9 +219,19 @@ find_package(Hydrogen 1.5.0 CONFIG REQUIRED)
 message(STATUS "Found Hydrogen@${HYDROGEN_VERSION}: ${Hydrogen_DIR}")
 set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND})
 
+# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA,
+# LBANN gets CUDA.
+set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA})
+set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA})
+
+set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM})
+if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM)
+  set(LBANN_HAS_GPU TRUE)
+endif ()
+
 # DiHydrogen and Distconv
 if (LBANN_WITH_DISTCONV)
-  find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns DistConv)
+  find_package(DiHydrogen 0.3.0 CONFIG REQUIRED COMPONENTS Meta Patterns DistConv)
   set(LBANN_HAS_DISTCONV TRUE)
   set(LBANN_H2_LIBS
     H2::H2Meta
@@ -292,16 +302,6 @@ if (LBANN_WITH_ONEDNN)
   endif ()
 endif ()
 
-# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA,
-# LBANN gets CUDA.
-set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA})
-set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA})
-
-set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM})
-if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM)
-  set(LBANN_HAS_GPU TRUE)
-endif ()
-
 # Only used if have GPU and have CPU half.
 if (LBANN_HAS_GPU AND LBANN_HAS_HALF)
   set(LBANN_HAS_GPU_FP16 ${HYDROGEN_GPU_USE_FP16})
diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp
index 3f1ecb9d3f2..2bbd983c48d 100644
--- a/include/lbann/layers/activations/relu.hpp
+++ b/include/lbann/layers/activations/relu.hpp
@@ -30,6 +30,10 @@
 #include "lbann/layers/data_type_layer.hpp"
 #include "lbann/utils/distconv.hpp"
 
+#ifdef LBANN_HAS_DISTCONV
+#include "distconv/cudnn/relu.hpp"
+#endif
+
 namespace lbann {
 
 #ifdef LBANN_HAS_DISTCONV
diff --git a/include/lbann/layers/learning/distconv/distconv_layers.hpp b/include/lbann/layers/learning/distconv/distconv_layers.hpp
index abecb77723e..d807268c0de 100644
--- a/include/lbann/layers/learning/distconv/distconv_layers.hpp
+++ b/include/lbann/layers/learning/distconv/distconv_layers.hpp
@@ -48,25 +48,25 @@ namespace distconv{
             tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output);
 
     template <typename Allocator>
-    int apply_bias(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &bias, 
+    int apply_bias(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &bias,
                  tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output);
-    
+
     template <typename Allocator>
     int backward_wrt_input(
       bool transpose_A,
       const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
       const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &linearity,
       tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input_grad);
-    
+
     template <typename Allocator>
     int backward_wrt_weight(
       bool transpose,
       DataType dst_scale,
       DataType gradient_scale,
-      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input, 
+      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input,
       const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
       tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &linearity_grad);
-    
+
     template <typename Allocator>
     int backward_wrt_bias(
       DataType gradient_scale,
@@ -80,21 +80,21 @@ namespace distconv{
 
 
   template <typename DataType, typename locale, typename Allocator>
-  tensor::Shape 
+  tensor::Shape
   get_fc_output_local_tensor_shape(const tensor::Tensor<DataType, locale, Allocator> &input,
                                    const int_vector &linearity_dims,
                                    bool transpose){
 
     //https://github.com/LLNL/DiHydrogen/blob/7f86db1f9701ac3afb5e16aefdd57563d57a1698/legacy/include/distconv/distconv.hpp#L173
 
-    //Get the input layer local tensor shape 
+    //Get the input layer local tensor shape
 
     auto output_local_shape = input.get_local_shape();
     output_local_shape[0] = transpose? linearity_dims[1] : linearity_dims[0];
     return output_local_shape;
   }
-extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, float>;
-extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, double>;
+extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, float>;
+extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, double>;
 }  // namespace distconv
 
 #endif // LBANN_HAS_DISTCONV
diff --git a/include/lbann/layers/transform/concatenate.hpp b/include/lbann/layers/transform/concatenate.hpp
index e2c26082f66..355c23d3407 100644
--- a/include/lbann/layers/transform/concatenate.hpp
+++ b/include/lbann/layers/transform/concatenate.hpp
@@ -468,7 +468,7 @@ fp_compute() {
   dc::tensor::Concatenate(this->get_activations(0),
                           this->get_prev_activations(0),
                           this->get_prev_activations(1),
-                          hydrogen::cuda::GetDefaultStream());
+                          default_hydrogen_stream());
 }
 
 template <typename TensorDataType, data_layout Layout, El::Device Device>
@@ -477,7 +477,7 @@ bp_compute() {
   dc::tensor::Slice(this->get_error_signals(0),
                     this->get_error_signals(1),
                     this->get_prev_error_signals(0),
-                    hydrogen::cuda::GetDefaultStream());
+                    default_hydrogen_stream());
 }
 #endif // LBANN_HAS_DISTCONV
 
diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp
index 9a8e4765e3b..778c7772762 100644
--- a/include/lbann/layers/transform/pooling.hpp
+++ b/include/lbann/layers/transform/pooling.hpp
@@ -28,6 +28,7 @@
 #define LBANN_LAYER_POOLING_HPP_INCLUDED
 
 #include "lbann/layers/data_type_layer.hpp"
+#include "lbann/models/model.hpp"
 #include "lbann/utils/dim_helpers.hpp"
 #include "lbann/utils/dnn_enums.hpp"
 #ifdef LBANN_HAS_DNN_LIB
@@ -69,7 +70,7 @@ class pooling_distconv_adapter : public data_type_distconv_adapter<TensorDataTyp
   void setup_distributions(tensor_overlap_constraints &constraints) override;
   dc::Shape get_activations_local_shape(int index=0) const override;
   void setup_layer(size_t workspace_capacity) override;
-  void fp_compute();
+  void fp_compute(bool training = true); // training=true for max back-compatibility.
   void bp_compute();
   std::unique_ptr<dc::Pooling<TensorDataType>> m_pooling;
 };
@@ -295,12 +296,41 @@ class pooling_layer : public data_type_layer<TensorDataType> {
     if(this->using_gpus()) {
 #ifdef LBANN_HAS_DISTCONV
       if (this->distconv_enabled()) {
-        get_distconv_adapter().fp_compute();
+        static size_t counter = 0;
+        ++counter;
+        const auto& mode
+          = this->m_model->get_execution_context().get_execution_mode();
+        if (counter % 10000 == 0) {
+          std::cout << "** DC POOLING: " << counter
+                    << " (this=" << this << ", mode=" << to_string(mode) << ") **"
+                    << std::endl;
+        }
+        get_distconv_adapter().fp_compute(mode == execution_mode::training);
         return;
       }
 #endif // LBANN_HAS_DISTCONV
+      {
+        static size_t counter = 0;
+        ++counter;
+        if (counter % 10000 == 0) {
+          const auto& mode
+            = this->m_model->get_execution_context().get_execution_mode();
+          std::cout << "** POOLING: " << ++counter
+                    << " (this=" << this << ", mode=" << to_string(mode) << ") **"
+                    << std::endl;
+        }
+      }
       fp_compute_dnn();
-    } else {
+    }
+    else {
+      {
+        static size_t counter = 0;
+        const auto& mode
+          = this->m_model->get_execution_context().get_execution_mode();
+        std::cout << "** IM2COL POOLING: " << ++counter
+                  << " (this=" << this << ", mode=" << to_string(mode) << ") **"
+                  << std::endl;
+      }
       fp_compute_im2col();
     }
   }
@@ -802,9 +832,9 @@ setup_layer(size_t workspace_capacity) {
 
 template <typename TensorDataType, data_layout Layout, El::Device Device>
 void pooling_distconv_adapter<TensorDataType, Layout, Device>::
-fp_compute() {
+fp_compute(bool const training) {
   m_pooling->forward(El::To<TensorDataType>(1), this->get_prev_activations(),
-                     El::To<TensorDataType>(0), this->get_activations());
+                     El::To<TensorDataType>(0), this->get_activations(), training);
 }
 
 template <typename TensorDataType, data_layout Layout, El::Device Device>
diff --git a/include/lbann/utils/distconv.hpp b/include/lbann/utils/distconv.hpp
index f21f288c194..c21d719b990 100644
--- a/include/lbann/utils/distconv.hpp
+++ b/include/lbann/utils/distconv.hpp
@@ -40,6 +40,7 @@
 #endif
 
 #include "distconv/distconv.hpp"
+#include "distconv/cudnn/backend.hpp"
 #include "distconv/tensor/tensor_mpi_cuda.hpp"
 #include "distconv/tensor/shuffle_mpi.hpp"
 #include "distconv/tensor/shuffle_mpi_cuda.hpp"
@@ -55,6 +56,15 @@
 #include "lbann/layers/learning/distconv/distconv_layers.hpp"
 namespace lbann {
 
+inline auto default_hydrogen_stream()
+{
+#if H2_HAS_CUDA
+  return hydrogen::cuda::GetDefaultStream();
+#elif H2_HAS_ROCM
+  return hydrogen::rocm::GetDefaultStream();
+#endif
+}
+
 class Layer;
 
 namespace dc {
@@ -109,7 +119,7 @@ using MPIRootPrintStreamInfo = ::distconv::util::MPIRootPrintStreamInfo;
 using MPIRootPrintStreamWaning = ::distconv::util::MPIRootPrintStreamWarning;
 
 // Distconv layer classes
-using Backend = ::distconv::cudnn::BackendCUDNN;
+using Backend = ::distconv::BackendDNNLib;
 using ReLU = ::distconv::ReLU<Backend>;
 using LeakyReLU = ::distconv::LeakyReLU<Backend>;
 template <typename TensorDataType>
@@ -232,7 +242,7 @@ Dist get_hydrogen_data_parallel_distribution(int num_dims);
 template <typename Tensor>
 void dump_tensor(const Tensor &t, const std::string &path) {
   dc::MPIPrintStreamDebug() << "Dumping tensor to " << path;
-  cudaDeviceSynchronize();
+  h2::gpu::sync();
   distconv::dump_tensor(t, path, true);
 }
 
diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp
index d76856f50ee..44dc3fd91c1 100644
--- a/src/callbacks/check_gradients.cpp
+++ b/src/callbacks/check_gradients.cpp
@@ -133,10 +133,11 @@ struct CheckWeightsFunctor : DefaultErrorReporter
     // Get weights matrix and gradient
     const auto& weights_matrix = dtw.get_values();
     const auto& gradient = dtw.get_optimizer()->get_gradient();
-
+    //std::cout << "*** CHECKING WEIGHTS: NAME=" << dtw.get_name() << ", SIZE=" << weights_matrix.Height() << "x" << weights_matrix.Width() << std::endl;
     // Iterate through weights matrix entries
     for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
       for (El::Int row = 0; row < weights_matrix.Height(); ++row) {
+        //std::cout << "*** CHECKING WEIGHTS (name=" << dtw.get_name() << "): row=" << row << ", col=" << col << std::endl;
         const bool weight_is_local = weights_matrix.IsLocal(row, col);
         const El::Int local_row = (weight_is_local
                                    ? weights_matrix.LocalRow(row)
diff --git a/src/layers/data_type_distconv_adapter.cpp b/src/layers/data_type_distconv_adapter.cpp
index 00763b58c8a..4581a915e54 100644
--- a/src/layers/data_type_distconv_adapter.cpp
+++ b/src/layers/data_type_distconv_adapter.cpp
@@ -332,7 +332,7 @@ setup_prev_activations_i(int index) const {
     const dc::LocaleMPI loc(dc::get_mpi_comm(), false);
     t = std::make_unique<InputTensorDevType>(shape, loc, dist, local_shape);
     assert0(t->allocate());
-    t->zero(hydrogen::cuda::GetDefaultStream());
+    t->zero(default_hydrogen_stream());
   } else {
     // Create a shallow copy
     const auto &parent_activations =
@@ -427,7 +427,7 @@ setup_activations_i(int index) const {
   const auto local_shape = get_activations_local_shape(index);
   auto t = std::make_unique<OutputTensorDevType>(shape, loc, dist, local_shape);
   assert0(t->allocate());
-  t->zero(hydrogen::cuda::GetDefaultStream());
+  t->zero(default_hydrogen_stream());
   return t;
 }
 
@@ -483,7 +483,7 @@ setup_prev_error_signals_i(int index) const {
     const dc::LocaleMPI loc(dc::get_mpi_comm(), false);
     t = std::make_unique<OutputTensorDevType>(shape, loc, dist, local_shape);
     assert0(t->allocate());
-    t->zero(hydrogen::cuda::GetDefaultStream());
+    t->zero(default_hydrogen_stream());
   } else {
     // Create a shallow copy
     const auto &child_error_signals =
@@ -550,7 +550,7 @@ setup_error_signals_i(int index) const {
   const auto local_shape = get_error_signals_local_shape(index);
   auto t = std::make_unique<InputTensorDevType>(shape, loc, dist, local_shape);
   assert0(t->allocate());
-  t->zero(hydrogen::cuda::GetDefaultStream());
+  t->zero(default_hydrogen_stream());
   return t;
 }
 
@@ -799,7 +799,7 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::ensu
     shuffler.shuffle_forward(
         get_original_prev_activations().get_const_base_ptr(),
         get_prev_activations().get_base_ptr(),
-        hydrogen::cuda::GetDefaultStream());
+        default_hydrogen_stream());
   }
 }
 
@@ -821,7 +821,7 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::copy
     shuffler.shuffle_forward(
         get_activations().get_const_base_ptr(),
         get_original_activations().get_base_ptr(),
-        hydrogen::cuda::GetDefaultStream());
+        default_hydrogen_stream());
   }
 }
 
@@ -848,7 +848,7 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::ensu
     shuffler.shuffle_forward(
         get_original_prev_error_signals(i).get_const_base_ptr(),
         get_prev_error_signals(i).get_base_ptr(),
-        hydrogen::cuda::GetDefaultStream());
+        default_hydrogen_stream());
   }
 }
 
@@ -871,7 +871,7 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::copy
     shuffler.shuffle_forward(
         get_error_signals(i).get_const_base_ptr(),
         get_original_error_signals(i).get_base_ptr(),
-        hydrogen::cuda::GetDefaultStream());
+        default_hydrogen_stream());
   }
 }
 
diff --git a/src/layers/io/input_layer.cpp b/src/layers/io/input_layer.cpp
index 68664436ff2..dd267e692b2 100644
--- a/src/layers/io/input_layer.cpp
+++ b/src/layers/io/input_layer.cpp
@@ -304,7 +304,11 @@ void input_distconv_adapter<TensorDataType, T_layout, Dev>::setup_fp_tensors() {
       // only specialized for BaseAllocator.
       size_t buf_size = m_host_tensor->get_local_real_size() * sizeof(TensorDataType);
       TensorDataType *buf = nullptr;
+#if H2_HAS_CUDA
       CHECK_CUDA(cudaMallocHost(&buf, buf_size));
+#elif H2_HAS_ROCM
+      CHECK_ROCM(hipHostMalloc(&buf, buf_size));
+#endif
       // Note buf should be deallocated.
       dc::tensor::View(*m_host_tensor, buf);
       setup_shuffler_buffers(*m_original_host_tensor,
@@ -338,7 +342,7 @@ setup_activations_i(int index) const {
     const auto local_shape = get_activations_local_shape(index);
     auto t = std::make_unique<TensorDevType>(shape, loc, dist, local_shape);
     assert0(t->allocate());
-    t->zero(hydrogen::cuda::GetDefaultStream());
+    t->zero(default_hydrogen_stream());
     return t;
   }
   else {
@@ -434,7 +438,7 @@ template <typename TensorDataType,
 void input_distconv_adapter<TensorDataType, T_layout, Dev>::fp_compute() {
   auto &l = dynamic_cast<input_layer<
     TensorDataType, T_layout, Dev>&>(this->layer());
-  auto stream = hydrogen::cuda::GetDefaultStream();
+  auto stream = default_hydrogen_stream();
   // Note that the mini-batch size of the data reader is not
   // actually the one for the current mini-batch as the mini-batch
   // index is already updated by fp_compute.
diff --git a/src/layers/learning/distconv/distconv_layers.cpp b/src/layers/learning/distconv/distconv_layers.cpp
index 92dd171dd35..22fa9f4df90 100644
--- a/src/layers/learning/distconv/distconv_layers.cpp
+++ b/src/layers/learning/distconv/distconv_layers.cpp
@@ -331,7 +331,7 @@ namespace distconv{
     const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator> &output_gradient, \
     tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator> &bias_gradient);
 
-ETI(float, cudnn::BackendCUDNN)
-ETI(double, cudnn::BackendCUDNN)
+ETI(float, BackendDNNLib)
+ETI(double, BackendDNNLib)
 #undef ETI
 } // namespace distconv
diff --git a/src/layers/transform/split.cu b/src/layers/transform/split.cu
index ca212564f89..2f2c881e49d 100644
--- a/src/layers/transform/split.cu
+++ b/src/layers/transform/split.cu
@@ -71,29 +71,29 @@ void split_distconv_adapter<TensorDataType, Layout, Dev>::bp_compute() {
   auto &error_signals = this->get_error_signals(0);
   switch (this->layer().get_num_children()) {
     case 0:
-      error_signals.zero(hydrogen::cuda::GetDefaultStream());
+      error_signals.zero(default_hydrogen_stream());
       break;
     case 1:
       dc::tensor::Copy(error_signals,
                        this->get_prev_error_signals(0),
-                       hydrogen::cuda::GetDefaultStream());
+                       default_hydrogen_stream());
       break;
     case 2:
       dc::tensor::Transform(error_signals,
                             this->get_prev_error_signals(0),
                             this->get_prev_error_signals(1),
                             sum_op<TensorDataType>(),
-                            hydrogen::cuda::GetDefaultStream());
+                            default_hydrogen_stream());
       break;
     default:
       dc::tensor::Copy(error_signals,
                        this->get_prev_error_signals(1),
-                       hydrogen::cuda::GetDefaultStream());
+                       default_hydrogen_stream());
       for (int i = 1; i < this->layer().get_num_children(); ++i) {
         const auto &prev_error = this->get_prev_error_signals(i);
         dc::tensor::Transform(error_signals, prev_error,
                               accumulate_op<TensorDataType>(),
-                              hydrogen::cuda::GetDefaultStream());
+                              default_hydrogen_stream());
       }
   }
   return;
diff --git a/src/layers/transform/sum.cu b/src/layers/transform/sum.cu
index c9ea29b76e4..578d357a16f 100644
--- a/src/layers/transform/sum.cu
+++ b/src/layers/transform/sum.cu
@@ -67,11 +67,11 @@ void sum_distconv_adapter<TensorDataType, Layout, Dev>::fp_compute() {
   auto &activations = this->get_activations();
   switch (this->layer().get_num_parents()) {
     case 0:
-      activations.zero(hydrogen::cuda::GetDefaultStream());
+      activations.zero(default_hydrogen_stream());
       break;
     case 1:
       dc::tensor::Copy(activations, this->get_prev_activations(),
-                       hydrogen::cuda::GetDefaultStream());
+                       default_hydrogen_stream());
       break;
     case 2:
       // Optimization for layers with 2 parents (e.g.,
@@ -82,7 +82,7 @@ void sum_distconv_adapter<TensorDataType, Layout, Dev>::fp_compute() {
                             this->get_prev_activations(0),
                             this->get_prev_activations(1),
                             sum_op<TensorDataType>(),
-                            hydrogen::cuda::GetDefaultStream());
+                            default_hydrogen_stream());
       break;
     default:
       for (int i = 0; i < this->layer().get_num_parents(); ++i) {
@@ -90,11 +90,11 @@ void sum_distconv_adapter<TensorDataType, Layout, Dev>::fp_compute() {
         prev_activations.set_outermost_dimension(activations.get_shape()[-1]);
         if (i == 0) {
           dc::tensor::Copy(activations, prev_activations,
-                           hydrogen::cuda::GetDefaultStream());
+                           default_hydrogen_stream());
         } else {
           distconv::tensor::Transform(activations, prev_activations,
                                       accumulate_op<TensorDataType>(),
-                                      hydrogen::cuda::GetDefaultStream());
+                                      default_hydrogen_stream());
         }
       }
   }
diff --git a/src/utils/distconv.cpp b/src/utils/distconv.cpp
index 49fc966aef8..723c7749d3a 100644
--- a/src/utils/distconv.cpp
+++ b/src/utils/distconv.cpp
@@ -209,7 +209,7 @@ TensorDataType *get_shuffler_src_buf(const TensorDev<TensorDataType> &tensor) {
     shuffler_src_buf_size = TensorShuffler<TensorDataType>::get_buf_size(tensor);
     MPIPrintStreamDebug() << "Allocating shared shuffler buffer of size "
                           << shuffler_src_buf_size;
-    DISTCONV_CUDA_MALLOC(&shuffler_src_buf, shuffler_src_buf_size);
+    DISTCONV_GPU_MALLOC(&shuffler_src_buf, shuffler_src_buf_size);
   }
   // Returns the pre-allocated memory if it's large enough
   size_t required_size = TensorShuffler<TensorDataType>::get_buf_size(tensor);
@@ -228,7 +228,7 @@ TensorDataType *get_shuffler_dst_buf(const TensorDev<TensorDataType> &tensor) {
     shuffler_dst_buf_size = TensorShuffler<TensorDataType>::get_buf_size(tensor);
     MPIPrintStreamDebug() << "Allocating shared shuffler buffer of size "
                           << shuffler_src_buf_size;
-    DISTCONV_CUDA_MALLOC(&shuffler_dst_buf, shuffler_dst_buf_size);
+    DISTCONV_GPU_MALLOC(&shuffler_dst_buf, shuffler_dst_buf_size);
   }
   size_t required_size = TensorShuffler<TensorDataType>::get_buf_size(tensor);
   // Returns the pre-allocated memory if it's large enough
@@ -241,11 +241,11 @@ TensorDataType *get_shuffler_dst_buf(const TensorDev<TensorDataType> &tensor) {
 }
 void delete_shuffler_buffers() {
   if (shuffler_src_buf) {
-    CHECK_CUDA(cudaFree(shuffler_src_buf));
+    DISTCONV_CHECK_GPU(GPU_FREE(shuffler_src_buf));
     shuffler_src_buf = nullptr;
   }
   if (shuffler_dst_buf) {
-    CHECK_CUDA(cudaFree(shuffler_dst_buf));
+    DISTCONV_CHECK_GPU(GPU_FREE(shuffler_dst_buf));
     shuffler_dst_buf = nullptr;
   }
 }
@@ -291,12 +291,12 @@ void initialize(MPI_Comm comm) {
   p2p_instance = new p2p::P2P(mpi_comm);
 #endif // DISTCONV_HAS_P2P
   hosttransfer_comm_instance = new Al::hosttransfer_backend::comm_type(
-      mpi_comm, hydrogen::cuda::GetDefaultStream());
-  ::distconv::cudnn::Options backend_opts;
+      mpi_comm, default_hydrogen_stream());
+  ::distconv::backend::Options backend_opts;
   backend_opts.m_deterministic = opt_deterministic;
   backend_instance = new Backend(
       mpi_comm, lbann::dnn_lib::get_handle(),
-      hydrogen::cuda::GetDefaultStream(), backend_opts);
+      default_hydrogen_stream(), backend_opts);
   print_options(std::cout);
   initialized = true;
 }
@@ -454,7 +454,11 @@ Dist get_hydrogen_data_parallel_distribution(int num_dims) {
 
 size_t get_workspace_capacity() {
   size_t available, total;
+#if H2_HAS_CUDA
   FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total));
+#elif H2_HAS_ROCM
+  FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total));
+#endif
   size_t workspace_capacity = available;
   // set aside some space for shuffling, halo exchange, etc.
   workspace_capacity -= 1 << 28;

From 9b258f2c5441dae2cb4410b2910880fc71262693 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson" <benson31@llnl.gov>
Date: Fri, 14 Oct 2022 09:24:38 -0700
Subject: [PATCH 2/6] Clean up debugging output; clang-format the diff to
 develop.

---
 .../learning/distconv/distconv_layers.hpp     | 29 ++++++------
 include/lbann/layers/transform/pooling.hpp    | 45 +++++--------------
 include/lbann/utils/distconv.hpp              |  6 +--
 python/lbann/contrib/lc/systems.py            |  2 +-
 src/callbacks/check_gradients.cpp             |  6 ++-
 src/layers/data_type_distconv_adapter.cpp     | 26 +++++------
 .../learning/distconv/distconv_layers.cpp     |  4 +-
 src/layers/transform/split.cu                 |  3 +-
 src/layers/transform/sum.cu                   |  9 ++--
 src/utils/distconv.cpp                        | 12 ++---
 10 files changed, 65 insertions(+), 77 deletions(-)

diff --git a/include/lbann/layers/learning/distconv/distconv_layers.hpp b/include/lbann/layers/learning/distconv/distconv_layers.hpp
index d807268c0de..b5c31fc4544 100644
--- a/include/lbann/layers/learning/distconv/distconv_layers.hpp
+++ b/include/lbann/layers/learning/distconv/distconv_layers.hpp
@@ -48,8 +48,9 @@ namespace distconv{
             tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output);
 
     template <typename Allocator>
-    int apply_bias(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &bias,
-                 tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output);
+    int apply_bias(
+      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& bias,
+      tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& output);
 
     template <typename Allocator>
     int backward_wrt_input(
@@ -63,9 +64,9 @@ namespace distconv{
       bool transpose,
       DataType dst_scale,
       DataType gradient_scale,
-      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input,
-      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
-      tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &linearity_grad);
+      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& input,
+      const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& output_grad,
+      tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& linearity_grad);
 
     template <typename Allocator>
     int backward_wrt_bias(
@@ -78,23 +79,25 @@ namespace distconv{
     Backend &m_be;
   }; // class definition ChannelwiseFullyConnected
 
-
   template <typename DataType, typename locale, typename Allocator>
-  tensor::Shape
-  get_fc_output_local_tensor_shape(const tensor::Tensor<DataType, locale, Allocator> &input,
-                                   const int_vector &linearity_dims,
-                                   bool transpose){
+  tensor::Shape get_fc_output_local_tensor_shape(
+    const tensor::Tensor<DataType, locale, Allocator>& input,
+    const int_vector& linearity_dims,
+    bool transpose)
+  {
 
     //https://github.com/LLNL/DiHydrogen/blob/7f86db1f9701ac3afb5e16aefdd57563d57a1698/legacy/include/distconv/distconv.hpp#L173
 
-    //Get the input layer local tensor shape
+    // Get the input layer local tensor shape
 
     auto output_local_shape = input.get_local_shape();
     output_local_shape[0] = transpose? linearity_dims[1] : linearity_dims[0];
     return output_local_shape;
   }
-extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, float>;
-extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, double>;
+  extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib,
+                                                  float>;
+  extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib,
+                                                  double>;
 }  // namespace distconv
 
 #endif // LBANN_HAS_DISTCONV
diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp
index 778c7772762..6b2644dff01 100644
--- a/include/lbann/layers/transform/pooling.hpp
+++ b/include/lbann/layers/transform/pooling.hpp
@@ -70,7 +70,8 @@ class pooling_distconv_adapter : public data_type_distconv_adapter<TensorDataTyp
   void setup_distributions(tensor_overlap_constraints &constraints) override;
   dc::Shape get_activations_local_shape(int index=0) const override;
   void setup_layer(size_t workspace_capacity) override;
-  void fp_compute(bool training = true); // training=true for max back-compatibility.
+  void
+  fp_compute(bool training = true); // training=true for max back-compatibility.
   void bp_compute();
   std::unique_ptr<dc::Pooling<TensorDataType>> m_pooling;
 };
@@ -296,41 +297,15 @@ class pooling_layer : public data_type_layer<TensorDataType> {
     if(this->using_gpus()) {
 #ifdef LBANN_HAS_DISTCONV
       if (this->distconv_enabled()) {
-        static size_t counter = 0;
-        ++counter;
-        const auto& mode
-          = this->m_model->get_execution_context().get_execution_mode();
-        if (counter % 10000 == 0) {
-          std::cout << "** DC POOLING: " << counter
-                    << " (this=" << this << ", mode=" << to_string(mode) << ") **"
-                    << std::endl;
-        }
+        const auto& mode =
+          this->m_model->get_execution_context().get_execution_mode();
         get_distconv_adapter().fp_compute(mode == execution_mode::training);
         return;
       }
 #endif // LBANN_HAS_DISTCONV
-      {
-        static size_t counter = 0;
-        ++counter;
-        if (counter % 10000 == 0) {
-          const auto& mode
-            = this->m_model->get_execution_context().get_execution_mode();
-          std::cout << "** POOLING: " << ++counter
-                    << " (this=" << this << ", mode=" << to_string(mode) << ") **"
-                    << std::endl;
-        }
-      }
       fp_compute_dnn();
     }
     else {
-      {
-        static size_t counter = 0;
-        const auto& mode
-          = this->m_model->get_execution_context().get_execution_mode();
-        std::cout << "** IM2COL POOLING: " << ++counter
-                  << " (this=" << this << ", mode=" << to_string(mode) << ") **"
-                  << std::endl;
-      }
       fp_compute_im2col();
     }
   }
@@ -831,10 +806,14 @@ setup_layer(size_t workspace_capacity) {
 }
 
 template <typename TensorDataType, data_layout Layout, El::Device Device>
-void pooling_distconv_adapter<TensorDataType, Layout, Device>::
-fp_compute(bool const training) {
-  m_pooling->forward(El::To<TensorDataType>(1), this->get_prev_activations(),
-                     El::To<TensorDataType>(0), this->get_activations(), training);
+void pooling_distconv_adapter<TensorDataType, Layout, Device>::fp_compute(
+  bool const training)
+{
+  m_pooling->forward(El::To<TensorDataType>(1),
+                     this->get_prev_activations(),
+                     El::To<TensorDataType>(0),
+                     this->get_activations(),
+                     training);
 }
 
 template <typename TensorDataType, data_layout Layout, El::Device Device>
diff --git a/include/lbann/utils/distconv.hpp b/include/lbann/utils/distconv.hpp
index c21d719b990..dfb8fb03dee 100644
--- a/include/lbann/utils/distconv.hpp
+++ b/include/lbann/utils/distconv.hpp
@@ -39,13 +39,13 @@
 #define DISTCONV_DEBUG
 #endif
 
-#include "distconv/distconv.hpp"
 #include "distconv/cudnn/backend.hpp"
-#include "distconv/tensor/tensor_mpi_cuda.hpp"
+#include "distconv/distconv.hpp"
+#include "distconv/tensor/algorithms.hpp"
 #include "distconv/tensor/shuffle_mpi.hpp"
 #include "distconv/tensor/shuffle_mpi_cuda.hpp"
 #include "distconv/tensor/shuffle_mpi_cuda_al.hpp"
-#include "distconv/tensor/algorithms.hpp"
+#include "distconv/tensor/tensor_mpi_cuda.hpp"
 #include "distconv/util/util.hpp"
 #ifdef DISTCONV_HAS_P2P
 #include "p2p/p2p.hpp"
diff --git a/python/lbann/contrib/lc/systems.py b/python/lbann/contrib/lc/systems.py
index 08cec5a82d7..bee6bb0f313 100644
--- a/python/lbann/contrib/lc/systems.py
+++ b/python/lbann/contrib/lc/systems.py
@@ -25,7 +25,7 @@ def __init__(self, cores_per_node, gpus_per_node, scheduler):
     'sierra':   SystemParams(44, 4, 'lsf'),
     'rzansel':  SystemParams(44, 4, 'lsf'),
     'rzhasgpu': SystemParams(16, 2, 'slurm'),
-    'tioga':    SystemParams(64, 8, 'flux'),
+    'tioga':    SystemParams(64, 4, 'flux'),
 }
 
 # Detect system
diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp
index 44dc3fd91c1..673d0347dae 100644
--- a/src/callbacks/check_gradients.cpp
+++ b/src/callbacks/check_gradients.cpp
@@ -133,11 +133,13 @@ struct CheckWeightsFunctor : DefaultErrorReporter
     // Get weights matrix and gradient
     const auto& weights_matrix = dtw.get_values();
     const auto& gradient = dtw.get_optimizer()->get_gradient();
-    //std::cout << "*** CHECKING WEIGHTS: NAME=" << dtw.get_name() << ", SIZE=" << weights_matrix.Height() << "x" << weights_matrix.Width() << std::endl;
+    // std::cout << "*** CHECKING WEIGHTS: NAME=" << dtw.get_name() << ", SIZE="
+    // << weights_matrix.Height() << "x" << weights_matrix.Width() << std::endl;
     // Iterate through weights matrix entries
     for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
       for (El::Int row = 0; row < weights_matrix.Height(); ++row) {
-        //std::cout << "*** CHECKING WEIGHTS (name=" << dtw.get_name() << "): row=" << row << ", col=" << col << std::endl;
+        // std::cout << "*** CHECKING WEIGHTS (name=" << dtw.get_name() << "):
+        // row=" << row << ", col=" << col << std::endl;
         const bool weight_is_local = weights_matrix.IsLocal(row, col);
         const El::Int local_row = (weight_is_local
                                    ? weights_matrix.LocalRow(row)
diff --git a/src/layers/data_type_distconv_adapter.cpp b/src/layers/data_type_distconv_adapter.cpp
index 4581a915e54..55684aaf81a 100644
--- a/src/layers/data_type_distconv_adapter.cpp
+++ b/src/layers/data_type_distconv_adapter.cpp
@@ -797,9 +797,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::ensu
         get_original_prev_activations(),
         get_prev_activations());
     shuffler.shuffle_forward(
-        get_original_prev_activations().get_const_base_ptr(),
-        get_prev_activations().get_base_ptr(),
-        default_hydrogen_stream());
+      get_original_prev_activations().get_const_base_ptr(),
+      get_prev_activations().get_base_ptr(),
+      default_hydrogen_stream());
   }
 }
 
@@ -818,10 +818,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::copy
     auto &shuffler = get_activations_shuffler(
         get_activations(),
         get_original_activations());
-    shuffler.shuffle_forward(
-        get_activations().get_const_base_ptr(),
-        get_original_activations().get_base_ptr(),
-        default_hydrogen_stream());
+    shuffler.shuffle_forward(get_activations().get_const_base_ptr(),
+                             get_original_activations().get_base_ptr(),
+                             default_hydrogen_stream());
   }
 }
 
@@ -846,9 +845,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::ensu
         get_original_prev_error_signals(i),
         get_prev_error_signals(i));
     shuffler.shuffle_forward(
-        get_original_prev_error_signals(i).get_const_base_ptr(),
-        get_prev_error_signals(i).get_base_ptr(),
-        default_hydrogen_stream());
+      get_original_prev_error_signals(i).get_const_base_ptr(),
+      get_prev_error_signals(i).get_base_ptr(),
+      default_hydrogen_stream());
   }
 }
 
@@ -868,10 +867,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::copy
     auto &shuffler = get_error_signals_shuffler(
         get_error_signals(i),
         get_original_error_signals(i));
-    shuffler.shuffle_forward(
-        get_error_signals(i).get_const_base_ptr(),
-        get_original_error_signals(i).get_base_ptr(),
-        default_hydrogen_stream());
+    shuffler.shuffle_forward(get_error_signals(i).get_const_base_ptr(),
+                             get_original_error_signals(i).get_base_ptr(),
+                             default_hydrogen_stream());
   }
 }
 
diff --git a/src/layers/learning/distconv/distconv_layers.cpp b/src/layers/learning/distconv/distconv_layers.cpp
index 22fa9f4df90..6da0a9e546c 100644
--- a/src/layers/learning/distconv/distconv_layers.cpp
+++ b/src/layers/learning/distconv/distconv_layers.cpp
@@ -331,7 +331,7 @@ namespace distconv{
     const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator> &output_gradient, \
     tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator> &bias_gradient);
 
-ETI(float, BackendDNNLib)
-ETI(double, BackendDNNLib)
+  ETI(float, BackendDNNLib)
+  ETI(double, BackendDNNLib)
 #undef ETI
 } // namespace distconv
diff --git a/src/layers/transform/split.cu b/src/layers/transform/split.cu
index 2f2c881e49d..b9f8ecdcb89 100644
--- a/src/layers/transform/split.cu
+++ b/src/layers/transform/split.cu
@@ -91,7 +91,8 @@ void split_distconv_adapter<TensorDataType, Layout, Dev>::bp_compute() {
                        default_hydrogen_stream());
       for (int i = 1; i < this->layer().get_num_children(); ++i) {
         const auto &prev_error = this->get_prev_error_signals(i);
-        dc::tensor::Transform(error_signals, prev_error,
+        dc::tensor::Transform(error_signals,
+                              prev_error,
                               accumulate_op<TensorDataType>(),
                               default_hydrogen_stream());
       }
diff --git a/src/layers/transform/sum.cu b/src/layers/transform/sum.cu
index 578d357a16f..d0ee8adf1c8 100644
--- a/src/layers/transform/sum.cu
+++ b/src/layers/transform/sum.cu
@@ -70,7 +70,8 @@ void sum_distconv_adapter<TensorDataType, Layout, Dev>::fp_compute() {
       activations.zero(default_hydrogen_stream());
       break;
     case 1:
-      dc::tensor::Copy(activations, this->get_prev_activations(),
+      dc::tensor::Copy(activations,
+                       this->get_prev_activations(),
                        default_hydrogen_stream());
       break;
     case 2:
@@ -89,10 +90,12 @@ void sum_distconv_adapter<TensorDataType, Layout, Dev>::fp_compute() {
         auto &prev_activations = this->get_prev_activations(i);
         prev_activations.set_outermost_dimension(activations.get_shape()[-1]);
         if (i == 0) {
-          dc::tensor::Copy(activations, prev_activations,
+          dc::tensor::Copy(activations,
+                           prev_activations,
                            default_hydrogen_stream());
         } else {
-          distconv::tensor::Transform(activations, prev_activations,
+          distconv::tensor::Transform(activations,
+                                      prev_activations,
                                       accumulate_op<TensorDataType>(),
                                       default_hydrogen_stream());
         }
diff --git a/src/utils/distconv.cpp b/src/utils/distconv.cpp
index 723c7749d3a..d89f9da5052 100644
--- a/src/utils/distconv.cpp
+++ b/src/utils/distconv.cpp
@@ -290,13 +290,15 @@ void initialize(MPI_Comm comm) {
 #ifdef DISTCONV_HAS_P2P
   p2p_instance = new p2p::P2P(mpi_comm);
 #endif // DISTCONV_HAS_P2P
-  hosttransfer_comm_instance = new Al::hosttransfer_backend::comm_type(
-      mpi_comm, default_hydrogen_stream());
+  hosttransfer_comm_instance =
+    new Al::hosttransfer_backend::comm_type(mpi_comm,
+                                            default_hydrogen_stream());
   ::distconv::backend::Options backend_opts;
   backend_opts.m_deterministic = opt_deterministic;
-  backend_instance = new Backend(
-      mpi_comm, lbann::dnn_lib::get_handle(),
-      default_hydrogen_stream(), backend_opts);
+  backend_instance = new Backend(mpi_comm,
+                                 lbann::dnn_lib::get_handle(),
+                                 default_hydrogen_stream(),
+                                 backend_opts);
   print_options(std::cout);
   initialized = true;
 }

From 102729ab9861792ab5956ddea3eef9c3a6f3d632 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson" <benson31@llnl.gov>
Date: Tue, 18 Oct 2022 08:51:06 -0700
Subject: [PATCH 3/6] Undo accidentally-committed debugging changes

---
 python/lbann/contrib/lc/systems.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/lbann/contrib/lc/systems.py b/python/lbann/contrib/lc/systems.py
index bee6bb0f313..08cec5a82d7 100644
--- a/python/lbann/contrib/lc/systems.py
+++ b/python/lbann/contrib/lc/systems.py
@@ -25,7 +25,7 @@ def __init__(self, cores_per_node, gpus_per_node, scheduler):
     'sierra':   SystemParams(44, 4, 'lsf'),
     'rzansel':  SystemParams(44, 4, 'lsf'),
     'rzhasgpu': SystemParams(16, 2, 'slurm'),
-    'tioga':    SystemParams(64, 4, 'flux'),
+    'tioga':    SystemParams(64, 8, 'flux'),
 }
 
 # Detect system

From a75fbad75644af60eb40927fd773e1772d562c99 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson" <benson31@llnl.gov>
Date: Fri, 11 Nov 2022 08:54:05 -0800
Subject: [PATCH 4/6] update miopen typedefs/default values

---
 include/lbann/utils/dnn_lib/miopen.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/lbann/utils/dnn_lib/miopen.hpp b/include/lbann/utils/dnn_lib/miopen.hpp
index a100e6c840f..fb99b23a146 100644
--- a/include/lbann/utils/dnn_lib/miopen.hpp
+++ b/include/lbann/utils/dnn_lib/miopen.hpp
@@ -82,7 +82,7 @@ using dnnHandle_t = miopenHandle_t;
 using dnnDataType_t = miopenDataType_t;
 using dnnTensorDescriptor_t = miopenTensorDescriptor_t;
 using dnnFilterDescriptor_t = miopenTensorDescriptor_t;
-using dnnTensorFormat_t = int;
+using dnnTensorFormat_t = miopenTensorLayout_t;
 using dnnDropoutDescriptor_t = miopenDropoutDescriptor_t;
 using dnnRNGType_t = miopenRNGType_t;
 using dnnRNNDescriptor_t = miopenRNNDescriptor_t;
@@ -110,7 +110,7 @@ using dnnConvolutionBwdFilterAlgo_t = miopenConvBwdWeightsAlgorithm_t;
 constexpr dnnConvolutionMode_t DNN_CROSS_CORRELATION = miopenConvolution;
 constexpr dnnNanPropagation_t DNN_PROPAGATE_NAN = MIOPEN_PROPAGATE_NAN;
 constexpr dnnMathType_t DNN_DEFAULT_MATH = 0;
-constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = 0;
+constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = miopenTensorNCHW;
 constexpr dnnRNGType_t DNN_RNG_PSEUDO_XORWOW = MIOPEN_RNG_PSEUDO_XORWOW;
 constexpr dnnLRNMode_t DNN_LRN_CROSS_CHANNEL = miopenLRNCrossChannel;
 constexpr dnnMathType_t DNN_TENSOR_OP_MATH_ALLOW_CONVERSION = -1; // not supported with ROCm

From 547814ea9101b2ee8b9d2c5972409069234faf0a Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson" <benson31@llnl.gov>
Date: Fri, 11 Nov 2022 08:55:31 -0800
Subject: [PATCH 5/6] Update filepaths for distconv

---
 include/lbann/layers/activations/relu.hpp | 2 +-
 include/lbann/utils/distconv.hpp          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp
index 2bbd983c48d..2cd2eb79dde 100644
--- a/include/lbann/layers/activations/relu.hpp
+++ b/include/lbann/layers/activations/relu.hpp
@@ -31,7 +31,7 @@
 #include "lbann/utils/distconv.hpp"
 
 #ifdef LBANN_HAS_DISTCONV
-#include "distconv/cudnn/relu.hpp"
+#include "distconv/dnn_backend/relu.hpp"
 #endif
 
 namespace lbann {
diff --git a/include/lbann/utils/distconv.hpp b/include/lbann/utils/distconv.hpp
index dfb8fb03dee..a3606e689cb 100644
--- a/include/lbann/utils/distconv.hpp
+++ b/include/lbann/utils/distconv.hpp
@@ -39,7 +39,7 @@
 #define DISTCONV_DEBUG
 #endif
 
-#include "distconv/cudnn/backend.hpp"
+#include "distconv/dnn_backend/backend.hpp"
 #include "distconv/distconv.hpp"
 #include "distconv/tensor/algorithms.hpp"
 #include "distconv/tensor/shuffle_mpi.hpp"

From 47c6aca107b4cd6fd903d99ae0dc6e4a04d45986 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson" <benson31@llnl.gov>
Date: Fri, 11 Nov 2022 16:06:58 -0800
Subject: [PATCH 6/6] remove debugging

---
 src/callbacks/check_gradients.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp
index 673d0347dae..2cf1a3cfbc7 100644
--- a/src/callbacks/check_gradients.cpp
+++ b/src/callbacks/check_gradients.cpp
@@ -133,13 +133,9 @@ struct CheckWeightsFunctor : DefaultErrorReporter
     // Get weights matrix and gradient
     const auto& weights_matrix = dtw.get_values();
     const auto& gradient = dtw.get_optimizer()->get_gradient();
-    // std::cout << "*** CHECKING WEIGHTS: NAME=" << dtw.get_name() << ", SIZE="
-    // << weights_matrix.Height() << "x" << weights_matrix.Width() << std::endl;
     // Iterate through weights matrix entries
     for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
       for (El::Int row = 0; row < weights_matrix.Height(); ++row) {
-        // std::cout << "*** CHECKING WEIGHTS (name=" << dtw.get_name() << "):
-        // row=" << row << ", col=" << col << std::endl;
         const bool weight_is_local = weights_matrix.IsLocal(row, col);
         const El::Int local_row = (weight_is_local
                                    ? weights_matrix.LocalRow(row)