From 9b380f9b227ef6e0d034f946b403e832297cc6b8 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 14 Oct 2022 09:17:49 -0700 Subject: [PATCH 1/6] Updates to LBANN to support ROCm-enabled DistConv --- CMakeLists.txt | 22 +++++----- include/lbann/layers/activations/relu.hpp | 4 ++ .../learning/distconv/distconv_layers.hpp | 18 ++++----- .../lbann/layers/transform/concatenate.hpp | 4 +- include/lbann/layers/transform/pooling.hpp | 40 ++++++++++++++++--- include/lbann/utils/distconv.hpp | 14 ++++++- src/callbacks/check_gradients.cpp | 3 +- src/layers/data_type_distconv_adapter.cpp | 16 ++++---- src/layers/io/input_layer.cpp | 8 +++- .../learning/distconv/distconv_layers.cpp | 4 +- src/layers/transform/split.cu | 10 ++--- src/layers/transform/sum.cu | 10 ++--- src/utils/distconv.cpp | 18 +++++---- 13 files changed, 112 insertions(+), 59 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d2a5cc7565..2b3575bd2e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,9 +219,19 @@ find_package(Hydrogen 1.5.0 CONFIG REQUIRED) message(STATUS "Found Hydrogen@${HYDROGEN_VERSION}: ${Hydrogen_DIR}") set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND}) +# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA, +# LBANN gets CUDA. +set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA}) +set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA}) + +set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM}) +if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM) + set(LBANN_HAS_GPU TRUE) +endif () + # DiHydrogen and Distconv if (LBANN_WITH_DISTCONV) - find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns DistConv) + find_package(DiHydrogen 0.3.0 CONFIG REQUIRED COMPONENTS Meta Patterns DistConv) set(LBANN_HAS_DISTCONV TRUE) set(LBANN_H2_LIBS H2::H2Meta @@ -292,16 +302,6 @@ if (LBANN_WITH_ONEDNN) endif () endif () -# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA, -# LBANN gets CUDA. -set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA}) -set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA}) - -set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM}) -if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM) - set(LBANN_HAS_GPU TRUE) -endif () - # Only used if have GPU and have CPU half. if (LBANN_HAS_GPU AND LBANN_HAS_HALF) set(LBANN_HAS_GPU_FP16 ${HYDROGEN_GPU_USE_FP16}) diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp index 3f1ecb9d3f2..2bbd983c48d 100644 --- a/include/lbann/layers/activations/relu.hpp +++ b/include/lbann/layers/activations/relu.hpp @@ -30,6 +30,10 @@ #include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/distconv.hpp" +#ifdef LBANN_HAS_DISTCONV +#include "distconv/cudnn/relu.hpp" +#endif + namespace lbann { #ifdef LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/learning/distconv/distconv_layers.hpp b/include/lbann/layers/learning/distconv/distconv_layers.hpp index abecb77723e..d807268c0de 100644 --- a/include/lbann/layers/learning/distconv/distconv_layers.hpp +++ b/include/lbann/layers/learning/distconv/distconv_layers.hpp @@ -48,25 +48,25 @@ namespace distconv{ tensor::Tensor &output); template - int apply_bias(const tensor::Tensor &bias, + int apply_bias(const tensor::Tensor &bias, tensor::Tensor &output); - + template int backward_wrt_input( bool transpose_A, const tensor::Tensor &output_grad, const tensor::Tensor &linearity, tensor::Tensor &input_grad); - + template int backward_wrt_weight( bool transpose, DataType dst_scale, DataType gradient_scale, - const tensor::Tensor &input, + const tensor::Tensor &input, const tensor::Tensor &output_grad, tensor::Tensor &linearity_grad); - + template int backward_wrt_bias( DataType gradient_scale, @@ -80,21 +80,21 @@ namespace distconv{ template - tensor::Shape + tensor::Shape get_fc_output_local_tensor_shape(const tensor::Tensor &input, const int_vector &linearity_dims, bool transpose){ //https://github.com/LLNL/DiHydrogen/blob/7f86db1f9701ac3afb5e16aefdd57563d57a1698/legacy/include/distconv/distconv.hpp#L173 - //Get the input layer local tensor shape + //Get the input layer local tensor shape auto output_local_shape = input.get_local_shape(); output_local_shape[0] = transpose? linearity_dims[1] : linearity_dims[0]; return output_local_shape; } -extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, float>; -extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, double>; +extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, float>; +extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, double>; } // namespace distconv #endif // LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/transform/concatenate.hpp b/include/lbann/layers/transform/concatenate.hpp index e2c26082f66..355c23d3407 100644 --- a/include/lbann/layers/transform/concatenate.hpp +++ b/include/lbann/layers/transform/concatenate.hpp @@ -468,7 +468,7 @@ fp_compute() { dc::tensor::Concatenate(this->get_activations(0), this->get_prev_activations(0), this->get_prev_activations(1), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } template @@ -477,7 +477,7 @@ bp_compute() { dc::tensor::Slice(this->get_error_signals(0), this->get_error_signals(1), this->get_prev_error_signals(0), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } #endif // LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index 9a8e4765e3b..778c7772762 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_POOLING_HPP_INCLUDED #include "lbann/layers/data_type_layer.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/dim_helpers.hpp" #include "lbann/utils/dnn_enums.hpp" #ifdef LBANN_HAS_DNN_LIB @@ -69,7 +70,7 @@ class pooling_distconv_adapter : public data_type_distconv_adapter> m_pooling; }; @@ -295,12 +296,41 @@ class pooling_layer : public data_type_layer { if(this->using_gpus()) { #ifdef LBANN_HAS_DISTCONV if (this->distconv_enabled()) { - get_distconv_adapter().fp_compute(); + static size_t counter = 0; + ++counter; + const auto& mode + = this->m_model->get_execution_context().get_execution_mode(); + if (counter % 10000 == 0) { + std::cout << "** DC POOLING: " << counter + << " (this=" << this << ", mode=" << to_string(mode) << ") **" + << std::endl; + } + get_distconv_adapter().fp_compute(mode == execution_mode::training); return; } #endif // LBANN_HAS_DISTCONV + { + static size_t counter = 0; + ++counter; + if (counter % 10000 == 0) { + const auto& mode + = this->m_model->get_execution_context().get_execution_mode(); + std::cout << "** POOLING: " << ++counter + << " (this=" << this << ", mode=" << to_string(mode) << ") **" + << std::endl; + } + } fp_compute_dnn(); - } else { + } + else { + { + static size_t counter = 0; + const auto& mode + = this->m_model->get_execution_context().get_execution_mode(); + std::cout << "** IM2COL POOLING: " << ++counter + << " (this=" << this << ", mode=" << to_string(mode) << ") **" + << std::endl; + } fp_compute_im2col(); } } @@ -802,9 +832,9 @@ setup_layer(size_t workspace_capacity) { template void pooling_distconv_adapter:: -fp_compute() { +fp_compute(bool const training) { m_pooling->forward(El::To(1), this->get_prev_activations(), - El::To(0), this->get_activations()); + El::To(0), this->get_activations(), training); } template diff --git a/include/lbann/utils/distconv.hpp b/include/lbann/utils/distconv.hpp index f21f288c194..c21d719b990 100644 --- a/include/lbann/utils/distconv.hpp +++ b/include/lbann/utils/distconv.hpp @@ -40,6 +40,7 @@ #endif #include "distconv/distconv.hpp" +#include "distconv/cudnn/backend.hpp" #include "distconv/tensor/tensor_mpi_cuda.hpp" #include "distconv/tensor/shuffle_mpi.hpp" #include "distconv/tensor/shuffle_mpi_cuda.hpp" @@ -55,6 +56,15 @@ #include "lbann/layers/learning/distconv/distconv_layers.hpp" namespace lbann { +inline auto default_hydrogen_stream() +{ +#if H2_HAS_CUDA + return hydrogen::cuda::GetDefaultStream(); +#elif H2_HAS_ROCM + return hydrogen::rocm::GetDefaultStream(); +#endif +} + class Layer; namespace dc { @@ -109,7 +119,7 @@ using MPIRootPrintStreamInfo = ::distconv::util::MPIRootPrintStreamInfo; using MPIRootPrintStreamWaning = ::distconv::util::MPIRootPrintStreamWarning; // Distconv layer classes -using Backend = ::distconv::cudnn::BackendCUDNN; +using Backend = ::distconv::BackendDNNLib; using ReLU = ::distconv::ReLU; using LeakyReLU = ::distconv::LeakyReLU; template @@ -232,7 +242,7 @@ Dist get_hydrogen_data_parallel_distribution(int num_dims); template void dump_tensor(const Tensor &t, const std::string &path) { dc::MPIPrintStreamDebug() << "Dumping tensor to " << path; - cudaDeviceSynchronize(); + h2::gpu::sync(); distconv::dump_tensor(t, path, true); } diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp index d76856f50ee..44dc3fd91c1 100644 --- a/src/callbacks/check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -133,10 +133,11 @@ struct CheckWeightsFunctor : DefaultErrorReporter // Get weights matrix and gradient const auto& weights_matrix = dtw.get_values(); const auto& gradient = dtw.get_optimizer()->get_gradient(); - + //std::cout << "*** CHECKING WEIGHTS: NAME=" << dtw.get_name() << ", SIZE=" << weights_matrix.Height() << "x" << weights_matrix.Width() << std::endl; // Iterate through weights matrix entries for (El::Int col = 0; col < weights_matrix.Width(); ++col) { for (El::Int row = 0; row < weights_matrix.Height(); ++row) { + //std::cout << "*** CHECKING WEIGHTS (name=" << dtw.get_name() << "): row=" << row << ", col=" << col << std::endl; const bool weight_is_local = weights_matrix.IsLocal(row, col); const El::Int local_row = (weight_is_local ? weights_matrix.LocalRow(row) diff --git a/src/layers/data_type_distconv_adapter.cpp b/src/layers/data_type_distconv_adapter.cpp index 00763b58c8a..4581a915e54 100644 --- a/src/layers/data_type_distconv_adapter.cpp +++ b/src/layers/data_type_distconv_adapter.cpp @@ -332,7 +332,7 @@ setup_prev_activations_i(int index) const { const dc::LocaleMPI loc(dc::get_mpi_comm(), false); t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); } else { // Create a shallow copy const auto &parent_activations = @@ -427,7 +427,7 @@ setup_activations_i(int index) const { const auto local_shape = get_activations_local_shape(index); auto t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); return t; } @@ -483,7 +483,7 @@ setup_prev_error_signals_i(int index) const { const dc::LocaleMPI loc(dc::get_mpi_comm(), false); t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); } else { // Create a shallow copy const auto &child_error_signals = @@ -550,7 +550,7 @@ setup_error_signals_i(int index) const { const auto local_shape = get_error_signals_local_shape(index); auto t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); return t; } @@ -799,7 +799,7 @@ void data_type_distconv_adapter::ensu shuffler.shuffle_forward( get_original_prev_activations().get_const_base_ptr(), get_prev_activations().get_base_ptr(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } } @@ -821,7 +821,7 @@ void data_type_distconv_adapter::copy shuffler.shuffle_forward( get_activations().get_const_base_ptr(), get_original_activations().get_base_ptr(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } } @@ -848,7 +848,7 @@ void data_type_distconv_adapter::ensu shuffler.shuffle_forward( get_original_prev_error_signals(i).get_const_base_ptr(), get_prev_error_signals(i).get_base_ptr(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } } @@ -871,7 +871,7 @@ void data_type_distconv_adapter::copy shuffler.shuffle_forward( get_error_signals(i).get_const_base_ptr(), get_original_error_signals(i).get_base_ptr(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } } diff --git a/src/layers/io/input_layer.cpp b/src/layers/io/input_layer.cpp index 68664436ff2..dd267e692b2 100644 --- a/src/layers/io/input_layer.cpp +++ b/src/layers/io/input_layer.cpp @@ -304,7 +304,11 @@ void input_distconv_adapter::setup_fp_tensors() { // only specialized for BaseAllocator. size_t buf_size = m_host_tensor->get_local_real_size() * sizeof(TensorDataType); TensorDataType *buf = nullptr; +#if H2_HAS_CUDA CHECK_CUDA(cudaMallocHost(&buf, buf_size)); +#elif H2_HAS_ROCM + CHECK_ROCM(hipHostMalloc(&buf, buf_size)); +#endif // Note buf should be deallocated. dc::tensor::View(*m_host_tensor, buf); setup_shuffler_buffers(*m_original_host_tensor, @@ -338,7 +342,7 @@ setup_activations_i(int index) const { const auto local_shape = get_activations_local_shape(index); auto t = std::make_unique(shape, loc, dist, local_shape); assert0(t->allocate()); - t->zero(hydrogen::cuda::GetDefaultStream()); + t->zero(default_hydrogen_stream()); return t; } else { @@ -434,7 +438,7 @@ template ::fp_compute() { auto &l = dynamic_cast&>(this->layer()); - auto stream = hydrogen::cuda::GetDefaultStream(); + auto stream = default_hydrogen_stream(); // Note that the mini-batch size of the data reader is not // actually the one for the current mini-batch as the mini-batch // index is already updated by fp_compute. diff --git a/src/layers/learning/distconv/distconv_layers.cpp b/src/layers/learning/distconv/distconv_layers.cpp index 92dd171dd35..22fa9f4df90 100644 --- a/src/layers/learning/distconv/distconv_layers.cpp +++ b/src/layers/learning/distconv/distconv_layers.cpp @@ -331,7 +331,7 @@ namespace distconv{ const tensor::Tensor &output_gradient, \ tensor::Tensor &bias_gradient); -ETI(float, cudnn::BackendCUDNN) -ETI(double, cudnn::BackendCUDNN) +ETI(float, BackendDNNLib) +ETI(double, BackendDNNLib) #undef ETI } // namespace distconv diff --git a/src/layers/transform/split.cu b/src/layers/transform/split.cu index ca212564f89..2f2c881e49d 100644 --- a/src/layers/transform/split.cu +++ b/src/layers/transform/split.cu @@ -71,29 +71,29 @@ void split_distconv_adapter::bp_compute() { auto &error_signals = this->get_error_signals(0); switch (this->layer().get_num_children()) { case 0: - error_signals.zero(hydrogen::cuda::GetDefaultStream()); + error_signals.zero(default_hydrogen_stream()); break; case 1: dc::tensor::Copy(error_signals, this->get_prev_error_signals(0), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); break; case 2: dc::tensor::Transform(error_signals, this->get_prev_error_signals(0), this->get_prev_error_signals(1), sum_op(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); break; default: dc::tensor::Copy(error_signals, this->get_prev_error_signals(1), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); for (int i = 1; i < this->layer().get_num_children(); ++i) { const auto &prev_error = this->get_prev_error_signals(i); dc::tensor::Transform(error_signals, prev_error, accumulate_op(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } } return; diff --git a/src/layers/transform/sum.cu b/src/layers/transform/sum.cu index c9ea29b76e4..578d357a16f 100644 --- a/src/layers/transform/sum.cu +++ b/src/layers/transform/sum.cu @@ -67,11 +67,11 @@ void sum_distconv_adapter::fp_compute() { auto &activations = this->get_activations(); switch (this->layer().get_num_parents()) { case 0: - activations.zero(hydrogen::cuda::GetDefaultStream()); + activations.zero(default_hydrogen_stream()); break; case 1: dc::tensor::Copy(activations, this->get_prev_activations(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); break; case 2: // Optimization for layers with 2 parents (e.g., @@ -82,7 +82,7 @@ void sum_distconv_adapter::fp_compute() { this->get_prev_activations(0), this->get_prev_activations(1), sum_op(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); break; default: for (int i = 0; i < this->layer().get_num_parents(); ++i) { @@ -90,11 +90,11 @@ void sum_distconv_adapter::fp_compute() { prev_activations.set_outermost_dimension(activations.get_shape()[-1]); if (i == 0) { dc::tensor::Copy(activations, prev_activations, - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } else { distconv::tensor::Transform(activations, prev_activations, accumulate_op(), - hydrogen::cuda::GetDefaultStream()); + default_hydrogen_stream()); } } } diff --git a/src/utils/distconv.cpp b/src/utils/distconv.cpp index 49fc966aef8..723c7749d3a 100644 --- a/src/utils/distconv.cpp +++ b/src/utils/distconv.cpp @@ -209,7 +209,7 @@ TensorDataType *get_shuffler_src_buf(const TensorDev &tensor) { shuffler_src_buf_size = TensorShuffler::get_buf_size(tensor); MPIPrintStreamDebug() << "Allocating shared shuffler buffer of size " << shuffler_src_buf_size; - DISTCONV_CUDA_MALLOC(&shuffler_src_buf, shuffler_src_buf_size); + DISTCONV_GPU_MALLOC(&shuffler_src_buf, shuffler_src_buf_size); } // Returns the pre-allocated memory if it's large enough size_t required_size = TensorShuffler::get_buf_size(tensor); @@ -228,7 +228,7 @@ TensorDataType *get_shuffler_dst_buf(const TensorDev &tensor) { shuffler_dst_buf_size = TensorShuffler::get_buf_size(tensor); MPIPrintStreamDebug() << "Allocating shared shuffler buffer of size " << shuffler_src_buf_size; - DISTCONV_CUDA_MALLOC(&shuffler_dst_buf, shuffler_dst_buf_size); + DISTCONV_GPU_MALLOC(&shuffler_dst_buf, shuffler_dst_buf_size); } size_t required_size = TensorShuffler::get_buf_size(tensor); // Returns the pre-allocated memory if it's large enough @@ -241,11 +241,11 @@ TensorDataType *get_shuffler_dst_buf(const TensorDev &tensor) { } void delete_shuffler_buffers() { if (shuffler_src_buf) { - CHECK_CUDA(cudaFree(shuffler_src_buf)); + DISTCONV_CHECK_GPU(GPU_FREE(shuffler_src_buf)); shuffler_src_buf = nullptr; } if (shuffler_dst_buf) { - CHECK_CUDA(cudaFree(shuffler_dst_buf)); + DISTCONV_CHECK_GPU(GPU_FREE(shuffler_dst_buf)); shuffler_dst_buf = nullptr; } } @@ -291,12 +291,12 @@ void initialize(MPI_Comm comm) { p2p_instance = new p2p::P2P(mpi_comm); #endif // DISTCONV_HAS_P2P hosttransfer_comm_instance = new Al::hosttransfer_backend::comm_type( - mpi_comm, hydrogen::cuda::GetDefaultStream()); - ::distconv::cudnn::Options backend_opts; + mpi_comm, default_hydrogen_stream()); + ::distconv::backend::Options backend_opts; backend_opts.m_deterministic = opt_deterministic; backend_instance = new Backend( mpi_comm, lbann::dnn_lib::get_handle(), - hydrogen::cuda::GetDefaultStream(), backend_opts); + default_hydrogen_stream(), backend_opts); print_options(std::cout); initialized = true; } @@ -454,7 +454,11 @@ Dist get_hydrogen_data_parallel_distribution(int num_dims) { size_t get_workspace_capacity() { size_t available, total; +#if H2_HAS_CUDA FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total)); +#elif H2_HAS_ROCM + FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total)); +#endif size_t workspace_capacity = available; // set aside some space for shuffling, halo exchange, etc. workspace_capacity -= 1 << 28; From 9b258f2c5441dae2cb4410b2910880fc71262693 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 14 Oct 2022 09:24:38 -0700 Subject: [PATCH 2/6] Clean up debugging output; clang-format the diff to develop. --- .../learning/distconv/distconv_layers.hpp | 29 ++++++------ include/lbann/layers/transform/pooling.hpp | 45 +++++-------------- include/lbann/utils/distconv.hpp | 6 +-- python/lbann/contrib/lc/systems.py | 2 +- src/callbacks/check_gradients.cpp | 6 ++- src/layers/data_type_distconv_adapter.cpp | 26 +++++------ .../learning/distconv/distconv_layers.cpp | 4 +- src/layers/transform/split.cu | 3 +- src/layers/transform/sum.cu | 9 ++-- src/utils/distconv.cpp | 12 ++--- 10 files changed, 65 insertions(+), 77 deletions(-) diff --git a/include/lbann/layers/learning/distconv/distconv_layers.hpp b/include/lbann/layers/learning/distconv/distconv_layers.hpp index d807268c0de..b5c31fc4544 100644 --- a/include/lbann/layers/learning/distconv/distconv_layers.hpp +++ b/include/lbann/layers/learning/distconv/distconv_layers.hpp @@ -48,8 +48,9 @@ namespace distconv{ tensor::Tensor &output); template - int apply_bias(const tensor::Tensor &bias, - tensor::Tensor &output); + int apply_bias( + const tensor::Tensor& bias, + tensor::Tensor& output); template int backward_wrt_input( @@ -63,9 +64,9 @@ namespace distconv{ bool transpose, DataType dst_scale, DataType gradient_scale, - const tensor::Tensor &input, - const tensor::Tensor &output_grad, - tensor::Tensor &linearity_grad); + const tensor::Tensor& input, + const tensor::Tensor& output_grad, + tensor::Tensor& linearity_grad); template int backward_wrt_bias( @@ -78,23 +79,25 @@ namespace distconv{ Backend &m_be; }; // class definition ChannelwiseFullyConnected - template - tensor::Shape - get_fc_output_local_tensor_shape(const tensor::Tensor &input, - const int_vector &linearity_dims, - bool transpose){ + tensor::Shape get_fc_output_local_tensor_shape( + const tensor::Tensor& input, + const int_vector& linearity_dims, + bool transpose) + { //https://github.com/LLNL/DiHydrogen/blob/7f86db1f9701ac3afb5e16aefdd57563d57a1698/legacy/include/distconv/distconv.hpp#L173 - //Get the input layer local tensor shape + // Get the input layer local tensor shape auto output_local_shape = input.get_local_shape(); output_local_shape[0] = transpose? linearity_dims[1] : linearity_dims[0]; return output_local_shape; } -extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, float>; -extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, double>; + extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, + float>; + extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib, + double>; } // namespace distconv #endif // LBANN_HAS_DISTCONV diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index 778c7772762..6b2644dff01 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -70,7 +70,8 @@ class pooling_distconv_adapter : public data_type_distconv_adapter> m_pooling; }; @@ -296,41 +297,15 @@ class pooling_layer : public data_type_layer { if(this->using_gpus()) { #ifdef LBANN_HAS_DISTCONV if (this->distconv_enabled()) { - static size_t counter = 0; - ++counter; - const auto& mode - = this->m_model->get_execution_context().get_execution_mode(); - if (counter % 10000 == 0) { - std::cout << "** DC POOLING: " << counter - << " (this=" << this << ", mode=" << to_string(mode) << ") **" - << std::endl; - } + const auto& mode = + this->m_model->get_execution_context().get_execution_mode(); get_distconv_adapter().fp_compute(mode == execution_mode::training); return; } #endif // LBANN_HAS_DISTCONV - { - static size_t counter = 0; - ++counter; - if (counter % 10000 == 0) { - const auto& mode - = this->m_model->get_execution_context().get_execution_mode(); - std::cout << "** POOLING: " << ++counter - << " (this=" << this << ", mode=" << to_string(mode) << ") **" - << std::endl; - } - } fp_compute_dnn(); } else { - { - static size_t counter = 0; - const auto& mode - = this->m_model->get_execution_context().get_execution_mode(); - std::cout << "** IM2COL POOLING: " << ++counter - << " (this=" << this << ", mode=" << to_string(mode) << ") **" - << std::endl; - } fp_compute_im2col(); } } @@ -831,10 +806,14 @@ setup_layer(size_t workspace_capacity) { } template -void pooling_distconv_adapter:: -fp_compute(bool const training) { - m_pooling->forward(El::To(1), this->get_prev_activations(), - El::To(0), this->get_activations(), training); +void pooling_distconv_adapter::fp_compute( + bool const training) +{ + m_pooling->forward(El::To(1), + this->get_prev_activations(), + El::To(0), + this->get_activations(), + training); } template diff --git a/include/lbann/utils/distconv.hpp b/include/lbann/utils/distconv.hpp index c21d719b990..dfb8fb03dee 100644 --- a/include/lbann/utils/distconv.hpp +++ b/include/lbann/utils/distconv.hpp @@ -39,13 +39,13 @@ #define DISTCONV_DEBUG #endif -#include "distconv/distconv.hpp" #include "distconv/cudnn/backend.hpp" -#include "distconv/tensor/tensor_mpi_cuda.hpp" +#include "distconv/distconv.hpp" +#include "distconv/tensor/algorithms.hpp" #include "distconv/tensor/shuffle_mpi.hpp" #include "distconv/tensor/shuffle_mpi_cuda.hpp" #include "distconv/tensor/shuffle_mpi_cuda_al.hpp" -#include "distconv/tensor/algorithms.hpp" +#include "distconv/tensor/tensor_mpi_cuda.hpp" #include "distconv/util/util.hpp" #ifdef DISTCONV_HAS_P2P #include "p2p/p2p.hpp" diff --git a/python/lbann/contrib/lc/systems.py b/python/lbann/contrib/lc/systems.py index 08cec5a82d7..bee6bb0f313 100644 --- a/python/lbann/contrib/lc/systems.py +++ b/python/lbann/contrib/lc/systems.py @@ -25,7 +25,7 @@ def __init__(self, cores_per_node, gpus_per_node, scheduler): 'sierra': SystemParams(44, 4, 'lsf'), 'rzansel': SystemParams(44, 4, 'lsf'), 'rzhasgpu': SystemParams(16, 2, 'slurm'), - 'tioga': SystemParams(64, 8, 'flux'), + 'tioga': SystemParams(64, 4, 'flux'), } # Detect system diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp index 44dc3fd91c1..673d0347dae 100644 --- a/src/callbacks/check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -133,11 +133,13 @@ struct CheckWeightsFunctor : DefaultErrorReporter // Get weights matrix and gradient const auto& weights_matrix = dtw.get_values(); const auto& gradient = dtw.get_optimizer()->get_gradient(); - //std::cout << "*** CHECKING WEIGHTS: NAME=" << dtw.get_name() << ", SIZE=" << weights_matrix.Height() << "x" << weights_matrix.Width() << std::endl; + // std::cout << "*** CHECKING WEIGHTS: NAME=" << dtw.get_name() << ", SIZE=" + // << weights_matrix.Height() << "x" << weights_matrix.Width() << std::endl; // Iterate through weights matrix entries for (El::Int col = 0; col < weights_matrix.Width(); ++col) { for (El::Int row = 0; row < weights_matrix.Height(); ++row) { - //std::cout << "*** CHECKING WEIGHTS (name=" << dtw.get_name() << "): row=" << row << ", col=" << col << std::endl; + // std::cout << "*** CHECKING WEIGHTS (name=" << dtw.get_name() << "): + // row=" << row << ", col=" << col << std::endl; const bool weight_is_local = weights_matrix.IsLocal(row, col); const El::Int local_row = (weight_is_local ? weights_matrix.LocalRow(row) diff --git a/src/layers/data_type_distconv_adapter.cpp b/src/layers/data_type_distconv_adapter.cpp index 4581a915e54..55684aaf81a 100644 --- a/src/layers/data_type_distconv_adapter.cpp +++ b/src/layers/data_type_distconv_adapter.cpp @@ -797,9 +797,9 @@ void data_type_distconv_adapter::ensu get_original_prev_activations(), get_prev_activations()); shuffler.shuffle_forward( - get_original_prev_activations().get_const_base_ptr(), - get_prev_activations().get_base_ptr(), - default_hydrogen_stream()); + get_original_prev_activations().get_const_base_ptr(), + get_prev_activations().get_base_ptr(), + default_hydrogen_stream()); } } @@ -818,10 +818,9 @@ void data_type_distconv_adapter::copy auto &shuffler = get_activations_shuffler( get_activations(), get_original_activations()); - shuffler.shuffle_forward( - get_activations().get_const_base_ptr(), - get_original_activations().get_base_ptr(), - default_hydrogen_stream()); + shuffler.shuffle_forward(get_activations().get_const_base_ptr(), + get_original_activations().get_base_ptr(), + default_hydrogen_stream()); } } @@ -846,9 +845,9 @@ void data_type_distconv_adapter::ensu get_original_prev_error_signals(i), get_prev_error_signals(i)); shuffler.shuffle_forward( - get_original_prev_error_signals(i).get_const_base_ptr(), - get_prev_error_signals(i).get_base_ptr(), - default_hydrogen_stream()); + get_original_prev_error_signals(i).get_const_base_ptr(), + get_prev_error_signals(i).get_base_ptr(), + default_hydrogen_stream()); } } @@ -868,10 +867,9 @@ void data_type_distconv_adapter::copy auto &shuffler = get_error_signals_shuffler( get_error_signals(i), get_original_error_signals(i)); - shuffler.shuffle_forward( - get_error_signals(i).get_const_base_ptr(), - get_original_error_signals(i).get_base_ptr(), - default_hydrogen_stream()); + shuffler.shuffle_forward(get_error_signals(i).get_const_base_ptr(), + get_original_error_signals(i).get_base_ptr(), + default_hydrogen_stream()); } } diff --git a/src/layers/learning/distconv/distconv_layers.cpp b/src/layers/learning/distconv/distconv_layers.cpp index 22fa9f4df90..6da0a9e546c 100644 --- a/src/layers/learning/distconv/distconv_layers.cpp +++ b/src/layers/learning/distconv/distconv_layers.cpp @@ -331,7 +331,7 @@ namespace distconv{ const tensor::Tensor &output_gradient, \ tensor::Tensor &bias_gradient); -ETI(float, BackendDNNLib) -ETI(double, BackendDNNLib) + ETI(float, BackendDNNLib) + ETI(double, BackendDNNLib) #undef ETI } // namespace distconv diff --git a/src/layers/transform/split.cu b/src/layers/transform/split.cu index 2f2c881e49d..b9f8ecdcb89 100644 --- a/src/layers/transform/split.cu +++ b/src/layers/transform/split.cu @@ -91,7 +91,8 @@ void split_distconv_adapter::bp_compute() { default_hydrogen_stream()); for (int i = 1; i < this->layer().get_num_children(); ++i) { const auto &prev_error = this->get_prev_error_signals(i); - dc::tensor::Transform(error_signals, prev_error, + dc::tensor::Transform(error_signals, + prev_error, accumulate_op(), default_hydrogen_stream()); } diff --git a/src/layers/transform/sum.cu b/src/layers/transform/sum.cu index 578d357a16f..d0ee8adf1c8 100644 --- a/src/layers/transform/sum.cu +++ b/src/layers/transform/sum.cu @@ -70,7 +70,8 @@ void sum_distconv_adapter::fp_compute() { activations.zero(default_hydrogen_stream()); break; case 1: - dc::tensor::Copy(activations, this->get_prev_activations(), + dc::tensor::Copy(activations, + this->get_prev_activations(), default_hydrogen_stream()); break; case 2: @@ -89,10 +90,12 @@ void sum_distconv_adapter::fp_compute() { auto &prev_activations = this->get_prev_activations(i); prev_activations.set_outermost_dimension(activations.get_shape()[-1]); if (i == 0) { - dc::tensor::Copy(activations, prev_activations, + dc::tensor::Copy(activations, + prev_activations, default_hydrogen_stream()); } else { - distconv::tensor::Transform(activations, prev_activations, + distconv::tensor::Transform(activations, + prev_activations, accumulate_op(), default_hydrogen_stream()); } diff --git a/src/utils/distconv.cpp b/src/utils/distconv.cpp index 723c7749d3a..d89f9da5052 100644 --- a/src/utils/distconv.cpp +++ b/src/utils/distconv.cpp @@ -290,13 +290,15 @@ void initialize(MPI_Comm comm) { #ifdef DISTCONV_HAS_P2P p2p_instance = new p2p::P2P(mpi_comm); #endif // DISTCONV_HAS_P2P - hosttransfer_comm_instance = new Al::hosttransfer_backend::comm_type( - mpi_comm, default_hydrogen_stream()); + hosttransfer_comm_instance = + new Al::hosttransfer_backend::comm_type(mpi_comm, + default_hydrogen_stream()); ::distconv::backend::Options backend_opts; backend_opts.m_deterministic = opt_deterministic; - backend_instance = new Backend( - mpi_comm, lbann::dnn_lib::get_handle(), - default_hydrogen_stream(), backend_opts); + backend_instance = new Backend(mpi_comm, + lbann::dnn_lib::get_handle(), + default_hydrogen_stream(), + backend_opts); print_options(std::cout); initialized = true; } From 102729ab9861792ab5956ddea3eef9c3a6f3d632 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Tue, 18 Oct 2022 08:51:06 -0700 Subject: [PATCH 3/6] Undo accidentally-committed debugging changes --- python/lbann/contrib/lc/systems.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lbann/contrib/lc/systems.py b/python/lbann/contrib/lc/systems.py index bee6bb0f313..08cec5a82d7 100644 --- a/python/lbann/contrib/lc/systems.py +++ b/python/lbann/contrib/lc/systems.py @@ -25,7 +25,7 @@ def __init__(self, cores_per_node, gpus_per_node, scheduler): 'sierra': SystemParams(44, 4, 'lsf'), 'rzansel': SystemParams(44, 4, 'lsf'), 'rzhasgpu': SystemParams(16, 2, 'slurm'), - 'tioga': SystemParams(64, 4, 'flux'), + 'tioga': SystemParams(64, 8, 'flux'), } # Detect system From a75fbad75644af60eb40927fd773e1772d562c99 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 11 Nov 2022 08:54:05 -0800 Subject: [PATCH 4/6] update miopen typedefs/default values --- include/lbann/utils/dnn_lib/miopen.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/lbann/utils/dnn_lib/miopen.hpp b/include/lbann/utils/dnn_lib/miopen.hpp index a100e6c840f..fb99b23a146 100644 --- a/include/lbann/utils/dnn_lib/miopen.hpp +++ b/include/lbann/utils/dnn_lib/miopen.hpp @@ -82,7 +82,7 @@ using dnnHandle_t = miopenHandle_t; using dnnDataType_t = miopenDataType_t; using dnnTensorDescriptor_t = miopenTensorDescriptor_t; using dnnFilterDescriptor_t = miopenTensorDescriptor_t; -using dnnTensorFormat_t = int; +using dnnTensorFormat_t = miopenTensorLayout_t; using dnnDropoutDescriptor_t = miopenDropoutDescriptor_t; using dnnRNGType_t = miopenRNGType_t; using dnnRNNDescriptor_t = miopenRNNDescriptor_t; @@ -110,7 +110,7 @@ using dnnConvolutionBwdFilterAlgo_t = miopenConvBwdWeightsAlgorithm_t; constexpr dnnConvolutionMode_t DNN_CROSS_CORRELATION = miopenConvolution; constexpr dnnNanPropagation_t DNN_PROPAGATE_NAN = MIOPEN_PROPAGATE_NAN; constexpr dnnMathType_t DNN_DEFAULT_MATH = 0; -constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = 0; +constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = miopenTensorNCHW; constexpr dnnRNGType_t DNN_RNG_PSEUDO_XORWOW = MIOPEN_RNG_PSEUDO_XORWOW; constexpr dnnLRNMode_t DNN_LRN_CROSS_CHANNEL = miopenLRNCrossChannel; constexpr dnnMathType_t DNN_TENSOR_OP_MATH_ALLOW_CONVERSION = -1; // not supported with ROCm From 547814ea9101b2ee8b9d2c5972409069234faf0a Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 11 Nov 2022 08:55:31 -0800 Subject: [PATCH 5/6] Update filepaths for distconv --- include/lbann/layers/activations/relu.hpp | 2 +- include/lbann/utils/distconv.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp index 2bbd983c48d..2cd2eb79dde 100644 --- a/include/lbann/layers/activations/relu.hpp +++ b/include/lbann/layers/activations/relu.hpp @@ -31,7 +31,7 @@ #include "lbann/utils/distconv.hpp" #ifdef LBANN_HAS_DISTCONV -#include "distconv/cudnn/relu.hpp" +#include "distconv/dnn_backend/relu.hpp" #endif namespace lbann { diff --git a/include/lbann/utils/distconv.hpp b/include/lbann/utils/distconv.hpp index dfb8fb03dee..a3606e689cb 100644 --- a/include/lbann/utils/distconv.hpp +++ b/include/lbann/utils/distconv.hpp @@ -39,7 +39,7 @@ #define DISTCONV_DEBUG #endif -#include "distconv/cudnn/backend.hpp" +#include "distconv/dnn_backend/backend.hpp" #include "distconv/distconv.hpp" #include "distconv/tensor/algorithms.hpp" #include "distconv/tensor/shuffle_mpi.hpp" From 47c6aca107b4cd6fd903d99ae0dc6e4a04d45986 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 11 Nov 2022 16:06:58 -0800 Subject: [PATCH 6/6] remove debugging --- src/callbacks/check_gradients.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp index 673d0347dae..2cf1a3cfbc7 100644 --- a/src/callbacks/check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -133,13 +133,9 @@ struct CheckWeightsFunctor : DefaultErrorReporter // Get weights matrix and gradient const auto& weights_matrix = dtw.get_values(); const auto& gradient = dtw.get_optimizer()->get_gradient(); - // std::cout << "*** CHECKING WEIGHTS: NAME=" << dtw.get_name() << ", SIZE=" - // << weights_matrix.Height() << "x" << weights_matrix.Width() << std::endl; // Iterate through weights matrix entries for (El::Int col = 0; col < weights_matrix.Width(); ++col) { for (El::Int row = 0; row < weights_matrix.Height(); ++row) { - // std::cout << "*** CHECKING WEIGHTS (name=" << dtw.get_name() << "): - // row=" << row << ", col=" << col << std::endl; const bool weight_is_local = weights_matrix.IsLocal(row, col); const El::Int local_row = (weight_is_local ? weights_matrix.LocalRow(row)