Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,19 @@ find_package(Hydrogen 1.5.0 CONFIG REQUIRED)
message(STATUS "Found Hydrogen@${HYDROGEN_VERSION}: ${Hydrogen_DIR}")
set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND})

# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA,
# LBANN gets CUDA.
set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA})
set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA})

set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM})
if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM)
set(LBANN_HAS_GPU TRUE)
endif ()

# DiHydrogen and Distconv
if (LBANN_WITH_DISTCONV)
find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns DistConv)
find_package(DiHydrogen 0.3.0 CONFIG REQUIRED COMPONENTS Meta Patterns DistConv)
set(LBANN_HAS_DISTCONV TRUE)
set(LBANN_H2_LIBS
H2::H2Meta
Expand Down Expand Up @@ -292,16 +302,6 @@ if (LBANN_WITH_ONEDNN)
endif ()
endif ()

# CUDA-ness of LBANN is 1:1 with Hydrogen. Iff Hydrogen has CUDA,
# LBANN gets CUDA.
set(LBANN_HAS_CUDA ${_HYDROGEN_HAVE_CUDA})
set(LBANN_WITH_CUDA ${LBANN_HAS_CUDA})

set(LBANN_HAS_ROCM ${_HYDROGEN_HAVE_ROCM})
if (LBANN_HAS_CUDA OR LBANN_HAS_ROCM)
set(LBANN_HAS_GPU TRUE)
endif ()

# Only used if have GPU and have CPU half.
if (LBANN_HAS_GPU AND LBANN_HAS_HALF)
set(LBANN_HAS_GPU_FP16 ${HYDROGEN_GPU_USE_FP16})
Expand Down
4 changes: 4 additions & 0 deletions include/lbann/layers/activations/relu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@
#include "lbann/layers/data_type_layer.hpp"
#include "lbann/utils/distconv.hpp"

#ifdef LBANN_HAS_DISTCONV
#include "distconv/dnn_backend/relu.hpp"
#endif

namespace lbann {

#ifdef LBANN_HAS_DISTCONV
Expand Down
35 changes: 19 additions & 16 deletions include/lbann/layers/learning/distconv/distconv_layers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,25 +48,26 @@ namespace distconv{
tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output);

template <typename Allocator>
int apply_bias(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &bias,
tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output);

int apply_bias(
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& bias,
tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& output);

template <typename Allocator>
int backward_wrt_input(
bool transpose_A,
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &linearity,
tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input_grad);

template <typename Allocator>
int backward_wrt_weight(
bool transpose,
DataType dst_scale,
DataType gradient_scale,
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input,
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &linearity_grad);
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& input,
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& output_grad,
tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& linearity_grad);

template <typename Allocator>
int backward_wrt_bias(
DataType gradient_scale,
Expand All @@ -78,23 +79,25 @@ namespace distconv{
Backend &m_be;
}; // class definition ChannelwiseFullyConnected


template <typename DataType, typename locale, typename Allocator>
tensor::Shape
get_fc_output_local_tensor_shape(const tensor::Tensor<DataType, locale, Allocator> &input,
const int_vector &linearity_dims,
bool transpose){
tensor::Shape get_fc_output_local_tensor_shape(
const tensor::Tensor<DataType, locale, Allocator>& input,
const int_vector& linearity_dims,
bool transpose)
{

//https://github.com/LLNL/DiHydrogen/blob/7f86db1f9701ac3afb5e16aefdd57563d57a1698/legacy/include/distconv/distconv.hpp#L173

//Get the input layer local tensor shape
// Get the input layer local tensor shape

auto output_local_shape = input.get_local_shape();
output_local_shape[0] = transpose? linearity_dims[1] : linearity_dims[0];
return output_local_shape;
}
extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, float>;
extern template class ChannelwiseFullyConnected<::distconv::cudnn::BackendCUDNN, double>;
extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib,
float>;
extern template class ChannelwiseFullyConnected<::distconv::BackendDNNLib,
double>;
} // namespace distconv

#endif // LBANN_HAS_DISTCONV
Expand Down
4 changes: 2 additions & 2 deletions include/lbann/layers/transform/concatenate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ fp_compute() {
dc::tensor::Concatenate(this->get_activations(0),
this->get_prev_activations(0),
this->get_prev_activations(1),
hydrogen::cuda::GetDefaultStream());
default_hydrogen_stream());
}

template <typename TensorDataType, data_layout Layout, El::Device Device>
Expand All @@ -477,7 +477,7 @@ bp_compute() {
dc::tensor::Slice(this->get_error_signals(0),
this->get_error_signals(1),
this->get_prev_error_signals(0),
hydrogen::cuda::GetDefaultStream());
default_hydrogen_stream());
}
#endif // LBANN_HAS_DISTCONV

Expand Down
23 changes: 16 additions & 7 deletions include/lbann/layers/transform/pooling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#define LBANN_LAYER_POOLING_HPP_INCLUDED

#include "lbann/layers/data_type_layer.hpp"
#include "lbann/models/model.hpp"
#include "lbann/utils/dim_helpers.hpp"
#include "lbann/utils/dnn_enums.hpp"
#ifdef LBANN_HAS_DNN_LIB
Expand Down Expand Up @@ -69,7 +70,8 @@ class pooling_distconv_adapter : public data_type_distconv_adapter<TensorDataTyp
void setup_distributions(tensor_overlap_constraints &constraints) override;
dc::Shape get_activations_local_shape(int index=0) const override;
void setup_layer(size_t workspace_capacity) override;
void fp_compute();
void
fp_compute(bool training = true); // training=true for max back-compatibility.
void bp_compute();
std::unique_ptr<dc::Pooling<TensorDataType>> m_pooling;
};
Expand Down Expand Up @@ -295,12 +297,15 @@ class pooling_layer : public data_type_layer<TensorDataType> {
if(this->using_gpus()) {
#ifdef LBANN_HAS_DISTCONV
if (this->distconv_enabled()) {
get_distconv_adapter().fp_compute();
const auto& mode =
this->m_model->get_execution_context().get_execution_mode();
get_distconv_adapter().fp_compute(mode == execution_mode::training);
return;
}
#endif // LBANN_HAS_DISTCONV
fp_compute_dnn();
} else {
}
else {
fp_compute_im2col();
}
}
Expand Down Expand Up @@ -801,10 +806,14 @@ setup_layer(size_t workspace_capacity) {
}

template <typename TensorDataType, data_layout Layout, El::Device Device>
void pooling_distconv_adapter<TensorDataType, Layout, Device>::
fp_compute() {
m_pooling->forward(El::To<TensorDataType>(1), this->get_prev_activations(),
El::To<TensorDataType>(0), this->get_activations());
void pooling_distconv_adapter<TensorDataType, Layout, Device>::fp_compute(
bool const training)
{
m_pooling->forward(El::To<TensorDataType>(1),
this->get_prev_activations(),
El::To<TensorDataType>(0),
this->get_activations(),
training);
}

template <typename TensorDataType, data_layout Layout, El::Device Device>
Expand Down
18 changes: 14 additions & 4 deletions include/lbann/utils/distconv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,13 @@
#define DISTCONV_DEBUG
#endif

#include "distconv/dnn_backend/backend.hpp"
#include "distconv/distconv.hpp"
#include "distconv/tensor/tensor_mpi_cuda.hpp"
#include "distconv/tensor/algorithms.hpp"
#include "distconv/tensor/shuffle_mpi.hpp"
#include "distconv/tensor/shuffle_mpi_cuda.hpp"
#include "distconv/tensor/shuffle_mpi_cuda_al.hpp"
#include "distconv/tensor/algorithms.hpp"
#include "distconv/tensor/tensor_mpi_cuda.hpp"
#include "distconv/util/util.hpp"
#ifdef DISTCONV_HAS_P2P
#include "p2p/p2p.hpp"
Expand All @@ -55,6 +56,15 @@
#include "lbann/layers/learning/distconv/distconv_layers.hpp"
namespace lbann {

inline auto default_hydrogen_stream()
{
#if H2_HAS_CUDA
return hydrogen::cuda::GetDefaultStream();
#elif H2_HAS_ROCM
return hydrogen::rocm::GetDefaultStream();
#endif
}

class Layer;

namespace dc {
Expand Down Expand Up @@ -109,7 +119,7 @@ using MPIRootPrintStreamInfo = ::distconv::util::MPIRootPrintStreamInfo;
using MPIRootPrintStreamWaning = ::distconv::util::MPIRootPrintStreamWarning;

// Distconv layer classes
using Backend = ::distconv::cudnn::BackendCUDNN;
using Backend = ::distconv::BackendDNNLib;
using ReLU = ::distconv::ReLU<Backend>;
using LeakyReLU = ::distconv::LeakyReLU<Backend>;
template <typename TensorDataType>
Expand Down Expand Up @@ -232,7 +242,7 @@ Dist get_hydrogen_data_parallel_distribution(int num_dims);
template <typename Tensor>
void dump_tensor(const Tensor &t, const std::string &path) {
dc::MPIPrintStreamDebug() << "Dumping tensor to " << path;
cudaDeviceSynchronize();
h2::gpu::sync();
distconv::dump_tensor(t, path, true);
}

Expand Down
4 changes: 2 additions & 2 deletions include/lbann/utils/dnn_lib/miopen.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ using dnnHandle_t = miopenHandle_t;
using dnnDataType_t = miopenDataType_t;
using dnnTensorDescriptor_t = miopenTensorDescriptor_t;
using dnnFilterDescriptor_t = miopenTensorDescriptor_t;
using dnnTensorFormat_t = int;
using dnnTensorFormat_t = miopenTensorLayout_t;
using dnnDropoutDescriptor_t = miopenDropoutDescriptor_t;
using dnnRNGType_t = miopenRNGType_t;
using dnnRNNDescriptor_t = miopenRNNDescriptor_t;
Expand Down Expand Up @@ -110,7 +110,7 @@ using dnnConvolutionBwdFilterAlgo_t = miopenConvBwdWeightsAlgorithm_t;
constexpr dnnConvolutionMode_t DNN_CROSS_CORRELATION = miopenConvolution;
constexpr dnnNanPropagation_t DNN_PROPAGATE_NAN = MIOPEN_PROPAGATE_NAN;
constexpr dnnMathType_t DNN_DEFAULT_MATH = 0;
constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = 0;
constexpr dnnTensorFormat_t DNN_TENSOR_NCHW = miopenTensorNCHW;
constexpr dnnRNGType_t DNN_RNG_PSEUDO_XORWOW = MIOPEN_RNG_PSEUDO_XORWOW;
constexpr dnnLRNMode_t DNN_LRN_CROSS_CHANNEL = miopenLRNCrossChannel;
constexpr dnnMathType_t DNN_TENSOR_OP_MATH_ALLOW_CONVERSION = -1; // not supported with ROCm
Expand Down
1 change: 0 additions & 1 deletion src/callbacks/check_gradients.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ struct CheckWeightsFunctor : DefaultErrorReporter
// Get weights matrix and gradient
const auto& weights_matrix = dtw.get_values();
const auto& gradient = dtw.get_optimizer()->get_gradient();

// Iterate through weights matrix entries
for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
for (El::Int row = 0; row < weights_matrix.Height(); ++row) {
Expand Down
34 changes: 16 additions & 18 deletions src/layers/data_type_distconv_adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ setup_prev_activations_i(int index) const {
const dc::LocaleMPI loc(dc::get_mpi_comm(), false);
t = std::make_unique<InputTensorDevType>(shape, loc, dist, local_shape);
assert0(t->allocate());
t->zero(hydrogen::cuda::GetDefaultStream());
t->zero(default_hydrogen_stream());
} else {
// Create a shallow copy
const auto &parent_activations =
Expand Down Expand Up @@ -427,7 +427,7 @@ setup_activations_i(int index) const {
const auto local_shape = get_activations_local_shape(index);
auto t = std::make_unique<OutputTensorDevType>(shape, loc, dist, local_shape);
assert0(t->allocate());
t->zero(hydrogen::cuda::GetDefaultStream());
t->zero(default_hydrogen_stream());
return t;
}

Expand Down Expand Up @@ -483,7 +483,7 @@ setup_prev_error_signals_i(int index) const {
const dc::LocaleMPI loc(dc::get_mpi_comm(), false);
t = std::make_unique<OutputTensorDevType>(shape, loc, dist, local_shape);
assert0(t->allocate());
t->zero(hydrogen::cuda::GetDefaultStream());
t->zero(default_hydrogen_stream());
} else {
// Create a shallow copy
const auto &child_error_signals =
Expand Down Expand Up @@ -550,7 +550,7 @@ setup_error_signals_i(int index) const {
const auto local_shape = get_error_signals_local_shape(index);
auto t = std::make_unique<InputTensorDevType>(shape, loc, dist, local_shape);
assert0(t->allocate());
t->zero(hydrogen::cuda::GetDefaultStream());
t->zero(default_hydrogen_stream());
return t;
}

Expand Down Expand Up @@ -797,9 +797,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::ensu
get_original_prev_activations(),
get_prev_activations());
shuffler.shuffle_forward(
get_original_prev_activations().get_const_base_ptr(),
get_prev_activations().get_base_ptr(),
hydrogen::cuda::GetDefaultStream());
get_original_prev_activations().get_const_base_ptr(),
get_prev_activations().get_base_ptr(),
default_hydrogen_stream());
}
}

Expand All @@ -818,10 +818,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::copy
auto &shuffler = get_activations_shuffler(
get_activations(),
get_original_activations());
shuffler.shuffle_forward(
get_activations().get_const_base_ptr(),
get_original_activations().get_base_ptr(),
hydrogen::cuda::GetDefaultStream());
shuffler.shuffle_forward(get_activations().get_const_base_ptr(),
get_original_activations().get_base_ptr(),
default_hydrogen_stream());
}
}

Expand All @@ -846,9 +845,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::ensu
get_original_prev_error_signals(i),
get_prev_error_signals(i));
shuffler.shuffle_forward(
get_original_prev_error_signals(i).get_const_base_ptr(),
get_prev_error_signals(i).get_base_ptr(),
hydrogen::cuda::GetDefaultStream());
get_original_prev_error_signals(i).get_const_base_ptr(),
get_prev_error_signals(i).get_base_ptr(),
default_hydrogen_stream());
}
}

Expand All @@ -868,10 +867,9 @@ void data_type_distconv_adapter<InputTensorDataType, OutputTensorDataType>::copy
auto &shuffler = get_error_signals_shuffler(
get_error_signals(i),
get_original_error_signals(i));
shuffler.shuffle_forward(
get_error_signals(i).get_const_base_ptr(),
get_original_error_signals(i).get_base_ptr(),
hydrogen::cuda::GetDefaultStream());
shuffler.shuffle_forward(get_error_signals(i).get_const_base_ptr(),
get_original_error_signals(i).get_base_ptr(),
default_hydrogen_stream());
}
}

Expand Down
8 changes: 6 additions & 2 deletions src/layers/io/input_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,11 @@ void input_distconv_adapter<TensorDataType, T_layout, Dev>::setup_fp_tensors() {
// only specialized for BaseAllocator.
size_t buf_size = m_host_tensor->get_local_real_size() * sizeof(TensorDataType);
TensorDataType *buf = nullptr;
#if H2_HAS_CUDA
CHECK_CUDA(cudaMallocHost(&buf, buf_size));
#elif H2_HAS_ROCM
CHECK_ROCM(hipHostMalloc(&buf, buf_size));
#endif
// Note buf should be deallocated.
dc::tensor::View(*m_host_tensor, buf);
setup_shuffler_buffers(*m_original_host_tensor,
Expand Down Expand Up @@ -338,7 +342,7 @@ setup_activations_i(int index) const {
const auto local_shape = get_activations_local_shape(index);
auto t = std::make_unique<TensorDevType>(shape, loc, dist, local_shape);
assert0(t->allocate());
t->zero(hydrogen::cuda::GetDefaultStream());
t->zero(default_hydrogen_stream());
return t;
}
else {
Expand Down Expand Up @@ -434,7 +438,7 @@ template <typename TensorDataType,
void input_distconv_adapter<TensorDataType, T_layout, Dev>::fp_compute() {
auto &l = dynamic_cast<input_layer<
TensorDataType, T_layout, Dev>&>(this->layer());
auto stream = hydrogen::cuda::GetDefaultStream();
auto stream = default_hydrogen_stream();
// Note that the mini-batch size of the data reader is not
// actually the one for the current mini-batch as the mini-batch
// index is already updated by fp_compute.
Expand Down
4 changes: 2 additions & 2 deletions src/layers/learning/distconv/distconv_layers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ namespace distconv{
const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator> &output_gradient, \
tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator> &bias_gradient);

ETI(float, cudnn::BackendCUDNN)
ETI(double, cudnn::BackendCUDNN)
ETI(float, BackendDNNLib)
ETI(double, BackendDNNLib)
#undef ETI
} // namespace distconv
Loading