diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d1f37eb2b..7ec8df6133 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,6 +132,10 @@ if((ENABLE_GPERFTOOLS OR ENABLE_TCMALLOC_MINIMAL) AND CMAKE_SYSTEM_NAME MATCHES set(ENABLE_LIBUNWIND ON) add_feature_info(Libunwind ENABLE_LIBUNWIND "Libunwind provides stack unwinding") endif() + +option(TA_TENSOR_MEM_PROFILE "Turn on instrumented profiling of TA::Tensor memory use" OFF) +add_feature_info(TENSOR_MEM_PROFILE TA_TENSOR_MEM_PROFILE "instrumented profiling of TA::Tensor memory use") + option(TA_BUILD_UNITTEST "Causes building TiledArray unit tests" ON) option(TA_EXPERT "TiledArray Expert mode: disables automatically downloading or building dependencies" OFF) @@ -264,7 +268,7 @@ check_type_size("long double" TILEDARRAY_HAS_LONG_DOUBLE LANGUAGE CXX) check_type_size("long long" TILEDARRAY_HAS_LONG_LONG LANGUAGE CXX) # TA_ASSERT -set (TA_ASSERT_POLICY TA_ASSERT_THROW CACHE STRING "") +set (TA_ASSERT_POLICY TA_ASSERT_THROW CACHE STRING "Controls the behavior of TA_ASSERT; TA_ASSERT_THROW causes TA_ASSERT to throw, TA_ASSERT_ABORT causes TA_ASSERT to abort, TA_ASSERT_IGNORE makes TA_ASSERT a no-op") set_property( CACHE TA_ASSERT_POLICY PROPERTY STRINGS TA_ASSERT_THROW TA_ASSERT_ABORT TA_ASSERT_IGNORE) @@ -294,6 +298,7 @@ endif() include(external/madness.cmake) detect_MADNESS_configuration() include(external/eigen.cmake) +include(external/umpire.cmake) ###### discover linear algebra diff --git a/INSTALL.md b/INSTALL.md index f2dc253661..f532e84d9b 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](https://github.com/ValeevGroup/BTAS), tag d7794799e4510cf66844081dd8f1f5b648112d33 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. 
-- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 7ce06234c23aa8e0ab3d9e9b87eff9cd85390d80 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 12bd24c6cb984a639be863fc0e1364226713f7ff . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. @@ -65,7 +65,7 @@ Compiling BTAS requires the following prerequisites: Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing: - [cuTT](github.com/ValeevGroup/cutt) -- CUDA transpose library; note that our fork of the [original cuTT repo](github.com/ap-hynninen/cutt) is required to provide thread-safety (tag 0e8685bf82910bc7435835f846e88f1b39f47f09). - - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f04abd1dd038c84262915a493d8f78576bb80fd0). + - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 5201a47a35e3844160dcbecd0916f8c96aa7dd07). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing: - [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ (C++17) wrapper for BLACS (tag 88076f1706be083ead882f6ce0bfc6884a72fc03) @@ -405,6 +405,7 @@ support may be added. * `TA_ENABLE_RANGEV3` -- Set to `ON` to find or fetch the Range-V3 library and enable additional tests of TA components with constructs anticipated to be supported in the future. 
[Default=OFF]. * `TA_SIGNED_1INDEX_TYPE` -- Set to `OFF` to use unsigned 1-index coordinate type (default for TiledArray 1.0.0-alpha.2 and older). The default is `ON`, which enables the use of negative indices in coordinates. * `TA_MAX_SOO_RANK_METADATA` -- Specifies the maximum rank for which to use Small Object Optimization (hence, avoid the use of the heap) for metadata. The default is `8`. +* `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile memory allocations in TA::Tensor. # Build TiledArray diff --git a/cmake/modules/FetchBLT.cmake b/cmake/modules/FetchBLT.cmake deleted file mode 100644 index 706c34c327..0000000000 --- a/cmake/modules/FetchBLT.cmake +++ /dev/null @@ -1,9 +0,0 @@ -FetchContent_Declare( - BLT - GIT_REPOSITORY https://github.com/evaleev/blt.git - GIT_TAG origin/develop -) -FetchContent_MakeAvailable(BLT) -FetchContent_GetProperties(BLT - SOURCE_DIR BLT_SOURCE_DIR -) diff --git a/external/cutt.cmake b/external/cutt.cmake index 7dc0d6e83d..f945b54b15 100644 --- a/external/cutt.cmake +++ b/external/cutt.cmake @@ -23,10 +23,10 @@ else() enable_language(C) # set source and build path for cuTT in the TiledArray project - set(EXTERNAL_SOURCE_DIR ${PROJECT_BINARY_DIR}/external/source/cutt) + set(EXTERNAL_SOURCE_DIR ${CMAKE_BINARY_DIR}/_deps/cutt-src) # cutt only supports in source build - set(EXTERNAL_BUILD_DIR ${PROJECT_BINARY_DIR}/external/build/cutt) - set(EXTERNAL_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/external/cutt) + set(EXTERNAL_BUILD_DIR ${CMAKE_BINARY_DIR}/_deps/cutt-build) + set(EXTERNAL_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}) if (NOT CUTT_URL) set(CUTT_URL https://github.com/ValeevGroup/cutt.git) @@ -72,6 +72,9 @@ else() -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} ) + if (DEFINED CMAKE_CUDA_ARCHITECTURES) + list(APPEND CUTT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) + endif(DEFINED CMAKE_CUDA_ARCHITECTURES) if (CMAKE_TOOLCHAIN_FILE) set(CUTT_CMAKE_ARGS 
"${CUTT_CMAKE_ARGS}" "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") @@ -89,8 +92,8 @@ else() ExternalProject_Add(cutt PREFIX ${CMAKE_INSTALL_PREFIX} - STAMP_DIR ${PROJECT_BINARY_DIR}/external/cutt-stamp - TMP_DIR ${PROJECT_BINARY_DIR}/external/tmp + STAMP_DIR ${CMAKE_BINARY_DIR}/_deps/cutt-ep-artifacts + TMP_DIR ${CMAKE_BINARY_DIR}/_deps/cutt-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} GIT_REPOSITORY ${CUTT_URL} diff --git a/external/eigen.cmake b/external/eigen.cmake index 1b034177b7..361a8fe8ae 100644 --- a/external/eigen.cmake +++ b/external/eigen.cmake @@ -93,15 +93,13 @@ else() include(ExternalProject) # Set source and build path for Eigen3 in the TiledArray Project - set(EXTERNAL_SOURCE_DIR ${PROJECT_BINARY_DIR}/external/source/eigen) - set(EXTERNAL_BUILD_DIR ${PROJECT_BINARY_DIR}/external/build/eigen) + set(EXTERNAL_SOURCE_DIR ${CMAKE_BINARY_DIR}/_deps/eigen-src) + set(EXTERNAL_BUILD_DIR ${CMAKE_BINARY_DIR}/_deps/eigen-build) message("** Will build Eigen from ${EIGEN3_URL}") ExternalProject_Add(eigen3 PREFIX ${CMAKE_INSTALL_PREFIX} - STAMP_DIR ${EXTERNAL_BUILD_DIR}/stamp - TMP_DIR ${EXTERNAL_BUILD_DIR}/tmp #--Download step-------------- DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} URL ${EIGEN3_URL} diff --git a/external/madness.cmake b/external/madness.cmake index 0c9f2fa482..9ab766f7e6 100644 --- a/external/madness.cmake +++ b/external/madness.cmake @@ -129,9 +129,9 @@ else() # Create a cache entry for MADNESS build variables. # Note: This will not overwrite user specified values. 
- set(MADNESS_SOURCE_DIR "${PROJECT_BINARY_DIR}/external/madness-src" CACHE PATH + set(MADNESS_SOURCE_DIR "${CMAKE_BINARY_DIR}/_deps/madness-src" CACHE PATH "Path to the MADNESS source directory") - set(MADNESS_BINARY_DIR "${PROJECT_BINARY_DIR}/external/madness-build" CACHE PATH + set(MADNESS_BINARY_DIR "${CMAKE_BINARY_DIR}/_deps/madness-build" CACHE PATH "Path to the MADNESS build directory") set(MADNESS_URL "https://github.com/m-a-d-n-e-s-s/madness.git" CACHE STRING "Path to the MADNESS repository") @@ -152,16 +152,16 @@ else() # If the MADNESS source directory is the default location and does not exist, # MADNESS will be downloaded from git. message(STATUS "Checking MADNESS source directory: ${MADNESS_SOURCE_DIR}") - if("${MADNESS_SOURCE_DIR}" STREQUAL "${PROJECT_BINARY_DIR}/external/madness-src") + if("${MADNESS_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}/_deps/madness-src") - # Create the external source directory - if(NOT EXISTS ${PROJECT_BINARY_DIR}/external) + # Create the source directory + if(NOT EXISTS ${CMAKE_BINARY_DIR}/_deps) set(error_code 1) execute_process( - COMMAND "${CMAKE_COMMAND}" -E make_directory "${PROJECT_BINARY_DIR}/external" + COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_BINARY_DIR}/_deps" RESULT_VARIABLE error_code) if(error_code) - message(FATAL_ERROR "Failed to create directory \"${PROJECT_BINARY_DIR}/external\"") + message(FATAL_ERROR "Failed to create directory \"${CMAKE_BINARY_DIR}/_deps\"") endif() endif() @@ -173,7 +173,7 @@ else() while(error_code AND number_of_tries LESS 3) execute_process( COMMAND ${GIT_EXECUTABLE} clone ${MADNESS_URL} madness-src - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/external + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/_deps RESULT_VARIABLE error_code) math(EXPR number_of_tries "${number_of_tries} + 1") endwhile() diff --git a/external/umpire.cmake b/external/umpire.cmake index ad8632e2c0..d90100cf4d 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -2,6 +2,8 @@ ## find Umpire ## +if 
(NOT TARGET TiledArray_UMPIRE) + find_path(_UMPIRE_INSTALL_DIR NAMES include/umpire/Umpire.hpp HINTS ${UMPIRE_INSTALL_DIR}) # if user provides UMPIRE, use it @@ -27,9 +29,9 @@ else() enable_language(C) # set source and build path for Umpire in the TiledArray project - set(EXTERNAL_SOURCE_DIR ${PROJECT_BINARY_DIR}/external/source/Umpire) - set(EXTERNAL_BUILD_DIR ${PROJECT_BINARY_DIR}/external/build/Umpire) - set(EXTERNAL_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/external/Umpire) + set(EXTERNAL_SOURCE_DIR ${CMAKE_BINARY_DIR}/_deps/umpire-src) + set(EXTERNAL_BUILD_DIR ${CMAKE_BINARY_DIR}/_deps/umpire-build) + set(EXTERNAL_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}) if (NOT UMPIRE_URL) set(UMPIRE_URL https://github.com/LLNL/Umpire.git) @@ -40,9 +42,6 @@ else() message("** Will clone Umpire from ${UMPIRE_URL}") - ## use patched BLT - include(FetchBLT) - if (TA_ASSERT_POLICY EQUAL TA_ASSERT_IGNORE) set(enable_umpire_asserts OFF) else() @@ -69,21 +68,27 @@ else() -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} -DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS} -DCMAKE_AR=${CMAKE_AR} - -DBLT_SOURCE_DIR=${BLT_SOURCE_DIR} - -DBLT_CXX_STD=c++${CMAKE_CUDA_STANDARD} - -DENABLE_CUDA=ON + -DBLT_CXX_STD=c++${CMAKE_CXX_STANDARD} -DENABLE_BENCHMARKS=OFF -DENABLE_OPENMP=OFF -DENABLE_TESTS=OFF -DENABLE_EXAMPLES=OFF -DENABLE_LOGGING=OFF -DENABLE_ASSERTS=${enable_umpire_asserts} - -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} - -DCMAKE_CUDA_STANDARD=${CMAKE_CUDA_STANDARD} - -DCMAKE_CUDA_EXTENSIONS=${CMAKE_CUDA_EXTENSIONS} - -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} - -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} ) + if (ENABLE_CUDA) + list(APPEND UMPIRE_CMAKE_ARGS + -DENABLE_CUDA=ON + -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} + -DCMAKE_CUDA_STANDARD=${CMAKE_CUDA_STANDARD} + -DCMAKE_CUDA_EXTENSIONS=${CMAKE_CUDA_EXTENSIONS} + -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} + -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} + ) + if (DEFINED CMAKE_CUDA_ARCHITECTURES) + list(APPEND 
UMPIRE_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) + endif(DEFINED CMAKE_CUDA_ARCHITECTURES) + endif(ENABLE_CUDA) if (CMAKE_TOOLCHAIN_FILE) set(UMPIRE_CMAKE_ARGS "${UMPIRE_CMAKE_ARGS}" "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}" @@ -102,13 +107,12 @@ else() ExternalProject_Add(Umpire PREFIX ${CMAKE_INSTALL_PREFIX} - STAMP_DIR ${PROJECT_BINARY_DIR}/external/Umpire-stamp - TMP_DIR ${PROJECT_BINARY_DIR}/external/tmp + STAMP_DIR ${CMAKE_BINARY_DIR}/_deps/umpire-ep-artifacts + TMP_DIR ${CMAKE_BINARY_DIR}/_deps/umpire-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} GIT_REPOSITORY ${UMPIRE_URL} GIT_TAG ${UMPIRE_TAG} - GIT_SUBMODULES "" # N.B. do not initialize modules! #--Configure step------------- SOURCE_DIR ${EXTERNAL_SOURCE_DIR} LIST_SEPARATOR :: @@ -152,7 +156,7 @@ set_target_properties( TiledArray_UMPIRE PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "$;$;$" + "$;$;$;$" INTERFACE_LINK_LIBRARIES "$;$" ) @@ -160,3 +164,5 @@ set_target_properties( install(TARGETS TiledArray_UMPIRE EXPORT tiledarray COMPONENT tiledarray) #TODO test Umpire + +endif(NOT TARGET TiledArray_UMPIRE) diff --git a/external/versions.cmake b/external/versions.cmake index bd82b1adf5..f4451bd0c4 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH b9e98a200d2455f06db9c661c5610496) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 6694b3adc9204dc86aba9911444aa6737171c9e3) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG f47f962213be0b1e149f5b56826992f27278128e) +set(TA_TRACKED_MADNESS_TAG 12bd24c6cb984a639be863fc0e1364226713f7ff) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 6694b3adc9204dc86aba9911444aa6737171c9e3) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) @@ -30,7 +30,7 @@ set(TA_TRACKED_BTAS_PREVIOUS_TAG 
f4c965b778ff7df74f276f7fcf6224ac419e8ee3) set(TA_TRACKED_CUTT_TAG 0e8685bf82910bc7435835f846e88f1b39f47f09) set(TA_TRACKED_CUTT_PREVIOUS_TAG 592198b93c93b7ca79e7900b9a9f2e79f9dafec3) -set(TA_TRACKED_UMPIRE_TAG f04abd1dd038c84262915a493d8f78576bb80fd0) +set(TA_TRACKED_UMPIRE_TAG 5201a47a35e3844160dcbecd0916f8c96aa7dd07) set(TA_TRACKED_UMPIRE_PREVIOUS_TAG f04abd1dd038c84262915a493d8f78576bb80fd0) #set(TA_TRACKED_BLACSPP_TAG 20cfd414c5b719be1c958f4a2d57abef06df83b6 ) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e1ca7b7943..0f54a0348a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,7 +35,6 @@ TiledArray/dense_shape.h TiledArray/dist_array.h TiledArray/distributed_storage.h TiledArray/error.h -TiledArray/external/madness.h TiledArray/initialize.h TiledArray/perm_index.h TiledArray/permutation.h @@ -117,6 +116,10 @@ TiledArray/expressions/unary_engine.h TiledArray/expressions/unary_expr.h TiledArray/expressions/index_list.h TiledArray/external/btas.h +TiledArray/external/madness.h +TiledArray/external/umpire.h +TiledArray/host/env.h +TiledArray/host/allocator.h TiledArray/math/blas.h TiledArray/math/gemm_helper.h TiledArray/math/outer.h @@ -217,7 +220,7 @@ TiledArray/math/linalg/rank-local.cpp ) # the list of libraries on which TiledArray depends on, will be cached later -set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers) +set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers TiledArray_UMPIRE) # TODO better ways to handle tiledarray cuda dependency if(CUDA_FOUND) @@ -234,7 +237,7 @@ if(CUDA_FOUND) LANGUAGE CUDA) # the list of libraries on which TiledArray depends on - list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_UMPIRE TiledArray_CUTT) + list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_CUTT) endif(CUDA_FOUND) diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index f038919ea2..86fef14166 100644 --- 
a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -78,6 +78,9 @@ #cmakedefine TILEDARRAY_HAS_CUDA @TILEDARRAY_HAS_CUDA@ #cmakedefine TILEDARRAY_CHECK_CUDA_ERROR @TILEDARRAY_CHECK_CUDA_ERROR@ +/* Is TA::Tensor memory profiling enabled? */ +#cmakedefine TA_TENSOR_MEM_PROFILE 1 + /* Use preprocessor to check if BTAS is available */ #ifndef TILEDARRAY_HAS_BTAS #ifdef __has_include diff --git a/src/TiledArray/cuda/um_allocator.h b/src/TiledArray/cuda/um_allocator.h index 437527e28d..94dfb0de82 100644 --- a/src/TiledArray/cuda/um_allocator.h +++ b/src/TiledArray/cuda/um_allocator.h @@ -29,6 +29,7 @@ #ifdef TILEDARRAY_HAS_CUDA #include +#include #include #include @@ -38,49 +39,31 @@ namespace TiledArray { /// CUDA UM allocator, based on boilerplate by Howard Hinnant /// (https://howardhinnant.github.io/allocator_boilerplate.html) template -class cuda_um_allocator_impl { +class cuda_um_allocator_impl : public umpire_allocator_impl { public: - using value_type = T; - using pointer = T*; - using reference = T&; - using const_reference = const T&; + using base_type = umpire_allocator_impl; + using typename base_type::const_pointer; + using typename base_type::const_reference; + using typename base_type::pointer; + using typename base_type::reference; + using typename base_type::value_type; cuda_um_allocator_impl() noexcept - : um_dynamic_pool_(&cudaEnv::instance()->um_dynamic_pool()) {} + : base_type(&cudaEnv::instance()->um_dynamic_pool()) {} template cuda_um_allocator_impl(const cuda_um_allocator_impl& rhs) noexcept - : um_dynamic_pool_(rhs.um_dynamic_pool_) {} - - /// allocates um memory using umpire dynamic pool - pointer allocate(size_t n) { - pointer result = nullptr; - - TA_ASSERT(um_dynamic_pool_); - - result = static_cast(um_dynamic_pool_->allocate(n * sizeof(T))); - - return result; - } - - /// deallocate um memory using umpire dynamic pool - void deallocate(value_type* ptr, size_t) { - TA_ASSERT(um_dynamic_pool_); - um_dynamic_pool_->deallocate(ptr); 
- } + : base_type(static_cast&>(rhs)) {} template friend bool operator==(const cuda_um_allocator_impl& lhs, const cuda_um_allocator_impl& rhs) noexcept; - - private: - umpire::Allocator* um_dynamic_pool_; }; // class cuda_um_allocator template bool operator==(const cuda_um_allocator_impl& lhs, const cuda_um_allocator_impl& rhs) noexcept { - return lhs.um_dynamic_pool_ == rhs.um_dynamic_pool_; + return lhs.umpire_allocator() == rhs.umpire_allocator(); } template @@ -89,37 +72,6 @@ bool operator!=(const cuda_um_allocator_impl& lhs, return !(lhs == rhs); } -/// see -/// https://stackoverflow.com/questions/21028299/is-this-behavior-of-vectorresizesize-type-n-under-c11-and-boost-container/21028912#21028912 -template -class default_init_allocator : public A { - using a_t = std::allocator_traits; - - public: - using reference = typename A::reference; // std::allocator::reference - // deprecated in C++17, but thrust - // still relying on this - using const_reference = typename A::const_reference; // ditto - - template - struct rebind { - using other = - default_init_allocator>; - }; - - using A::A; - - template - void construct(U* ptr) noexcept( - std::is_nothrow_default_constructible::value) { - ::new (static_cast(ptr)) U; - } - template - void construct(U* ptr, Args&&... 
args) { - a_t::construct(static_cast(*this), ptr, std::forward(args)...); - } -}; - template using cuda_um_allocator = default_init_allocator>; diff --git a/src/TiledArray/external/cuda.h b/src/TiledArray/external/cuda.h index bf9e3171a1..78a210f784 100644 --- a/src/TiledArray/external/cuda.h +++ b/src/TiledArray/external/cuda.h @@ -38,11 +38,7 @@ #include #include -// for memory management -#include -#include -#include -#include +#include #include #include @@ -209,11 +205,10 @@ class cudaEnv { } } - /// no copy constructor - cudaEnv(cudaEnv& cuda_global) = delete; - - /// no assignment constructor - cudaEnv operator=(cudaEnv& cuda_global) = delete; + cudaEnv(const cudaEnv&) = delete; + cudaEnv(cudaEnv&&) = delete; + cudaEnv& operator=(const cudaEnv&) = delete; + cudaEnv& operator=(cudaEnv&&) = delete; /// access to static member static std::unique_ptr& instance() { @@ -257,7 +252,7 @@ class cudaEnv { // subsequent allocs will use 1/10 of the total device memory auto alloc_grain = mem_total_free.second / 10; auto um_dynamic_pool = - rm.makeAllocator( + rm.makeAllocator( "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second, alloc_grain); auto thread_safe_um_dynamic_pool = @@ -270,7 +265,7 @@ class cudaEnv { "size_limited_alloc", rm.getAllocator("DEVICE"), mem_total_free.first); auto dev_dynamic_pool = - rm.makeAllocator( + rm.makeAllocator( "CUDADynamicPool", dev_size_limited_alloc, 0, alloc_grain); auto thread_safe_dev_dynamic_pool = rm.makeAllocator( diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h new file mode 100644 index 0000000000..a55a34b615 --- /dev/null +++ b/src/TiledArray/external/umpire.h @@ -0,0 +1,139 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2021 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * Jan 31, 2018 + * + */ + +#ifndef TILEDARRAY_EXTERNAL_UMPIRE_H___INCLUDED +#define TILEDARRAY_EXTERNAL_UMPIRE_H___INCLUDED + +#include + +#include + +// for memory management +#include +#include +#include +#include + +#include +#include + +namespace TiledArray { + +/// wraps Umpire allocator into a standard-compliant allocator, +/// based on the boilerplate by Howard Hinnant +/// (https://howardhinnant.github.io/allocator_boilerplate.html) +template +class umpire_allocator_impl { + public: + using value_type = T; + using pointer = value_type*; + using const_pointer = + typename std::pointer_traits::template rebind; + using void_pointer = + typename std::pointer_traits::template rebind; + using const_void_pointer = + typename std::pointer_traits::template rebind; + + using reference = T&; + using const_reference = const T&; + + using difference_type = + typename std::pointer_traits::difference_type; + using size_type = std::make_unsigned_t; + + umpire_allocator_impl(umpire::Allocator* umpalloc) noexcept + : umpalloc_(umpalloc) {} + + template + umpire_allocator_impl(const umpire_allocator_impl& rhs) noexcept + : umpalloc_(rhs.umpalloc_) {} + + /// allocates storage for \c n objects of type \c T from the wrapped Umpire allocator + pointer allocate(size_t n) { + pointer result = 
nullptr; + + TA_ASSERT(umpalloc_); + + result = static_cast(umpalloc_->allocate(n * sizeof(T))); + + return result; + } + + /// returns \c ptr to the wrapped Umpire allocator; the size argument is ignored + void deallocate(pointer ptr, size_t) { + TA_ASSERT(umpalloc_); + umpalloc_->deallocate(ptr); + } + + const umpire::Allocator* umpire_allocator() const { return umpalloc_; } + + private: + umpire::Allocator* umpalloc_; +}; // class umpire_allocator_impl + +template +bool operator==(const umpire_allocator_impl& lhs, + const umpire_allocator_impl& rhs) noexcept { + return lhs.umpire_allocator() == rhs.umpire_allocator(); +} + +template +bool operator!=(const umpire_allocator_impl& lhs, + const umpire_allocator_impl& rhs) noexcept { + return !(lhs == rhs); +} + +/// see +/// https://stackoverflow.com/questions/21028299/is-this-behavior-of-vectorresizesize-type-n-under-c11-and-boost-container/21028912#21028912 +template +class default_init_allocator : public A { + using a_t = std::allocator_traits; + + public: + using reference = typename A::reference; // std::allocator::reference + // deprecated in C++17, but thrust + // still relying on this + using const_reference = typename A::const_reference; // ditto + + template + struct rebind { + using other = + default_init_allocator>; + }; + + using A::A; + + template + void construct(U* ptr) noexcept( + std::is_nothrow_default_constructible::value) { + ::new (static_cast(ptr)) U; + } + template + void construct(U* ptr, Args&&... 
args) { + a_t::construct(static_cast(*this), ptr, std::forward(args)...); + } +}; + +} // namespace TiledArray + +#endif // TILEDARRAY_EXTERNAL_UMPIRE_H___INCLUDED diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index b53626333e..6f2cd33701 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -25,14 +25,25 @@ #include #include -// #include // fwddecl for std::allocator +// uncomment to import fwddecl for std::allocator +// #include -namespace Eigen { // fwd define Eigen's aligned allocator for - // TiledArray::Tensor +// fwddecl Eigen::aligned_allocator +namespace Eigen { template class aligned_allocator; } // namespace Eigen +// fwddecl host_allocator +namespace TiledArray { +template +class host_allocator_impl; +template +class default_init_allocator; +template +using host_allocator = default_init_allocator>; +} // namespace TiledArray + namespace madness { class World; } @@ -53,8 +64,8 @@ class DensePolicy; class SparsePolicy; // TiledArray Tensors -template /* std::allocator */> +// can also use host_allocator and std::allocator for A +template > class Tensor; typedef Tensor TensorD; diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h new file mode 100644 index 0000000000..efaaeff9c4 --- /dev/null +++ b/src/TiledArray/host/allocator.h @@ -0,0 +1,77 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2021 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * Jan 31, 2018 + * + */ + +#ifndef TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED +#define TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED + +#include + +#include +#include + +#include + +#include +#include + +namespace TiledArray { + +/// host memory allocator that draws from hostEnv::host_allocator(), based on boilerplate by Howard Hinnant +/// (https://howardhinnant.github.io/allocator_boilerplate.html) +template +class host_allocator_impl : public umpire_allocator_impl { + public: + using base_type = umpire_allocator_impl; + using typename base_type::const_pointer; + using typename base_type::const_reference; + using typename base_type::pointer; + using typename base_type::reference; + using typename base_type::value_type; + + host_allocator_impl() noexcept + : base_type(&hostEnv::instance()->host_allocator()) {} + + template + host_allocator_impl(const host_allocator_impl& rhs) noexcept + : base_type(static_cast&>(rhs)) {} + + template + friend bool operator==(const host_allocator_impl& lhs, + const host_allocator_impl& rhs) noexcept; +}; // class host_allocator_impl + +template +bool operator==(const host_allocator_impl& lhs, + const host_allocator_impl& rhs) noexcept { + return lhs.umpire_allocator() == rhs.umpire_allocator(); +} + +template +bool operator!=(const host_allocator_impl& lhs, + const host_allocator_impl& rhs) noexcept { + return !(lhs == rhs); +} + +} // namespace TiledArray + +#endif // TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h new file mode 100644 index 0000000000..2ae0bf6930 --- /dev/null +++ b/src/TiledArray/host/env.h @@ -0,0 +1,130 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2021 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Chong Peng + * Department of Chemistry, Virginia Tech + * July 23, 2018 + * + */ + +#ifndef TILEDARRAY_HOST_ENV_H__INCLUDED +#define TILEDARRAY_HOST_ENV_H__INCLUDED + +#include + +// for memory management +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace TiledArray { + +/** + * hostEnv sets up the global host environment: a World and an Umpire host memory allocator + * + * Singleton class + */ + +class hostEnv { + public: + ~hostEnv() = default; + + hostEnv(const hostEnv&) = delete; + hostEnv(hostEnv&&) = delete; + hostEnv& operator=(const hostEnv&) = delete; + hostEnv& operator=(hostEnv&&) = delete; + + /// access the instance, if not initialized will be initialized using default + /// params + static std::unique_ptr& instance() { + if (!instance_accessor()) { + initialize(TiledArray::get_default_world()); + } + return instance_accessor(); + } + + /// initialize the instance using explicit params + static void initialize(World& world, + const std::uint64_t max_memory_size = (1ull << 40), + const std::uint64_t page_size = (1ull << 22)) { + // initialize only when not initialized + if (instance_accessor() == nullptr) { + // uncomment to debug umpire ops + // + // umpire::util::Logger::getActiveLogger()->setLoggingMsgLevel( + // umpire::util::message::Debug); + + // make thread-safe size-limited pool of 
host memory + + auto& rm = umpire::ResourceManager::getInstance(); + + // turn off Umpire introspection for non-Debug builds +#ifndef NDEBUG + constexpr auto introspect = true; +#else + constexpr auto introspect = false; +#endif + + // allocate zero memory for host pool, page_size is the grain for subsequent allocs; N.B. the size limiter needs a name distinct from cudaEnv's "size_limited_alloc" + auto host_size_limited_alloc = + rm.makeAllocator( + "HostSizeLimitedAlloc", rm.getAllocator("HOST"), max_memory_size); + auto host_dynamic_pool = + rm.makeAllocator( + "HostDynamicPool", host_size_limited_alloc, 0, page_size); + auto thread_safe_host_dynamic_pool = + rm.makeAllocator( + "ThreadSafeHostDynamicPool", host_dynamic_pool); + + auto host_env = std::unique_ptr( + new hostEnv(world, thread_safe_host_dynamic_pool)); + instance_accessor() = std::move(host_env); + } + } + + World& world() const { return *world_; } + + umpire::Allocator& host_allocator() { return host_allocator_; } + + protected: + hostEnv(World& world, umpire::Allocator host_alloc) + : world_(&world), host_allocator_(host_alloc) {} + + private: + // the world used to initialize this + World* world_; + + /// allocates from a thread-safe, dynamic, size-limited host memory pool + umpire::Allocator host_allocator_; + + inline static std::unique_ptr& instance_accessor() { + static std::unique_ptr instance_{nullptr}; + return instance_; + } +}; + +} // namespace TiledArray + +#endif // TILEDARRAY_HOST_ENV_H__INCLUDED diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index afe32a038f..6846ebb5bc 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -20,6 +20,8 @@ #ifndef TILEDARRAY_TENSOR_TENSOR_H__INCLUDED #define TILEDARRAY_TENSOR_TENSOR_H__INCLUDED +#include "TiledArray/host/allocator.h" + #include "TiledArray/math/blas.h" #include "TiledArray/math/gemm_helper.h" #include "TiledArray/tensor/complex.h" @@ -28,14 +30,18 @@ #include "TiledArray/tile_interface/permute.h" #include "TiledArray/tile_interface/trace.h" #include 
"TiledArray/util/logger.h" -namespace TiledArray { -// Forward declare Tensor for type traits -template -class Tensor; +namespace TiledArray { namespace detail { +#ifdef TA_TENSOR_MEM_PROFILE +inline static std::mutex + ta_tensor_mem_profile_mtx; // protects the following statics +inline static std::uint64_t nbytes_allocated = 0; +inline static std::uint64_t max_nbytes_allocated = 0; +#endif // TA_TENSOR_MEM_PROFILE + /// Signals that we can take the trace of a Tensor (for numeric \c T) template struct TraceIsDefined, enable_if_numeric_t> : std::true_type {}; @@ -90,6 +96,46 @@ class Tensor { /// This tensor is used as an evaluated intermediate for other tensors. class Impl : public allocator_type { +#ifdef TA_TENSOR_MEM_PROFILE + enum class MemOp{Alloc, Dealloc}; + void alloc_record(std::uint64_t n, MemOp action) { + const double to_MiB = + 1 / (1024.0 * 1024.0); /* Convert from bytes to MiB */ + const auto nbytes = n * sizeof(value_type); + { + std::scoped_lock lock(detail::ta_tensor_mem_profile_mtx); + if (action == MemOp::Alloc) { + detail::nbytes_allocated += nbytes; + detail::max_nbytes_allocated = + std::max(detail::nbytes_allocated, detail::max_nbytes_allocated); + } else + detail::nbytes_allocated -= nbytes; + } + char buf[1024]; + auto value_type_str = []() { + if constexpr (std::is_same_v) + return "double"; + else if constexpr (std::is_same_v) + return "float"; + else if constexpr (std::is_same_v>) + return "zdouble"; + else if constexpr (std::is_same_v>) + return "zfloat"; + else + return ""; + }; + std::snprintf( + buf, 1023, + "TA::Tensor<%s>: %sallocated %lf MiB [wm = %lf MiB hwm = %lf MiB]\n", + value_type_str(), (action == MemOp::Dealloc ? 
"de" : " "), + nbytes * to_MiB, detail::nbytes_allocated * to_MiB, + detail::max_nbytes_allocated * to_MiB); + auto& os = madness::print_meminfo_ostream(); + os << buf; + os.flush(); + } +#endif + public: /// Default constructor @@ -101,21 +147,30 @@ class Tensor { /// \param range The N-dimensional range for this tensor explicit Impl(const range_type& range) : allocator_type(), range_(range), data_(NULL) { - data_ = allocator_type::allocate(range.volume()); + data_ = allocator_type::allocate(range_.volume()); +#ifdef TA_TENSOR_MEM_PROFILE + alloc_record(range_.volume(), MemOp::Alloc); +#endif } /// Construct with rvalue range /// \param range The N-dimensional range for this tensor explicit Impl(range_type&& range) - : allocator_type(), range_(range), data_(NULL) { - data_ = allocator_type::allocate(range.volume()); + : allocator_type(), range_(std::move(range)), data_(NULL) { + data_ = allocator_type::allocate(range_.volume()); +#ifdef TA_TENSOR_MEM_PROFILE + alloc_record(range_.volume(), MemOp::Alloc); +#endif } ~Impl() { math::destroy_vector(range_.volume(), data_); allocator_type::deallocate(data_, range_.volume()); data_ = NULL; +#ifdef TA_TENSOR_MEM_PROFILE + alloc_record(range_.volume(), MemOp::Dealloc); +#endif } range_type range_; ///< Tensor size info @@ -627,12 +682,12 @@ class Tensor { /// \tparam Archive The output archive type /// \param[out] ar The output archive template >::type* = nullptr> + typename std::enable_if< + madness::is_output_archive_v>::type* = nullptr> void serialize(Archive& ar) { if (pimpl_) { const std::uint64_t volume = pimpl_->range_.volume(); - ar & volume; + ar& volume; ar& madness::archive::wrap(pimpl_->data_, volume); ar & pimpl_->range_; } else { @@ -646,8 +701,8 @@ class Tensor { /// \tparam Archive The input archive type /// \param[out] ar The input archive template >::type* = nullptr> + typename std::enable_if< + madness::is_input_archive_v>::type* = nullptr> void serialize(Archive& ar) { std::uint64_t n = 0; ar& n; @@ 
-1872,7 +1927,7 @@ class Tensor { } } } -#else // TA_ENABLE_TILE_OPS_LOGGING +#else // TA_ENABLE_TILE_OPS_LOGGING math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, factor, left.data(), lda, right.data(), ldb, numeric_type(1), pimpl_->data_, n); diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index aa56ea2fec..29b60a61d6 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -2,6 +2,8 @@ #include #include +#include + #ifdef TILEDARRAY_HAS_CUDA #include #include @@ -78,9 +80,8 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv, if (!initialized()) { if (!madness::initialized()) { initialized_madworld_accessor() = true; - } - else { // if MADWorld initialized, we must assume that comm is its default - // World. + } else { // if MADWorld initialized, we must assume that comm is its + // default World. if (madness::World::is_default(comm)) throw Exception( "MADWorld initialized before TiledArray::initialize(argc, argv, " @@ -120,16 +121,13 @@ void TiledArray::finalize() { finalized_accessor() = true; } -void TiledArray::ta_abort() { - std::abort(); -} +void TiledArray::ta_abort() { SafeMPI::COMM_WORLD.Abort(); } -void TiledArray::ta_abort(const std::string &m) { +void TiledArray::ta_abort(const std::string& m) { std::cerr << m << std::endl; ta_abort(); } - void TiledArray::taskq_wait_busy() { madness::threadpool_wait_policy(madness::WaitPolicy::Busy); }