From 1d003738e12ccf4845e7465c460e39736fd99149 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 18:10:25 -0400 Subject: [PATCH 01/16] build compat ABI Signed-off-by: Jinzhe Zeng --- source/CMakeLists.txt | 29 ++++++++-- source/api_cc/CMakeLists.txt | 2 +- source/lib/CMakeLists.txt | 90 +++++++++++++++++++------------ source/lib/src/gpu/CMakeLists.txt | 80 ++++++++++++++++++--------- source/op/pt/CMakeLists.txt | 11 +++- 5 files changed, 147 insertions(+), 65 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 105a7c2695..4c60bd95f4 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -186,13 +186,34 @@ if(ENABLE_PYTORCH AND NOT DEEPMD_C_ROOT) string(REGEX MATCH "_GLIBCXX_USE_CXX11_ABI=([0-9]+)" CXXABI_PT_MATCH "${TORCH_CXX_FLAGS}") if(CXXABI_PT_MATCH) + set(OP_CXX_ABI_PT ${CMAKE_MATCH_1}) message(STATUS "PyTorch CXX11 ABI: ${CMAKE_MATCH_1}") if(DEFINED OP_CXX_ABI) if(NOT ${CMAKE_MATCH_1} EQUAL ${OP_CXX_ABI}) - message( - FATAL_ERROR - "PyTorch CXX11 ABI mismatch TensorFlow: ${CMAKE_MATCH_1} != ${OP_CXX_ABI}" - ) + if(NOT BUILD_PY_IF) + message( + FATAL_ERROR + "PyTorch CXX11 ABI mismatch TensorFlow: ${CMAKE_MATCH_1} != ${OP_CXX_ABI}" + ) + else() + if(NOT BUILD_CPP_IF) + message( + STATUS + "PyTorch CXX11 ABI mismatch TensorFlow: ${CMAKE_MATCH_1} != ${OP_CXX_ABI}. " + "Try to build libraries with both ABIs.") + else() + message( + WARNING + "PyTorch CXX11 ABI mismatch TensorFlow: ${CMAKE_MATCH_1} != ${OP_CXX_ABI}. " + "PyTorch C++ OP will be built but PyTorch support for C++ libraries will be disabled. " + "Note that we don't officially support building C++ libraries in the Python package, " + "except for the wheels we officially release.") + endif() + set(DEEPMD_BUILD_COMPAT_CXXABI ON) + set(OP_CXX_ABI_COMPAT ${OP_CXX_ABI_PT}) + endif() + else() + set(DEEPMD_BUILD_COMPAT_CXXABI OFF) endif() else() set(OP_CXX_ABI ${CMAKE_MATCH_1}) diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index e2c889b3b4..43acc6b534 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -16,7 +16,7 @@ if(ENABLE_TENSORFLOW) TensorFlow::tensorflow_framework) target_compile_definitions(${libname} PRIVATE BUILD_TENSORFLOW) endif() -if(ENABLE_PYTORCH) +if(ENABLE_PYTORCH AND ${OP_CXX_ABI_PT} EQUAL ${OP_CXX_ABI}) target_link_libraries(${libname} PRIVATE "${TORCH_LIBRARIES}") target_compile_definitions(${libname} PRIVATE BUILD_PYTORCH) endif() diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 1631eb8c35..f7f262f18e 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -1,54 +1,78 @@ -# libmd set(libname ${LIB_DEEPMD}) file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) -add_library(${libname} SHARED ${LIB_SRC}) -target_include_directories( - ${libname} PUBLIC $ - $) - if(USE_CUDA_TOOLKIT) add_definitions("-DGOOGLE_CUDA") add_subdirectory(src/gpu) - set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) - target_link_libraries(${libname} PUBLIC deepmd_dyn_cudart) - target_link_libraries(${libname} INTERFACE ${EXTRA_LIBS}) - # gpu_cuda.h - target_include_directories( - ${libname} PUBLIC $ - $) endif() - if(USE_ROCM_TOOLKIT) add_definitions("-DTENSORFLOW_USE_ROCM") add_subdirectory(src/gpu) - set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) - # to define __HIP_PLATFORM_AMD__ in hip_runtime.h - target_link_libraries(${libname} PUBLIC hip::host) - target_link_libraries(${libname} INTERFACE ${EXTRA_LIBS}) - # gpu_rocm.h - target_include_directories( - ${libname} PUBLIC $ - $) endif() -set_target_properties(${libname} PROPERTIES INSTALL_RPATH $ORIGIN) +function(create_library _suffix) + set(libname_suffix "${libname}${_suffix}") + add_library(${libname_suffix} SHARED ${LIB_SRC}) + target_include_directories( + ${libname_suffix} + PUBLIC $ + $) + + if(USE_CUDA_TOOLKIT) + target_link_libraries(${libname_suffix} PUBLIC deepmd_dyn_cudart) + target_link_libraries(${libname_suffix} INTERFACE deepmd_op_cuda${_suffix}) + target_link_libraries(${libname_suffix} INTERFACE ${EXTRA_LIBS}) + # gpu_cuda.h + target_include_directories( + ${libname_suffix} PUBLIC $ + $) + endif() -if(CMAKE_TESTING_ENABLED) - target_link_libraries(${libname} PRIVATE coverage_config) + if(USE_ROCM_TOOLKIT) + # to define __HIP_PLATFORM_AMD__ in hip_runtime.h + target_link_libraries(${libname_suffix} PUBLIC hip::host) + target_link_libraries(${libname_suffix} INTERFACE deepmd_op_rocm${_suffix}) + target_link_libraries(${libname_suffix} INTERFACE ${EXTRA_LIBS}) + # gpu_rocm.h + target_include_directories( + ${libname_suffix} PUBLIC $ + $) + endif() + + set_target_properties(${libname_suffix} PROPERTIES INSTALL_RPATH $ORIGIN) + + if(CMAKE_TESTING_ENABLED) + target_link_libraries(${libname_suffix} PRIVATE coverage_config) + endif() + + if(BUILD_PY_IF) + install(TARGETS ${libname_suffix} DESTINATION deepmd/lib/) + else(BUILD_PY_IF) + install( + TARGETS ${libname_suffix} + EXPORT ${CMAKE_PROJECT_NAME}Targets + DESTINATION lib/) + endif(BUILD_PY_IF) +endfunction() + +remove_definitions(-D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}) +create_library("") +target_compile_definitions( + ${libname} + PUBLIC "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>") +if(DEEPMD_BUILD_COMPAT_CXXABI) + create_library("_compat_cxxabi") + target_compile_definitions( + ${libname}_compat_cxxabi + PUBLIC + "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI_COMPAT}>") endif() -if(BUILD_PY_IF) - install(TARGETS ${libname} DESTINATION deepmd/lib/) -else(BUILD_PY_IF) - install( - TARGETS ${libname} - EXPORT ${CMAKE_PROJECT_NAME}Targets - DESTINATION lib/) +if(NOT BUILD_PY_IF) install(FILES ${INC_SRC} DESTINATION include/deepmd) -endif(BUILD_PY_IF) +endif() if(BUILD_CPP_IF AND CMAKE_TESTING_ENABLED) add_subdirectory(tests) diff --git a/source/lib/src/gpu/CMakeLists.txt b/source/lib/src/gpu/CMakeLists.txt index 804e1c0506..39869fe719 100644 --- a/source/lib/src/gpu/CMakeLists.txt +++ b/source/lib/src/gpu/CMakeLists.txt @@ -10,10 +10,6 @@ if(USE_CUDA_TOOLKIT) endif() enable_language(CUDA) set(CMAKE_CUDA_STANDARD 11) - if(DEFINED OP_CXX_ABI) - add_compile_definitions( - "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>") - endif() find_package(CUDAToolkit REQUIRED) @@ -55,9 +51,6 @@ if(USE_CUDA_TOOLKIT) file(GLOB SOURCE_FILES "*.cu") - add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES}) - target_link_libraries(${GPU_LIB_NAME} PRIVATE deepmd_dyn_cudart) - elseif(USE_ROCM_TOOLKIT) # required cmake version @@ -88,26 +81,61 @@ elseif(USE_ROCM_TOOLKIT) # -fpic set_property(TARGET ${GPU_LIB_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries(${GPU_LIB_NAME} PRIVATE hip::hipcub) - + set_source_files_properties(${SOURCE_FILES} PROPERTIES LANGUAGE HIP) endif() -target_include_directories( +function(create_gpu_lib _suffix) + set(GPU_LIB_NAME_SUFFIX ${GPU_LIB_NAME}${_suffix}) + add_library(${GPU_LIB_NAME_SUFFIX} SHARED ${SOURCE_FILES}) + + if(USE_CUDA_TOOLKIT) + target_link_libraries(${GPU_LIB_NAME_SUFFIX} PRIVATE deepmd_dyn_cudart) + + elseif(USE_ROCM_TOOLKIT) + add_library(${GPU_LIB_NAME_SUFFIX} SHARED ${SOURCE_FILES}) + # -fpic + set_property(TARGET ${GPU_LIB_NAME_SUFFIX} + PROPERTY POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${GPU_LIB_NAME_SUFFIX} PRIVATE hip::hipcub) + endif() + + target_include_directories( + ${GPU_LIB_NAME_SUFFIX} + PUBLIC $ + $) + target_precompile_headers(${GPU_LIB_NAME_SUFFIX} PUBLIC [["device.h"]]) + if(APPLE) + set_target_properties(${GPU_LIB_NAME_SUFFIX} PROPERTIES INSTALL_RPATH + @loader_path) + else() + set_target_properties(${GPU_LIB_NAME_SUFFIX} PROPERTIES INSTALL_RPATH + "$ORIGIN") + endif() + + if(BUILD_CPP_IF AND NOT BUILD_PY_IF) + install( + TARGETS ${GPU_LIB_NAME_SUFFIX} + EXPORT ${CMAKE_PROJECT_NAME}Targets + DESTINATION lib/) + endif(BUILD_CPP_IF AND NOT BUILD_PY_IF) + if(BUILD_PY_IF) + install(TARGETS ${GPU_LIB_NAME_SUFFIX} DESTINATION deepmd/lib/) + endif(BUILD_PY_IF) +endfunction() + +remove_definitions(-D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}) +create_gpu_lib("") +target_compile_definitions( ${GPU_LIB_NAME} - PUBLIC $ - $) -target_precompile_headers(${GPU_LIB_NAME} PUBLIC [["device.h"]]) -if(APPLE) - set_target_properties(${GPU_LIB_NAME} PROPERTIES INSTALL_RPATH @loader_path) -else() - set_target_properties(${GPU_LIB_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN") + PUBLIC "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>" + "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>" + "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>") +if(DEEPMD_BUILD_COMPAT_CXXABI) + create_gpu_lib("_compat_cxxabi") + target_compile_definitions( + ${GPU_LIB_NAME}_compat_cxxabi + PUBLIC + "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI_COMPAT}>" + "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI_COMPAT}>" + "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI_COMPAT}>") endif() - -if(BUILD_CPP_IF AND NOT BUILD_PY_IF) - install( - TARGETS ${GPU_LIB_NAME} - EXPORT ${CMAKE_PROJECT_NAME}Targets - DESTINATION lib/) -endif(BUILD_CPP_IF AND NOT BUILD_PY_IF) -if(BUILD_PY_IF) - install(TARGETS ${GPU_LIB_NAME} DESTINATION deepmd/lib/) -endif(BUILD_PY_IF) diff --git a/source/op/pt/CMakeLists.txt b/source/op/pt/CMakeLists.txt index 3254e5e852..11b141a3bf 100644 --- a/source/op/pt/CMakeLists.txt +++ b/source/op/pt/CMakeLists.txt @@ -2,7 +2,16 @@ file(GLOB OP_SRC print_summary.cc comm.cc) add_library(deepmd_op_pt MODULE ${OP_SRC}) # link: libdeepmd libtorch -target_link_libraries(deepmd_op_pt PRIVATE ${TORCH_LIBRARIES} ${LIB_DEEPMD}) +target_link_libraries(deepmd_op_pt PRIVATE ${TORCH_LIBRARIES}) +if(${OP_CXX_ABI_PT} EQUAL ${OP_CXX_ABI}) + target_link_libraries(deepmd_op_pt PRIVATE ${LIB_DEEPMD}) +else() + target_link_libraries(deepmd_op_pt PRIVATE ${LIB_DEEPMD}_compat_cxxabi) +endif() +remove_definitions(-D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}) +target_compile_definitions( + deepmd_op_pt + PUBLIC "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI_PT}>") if(APPLE) set_target_properties(deepmd_op_pt PROPERTIES INSTALL_RPATH "@loader_path") else() From ca3497bbdbbf7f7d8252e2e4d5398f3106f3ba37 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 18:14:28 -0400 Subject: [PATCH 02/16] enable pytorch by default; remove the workaround Signed-off-by: Jinzhe Zeng --- backend/find_pytorch.py | 2 +- backend/read_env.py | 2 +- deepmd/pt/model/descriptor/repformers.py | 20 -------------------- doc/install/install-from-source.md | 2 +- 4 files changed, 3 insertions(+), 23 deletions(-) diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py index f039b6f289..4f945dae61 100644 --- a/backend/find_pytorch.py +++ b/backend/find_pytorch.py @@ -40,7 +40,7 @@ def find_pytorch() -> Optional[str]: str, optional PyTorch library path if found. """ - if os.environ.get("DP_ENABLE_PYTORCH", "0") == "0": + if os.environ.get("DP_ENABLE_PYTORCH", "1") == "0": return None pt_spec = None diff --git a/backend/read_env.py b/backend/read_env.py index 14935dcc0f..ae252b3ad6 100644 --- a/backend/read_env.py +++ b/backend/read_env.py @@ -102,7 +102,7 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]: cmake_args.append("-DENABLE_TENSORFLOW=OFF") tf_version = None - if os.environ.get("DP_ENABLE_PYTORCH", "0") == "1": + if os.environ.get("DP_ENABLE_PYTORCH", "1") == "1": pt_install_dir = find_pytorch() if pt_install_dir is None: raise RuntimeError("Cannot find installed PyTorch.") diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py index 8653d79140..3d427e3f74 100644 --- a/deepmd/pt/model/descriptor/repformers.py +++ b/deepmd/pt/model/descriptor/repformers.py @@ -46,26 +46,6 @@ ) from .repformer_layer_old_impl import RepformerLayer as RepformerLayerOld -if not hasattr(torch.ops.deepmd, "border_op"): - - def border_op( - argument0, - argument1, - argument2, - argument3, - argument4, - argument5, - argument6, - argument7, - argument8, - ) -> torch.Tensor: - raise NotImplementedError( - "border_op is not available since customized PyTorch OP library is not built when freezing the model." - ) - - # Note: this hack cannot actually save a model that can be runned using LAMMPS. - torch.ops.deepmd.border_op = border_op - @DescriptorBlock.register("se_repformer") @DescriptorBlock.register("se_uni") diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index 5195992853..abdc2f7852 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -145,7 +145,7 @@ One may set the following environment variables before executing `pip`: | CUDAToolkit_ROOT | Path | Detected automatically | The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. | | ROCM_ROOT | Path | Detected automatically | The path to the ROCM toolkit directory. | | DP_ENABLE_TENSORFLOW | 0, 1 | 1 | {{ tensorflow_icon }} Enable the TensorFlow backend. | -| DP_ENABLE_PYTORCH | 0, 1 | 0 | {{ pytorch_icon }} Enable customized C++ OPs for the PyTorch backend. PyTorch can still run without customized C++ OPs, but features will be limited. | +| DP_ENABLE_PYTORCH | 0, 1 | 1 | {{ pytorch_icon }} Enable customized C++ OPs for the PyTorch backend. PyTorch can still run without customized C++ OPs, but features will be limited. | | TENSORFLOW_ROOT | Path | Detected automatically | {{ tensorflow_icon }} The path to TensorFlow Python library. By default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against. | | DP_ENABLE_NATIVE_OPTIMIZATION | 0, 1 | 0 | Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. | | CMAKE_ARGS | str | - | Additional CMake arguments | From 593c94dc04a14060378ff981c030be99da6e5e73 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 18:16:27 -0400 Subject: [PATCH 03/16] update documentation Signed-off-by: Jinzhe Zeng --- doc/install/install-from-source.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index abdc2f7852..b3f76272e0 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -120,12 +120,8 @@ Note that TensorFlow may have specific requirements for the compiler version to :::{tab-item} PyTorch {{ pytorch_icon }} -You can set the environment variable `export DP_ENABLE_PYTORCH=1` to enable customized C++ OPs in the PyTorch backend. Note that PyTorch may have specific requirements for the compiler version to support the C++ standard version and [`_GLIBCXX_USE_CXX11_ABI`](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) used by PyTorch. -The customized C++ OPs are not enabled by default because TensorFlow and PyTorch packages from the PyPI use different `_GLIBCXX_USE_CXX11_ABI` flags. -We recommend conda-forge packages in this case. - ::: :::: From 1164aabc70145e9dd0004a61638c6f54ddb04228 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 18:18:12 -0400 Subject: [PATCH 04/16] force pytorch customized ops to be compiled Signed-off-by: Jinzhe Zeng --- deepmd/pt/cxx_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/pt/cxx_op.py b/deepmd/pt/cxx_op.py index 7887b5722c..9b86b0b9b1 100644 --- a/deepmd/pt/cxx_op.py +++ b/deepmd/pt/cxx_op.py @@ -33,7 +33,7 @@ def load_library(module_name: str) -> bool: if module_file.is_file(): torch.ops.load_library(module_file) return True - return False + raise RuntimeError("The PyTorch backend is not enabled.") ENABLE_CUSTOMIZED_OP = load_library("deepmd_op_pt") From ebb1a873bc2cdc48bf3fac3193741123c35c8bfd Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 18:31:03 -0400 Subject: [PATCH 05/16] automatically set pytorch requires Signed-off-by: Jinzhe Zeng --- backend/dp_backend.py | 15 +++++++++++++-- backend/find_pytorch.py | 40 +++++++++++++++++++++++++++++++++++++--- backend/read_env.py | 2 +- pyproject.toml | 3 --- 4 files changed, 51 insertions(+), 9 deletions(-) diff --git a/backend/dp_backend.py b/backend/dp_backend.py index 2ca0ff2f93..dbd2d2a52b 100644 --- a/backend/dp_backend.py +++ b/backend/dp_backend.py @@ -7,6 +7,9 @@ from scikit_build_core import build as _orig +from .find_pytorch import ( + find_pytorch, +) from .find_tensorflow import ( find_tensorflow, ) @@ -40,10 +43,18 @@ def __dir__() -> List[str]: def get_requires_for_build_wheel( config_settings: dict, ) -> List[str]: - return _orig.get_requires_for_build_wheel(config_settings) + find_tensorflow()[1] + return ( + _orig.get_requires_for_build_wheel(config_settings) + + find_tensorflow()[1] + + find_pytorch()[1] + ) def get_requires_for_build_editable( config_settings: dict, ) -> List[str]: - return _orig.get_requires_for_build_editable(config_settings) + find_tensorflow()[1] + return ( + _orig.get_requires_for_build_editable(config_settings) + + find_tensorflow()[1] + + find_pytorch()[1] + ) diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py index 4f945dae61..23409d5afb 100644 --- a/backend/find_pytorch.py +++ b/backend/find_pytorch.py @@ -17,12 +17,14 @@ get_path, ) from typing import ( + List, Optional, + Tuple, ) @lru_cache -def find_pytorch() -> Optional[str]: +def find_pytorch() -> Tuple[Optional[str], List[str]]: """Find PyTorch library. Tries to find PyTorch in the order of: @@ -39,9 +41,12 @@ def find_pytorch() -> Optional[str]: ------- str, optional PyTorch library path if found. + list of str + TensorFlow requirement if not found. Empty if found. """ if os.environ.get("DP_ENABLE_PYTORCH", "1") == "0": - return None + return None, [] + requires = [] pt_spec = None if (pt_spec is None or not pt_spec) and os.environ.get("PYTORCH_ROOT") is not None: @@ -73,4 +78,33 @@ def find_pytorch() -> Optional[str]: # IndexError if submodule_search_locations is an empty list except (AttributeError, TypeError, IndexError): pt_install_dir = None - return pt_install_dir + requires.extend(get_pt_requirement()["torch"]) + return pt_install_dir, requires + + +@lru_cache +def get_pt_requirement(pt_version: str = "") -> dict: + """Get PyTorch requirement when PT is not installed. + + If pt_version is not given and the environment variable `PYTORCH_VERSION` is set, use it as the requirement. + + Parameters + ---------- + pt_version : str, optional + PT version + + Returns + ------- + dict + PyTorch requirement. + """ + if pt_version is None: + return {"torch": []} + if pt_version == "": + pt_version = os.environ.get("PYTORCH_VERSION", "") + + return { + "torch": [ + f"torch=={pt_version}" if pt_version != "" else "torch>=2a", + ], + } diff --git a/backend/read_env.py b/backend/read_env.py index ae252b3ad6..3b9557ccd0 100644 --- a/backend/read_env.py +++ b/backend/read_env.py @@ -103,7 +103,7 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]: tf_version = None if os.environ.get("DP_ENABLE_PYTORCH", "1") == "1": - pt_install_dir = find_pytorch() + pt_install_dir, _ = find_pytorch() if pt_install_dir is None: raise RuntimeError("Cannot find installed PyTorch.") cmake_args.extend( diff --git a/pyproject.toml b/pyproject.toml index 2cb489ce43..39ade063b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -126,9 +126,6 @@ cu12 = [ "nvidia-cudnn-cu12<9", "nvidia-cuda-nvcc-cu12", ] -torch = [ - "torch>=2a", -] [tool.deepmd_build_backend.scripts] dp = "deepmd.main:main" From 2e9c0d4eb6bb1fc8a9483566ca8646f3ccd63eea Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 18:39:40 -0400 Subject: [PATCH 06/16] fix typos Signed-off-by: Jinzhe Zeng --- source/lib/src/gpu/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/source/lib/src/gpu/CMakeLists.txt b/source/lib/src/gpu/CMakeLists.txt index 39869fe719..d6677b86a3 100644 --- a/source/lib/src/gpu/CMakeLists.txt +++ b/source/lib/src/gpu/CMakeLists.txt @@ -76,7 +76,6 @@ elseif(USE_ROCM_TOOLKIT) file(GLOB SOURCE_FILES "*.cu") - add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES}) set_source_files_properties(${SOURCE_FILES} PROPERTIES LANGUAGE HIP) # -fpic set_property(TARGET ${GPU_LIB_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) From 32831df14b3c1379307d2d1c0a724580d2efdeb1 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 18:41:02 -0400 Subject: [PATCH 07/16] fix the situation when pt is not enabled Signed-off-by: Jinzhe Zeng --- source/api_cc/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index 43acc6b534..6a60a91b57 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -16,7 +16,7 @@ if(ENABLE_TENSORFLOW) TensorFlow::tensorflow_framework) target_compile_definitions(${libname} PRIVATE BUILD_TENSORFLOW) endif() -if(ENABLE_PYTORCH AND ${OP_CXX_ABI_PT} EQUAL ${OP_CXX_ABI}) +if(ENABLE_PYTORCH AND "${OP_CXX_ABI_PT}" EQUAL "${OP_CXX_ABI}") target_link_libraries(${libname} PRIVATE "${TORCH_LIBRARIES}") target_compile_definitions(${libname} PRIVATE BUILD_PYTORCH) endif() From 60b690392b8b5ed7c9c8af811046490dcba24380 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 18:44:53 -0400 Subject: [PATCH 08/16] remove the error message when PT is not preinstalled Signed-off-by: Jinzhe Zeng --- backend/read_env.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/read_env.py b/backend/read_env.py index 3b9557ccd0..740d094905 100644 --- a/backend/read_env.py +++ b/backend/read_env.py @@ -104,8 +104,6 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]: if os.environ.get("DP_ENABLE_PYTORCH", "1") == "1": pt_install_dir, _ = find_pytorch() - if pt_install_dir is None: - raise RuntimeError("Cannot find installed PyTorch.") cmake_args.extend( [ "-DENABLE_PYTORCH=ON", From 3b182b9e718809abb95c9de5948f93025866adb5 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 18:52:14 -0400 Subject: [PATCH 09/16] cleanup Signed-off-by: Jinzhe Zeng --- source/lib/src/gpu/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/source/lib/src/gpu/CMakeLists.txt b/source/lib/src/gpu/CMakeLists.txt index d6677b86a3..c50ded29a9 100644 --- a/source/lib/src/gpu/CMakeLists.txt +++ b/source/lib/src/gpu/CMakeLists.txt @@ -77,10 +77,6 @@ elseif(USE_ROCM_TOOLKIT) file(GLOB SOURCE_FILES "*.cu") set_source_files_properties(${SOURCE_FILES} PROPERTIES LANGUAGE HIP) - # -fpic - set_property(TARGET ${GPU_LIB_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${GPU_LIB_NAME} PRIVATE hip::hipcub) - set_source_files_properties(${SOURCE_FILES} PROPERTIES LANGUAGE HIP) endif() function(create_gpu_lib _suffix) From c6d126bb5c0e71ce32518d192ada3827597def33 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 19:35:08 -0400 Subject: [PATCH 10/16] Revert "remove the error message when PT is not preinstalled" This reverts commit 60b690392b8b5ed7c9c8af811046490dcba24380. --- backend/read_env.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/read_env.py b/backend/read_env.py index 740d094905..3b9557ccd0 100644 --- a/backend/read_env.py +++ b/backend/read_env.py @@ -104,6 +104,8 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]: if os.environ.get("DP_ENABLE_PYTORCH", "1") == "1": pt_install_dir, _ = find_pytorch() + if pt_install_dir is None: + raise RuntimeError("Cannot find installed PyTorch.") cmake_args.extend( [ "-DENABLE_PYTORCH=ON", From a28d3cede98177f2b462259067f099e613e41872 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 19:35:09 -0400 Subject: [PATCH 11/16] Revert "automatically set pytorch requires" This reverts commit ebb1a873bc2cdc48bf3fac3193741123c35c8bfd. --- backend/dp_backend.py | 15 ++------------- backend/find_pytorch.py | 40 +++------------------------------------- backend/read_env.py | 2 +- pyproject.toml | 3 +++ 4 files changed, 9 insertions(+), 51 deletions(-) diff --git a/backend/dp_backend.py b/backend/dp_backend.py index dbd2d2a52b..2ca0ff2f93 100644 --- a/backend/dp_backend.py +++ b/backend/dp_backend.py @@ -7,9 +7,6 @@ from scikit_build_core import build as _orig -from .find_pytorch import ( - find_pytorch, -) from .find_tensorflow import ( find_tensorflow, ) @@ -43,18 +40,10 @@ def __dir__() -> List[str]: def get_requires_for_build_wheel( config_settings: dict, ) -> List[str]: - return ( - _orig.get_requires_for_build_wheel(config_settings) - + find_tensorflow()[1] - + find_pytorch()[1] - ) + return _orig.get_requires_for_build_wheel(config_settings) + find_tensorflow()[1] def get_requires_for_build_editable( config_settings: dict, ) -> List[str]: - return ( - _orig.get_requires_for_build_editable(config_settings) - + find_tensorflow()[1] - + find_pytorch()[1] - ) + return _orig.get_requires_for_build_editable(config_settings) + find_tensorflow()[1] diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py index 23409d5afb..4f945dae61 100644 --- a/backend/find_pytorch.py +++ b/backend/find_pytorch.py @@ -17,14 +17,12 @@ get_path, ) from typing import ( - List, Optional, - Tuple, ) @lru_cache -def find_pytorch() -> Tuple[Optional[str], List[str]]: +def find_pytorch() -> Optional[str]: """Find PyTorch library. Tries to find PyTorch in the order of: @@ -41,12 +39,9 @@ def find_pytorch() -> Tuple[Optional[str], List[str]]: ------- str, optional PyTorch library path if found. - list of str - TensorFlow requirement if not found. Empty if found. """ if os.environ.get("DP_ENABLE_PYTORCH", "1") == "0": - return None, [] - requires = [] + return None pt_spec = None if (pt_spec is None or not pt_spec) and os.environ.get("PYTORCH_ROOT") is not None: @@ -78,33 +73,4 @@ def find_pytorch() -> Tuple[Optional[str], List[str]]: # IndexError if submodule_search_locations is an empty list except (AttributeError, TypeError, IndexError): pt_install_dir = None - requires.extend(get_pt_requirement()["torch"]) - return pt_install_dir, requires - - -@lru_cache -def get_pt_requirement(pt_version: str = "") -> dict: - """Get PyTorch requirement when PT is not installed. - - If pt_version is not given and the environment variable `PYTORCH_VERSION` is set, use it as the requirement. - - Parameters - ---------- - pt_version : str, optional - PT version - - Returns - ------- - dict - PyTorch requirement. - """ - if pt_version is None: - return {"torch": []} - if pt_version == "": - pt_version = os.environ.get("PYTORCH_VERSION", "") - - return { - "torch": [ - f"torch=={pt_version}" if pt_version != "" else "torch>=2a", - ], - } + return pt_install_dir diff --git a/backend/read_env.py b/backend/read_env.py index 3b9557ccd0..ae252b3ad6 100644 --- a/backend/read_env.py +++ b/backend/read_env.py @@ -103,7 +103,7 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]: tf_version = None if os.environ.get("DP_ENABLE_PYTORCH", "1") == "1": - pt_install_dir, _ = find_pytorch() + pt_install_dir = find_pytorch() if pt_install_dir is None: raise RuntimeError("Cannot find installed PyTorch.") cmake_args.extend( diff --git a/pyproject.toml b/pyproject.toml index 39ade063b4..2cb489ce43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -126,6 +126,9 @@ cu12 = [ "nvidia-cudnn-cu12<9", "nvidia-cuda-nvcc-cu12", ] +torch = [ + "torch>=2a", +] [tool.deepmd_build_backend.scripts] dp = "deepmd.main:main" From 7ee15c09af1a8e4923f90bba422d1224624d737d Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 19:35:10 -0400 Subject: [PATCH 12/16] Revert "force pytorch customized ops to be compiled" This reverts commit 1164aabc70145e9dd0004a61638c6f54ddb04228. --- deepmd/pt/cxx_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/pt/cxx_op.py b/deepmd/pt/cxx_op.py index 9b86b0b9b1..7887b5722c 100644 --- a/deepmd/pt/cxx_op.py +++ b/deepmd/pt/cxx_op.py @@ -33,7 +33,7 @@ def load_library(module_name: str) -> bool: if module_file.is_file(): torch.ops.load_library(module_file) return True - raise RuntimeError("The PyTorch backend is not enabled.") + return False ENABLE_CUSTOMIZED_OP = load_library("deepmd_op_pt") From deda1f5099ba8fa59a1e15a9bcc244d3ce0f7776 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 19:35:11 -0400 Subject: [PATCH 13/16] Revert "enable pytorch by default; remove the workaround" This reverts commit ca3497bbdbbf7f7d8252e2e4d5398f3106f3ba37. --- backend/find_pytorch.py | 2 +- backend/read_env.py | 2 +- deepmd/pt/model/descriptor/repformers.py | 20 ++++++++++++++++++++ doc/install/install-from-source.md | 2 +- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py index 4f945dae61..f039b6f289 100644 --- a/backend/find_pytorch.py +++ b/backend/find_pytorch.py @@ -40,7 +40,7 @@ def find_pytorch() -> Optional[str]: str, optional PyTorch library path if found. """ - if os.environ.get("DP_ENABLE_PYTORCH", "1") == "0": + if os.environ.get("DP_ENABLE_PYTORCH", "0") == "0": return None pt_spec = None diff --git a/backend/read_env.py b/backend/read_env.py index ae252b3ad6..14935dcc0f 100644 --- a/backend/read_env.py +++ b/backend/read_env.py @@ -102,7 +102,7 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]: cmake_args.append("-DENABLE_TENSORFLOW=OFF") tf_version = None - if os.environ.get("DP_ENABLE_PYTORCH", "1") == "1": + if os.environ.get("DP_ENABLE_PYTORCH", "0") == "1": pt_install_dir = find_pytorch() if pt_install_dir is None: raise RuntimeError("Cannot find installed PyTorch.") diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py index 3d427e3f74..8653d79140 100644 --- a/deepmd/pt/model/descriptor/repformers.py +++ b/deepmd/pt/model/descriptor/repformers.py @@ -46,6 +46,26 @@ ) from .repformer_layer_old_impl import RepformerLayer as RepformerLayerOld +if not hasattr(torch.ops.deepmd, "border_op"): + + def border_op( + argument0, + argument1, + argument2, + argument3, + argument4, + argument5, + argument6, + argument7, + argument8, + ) -> torch.Tensor: + raise NotImplementedError( + "border_op is not available since customized PyTorch OP library is not built when freezing the model." + ) + + # Note: this hack cannot actually save a model that can be runned using LAMMPS. + torch.ops.deepmd.border_op = border_op + @DescriptorBlock.register("se_repformer") @DescriptorBlock.register("se_uni") diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index b3f76272e0..404f3ab114 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -141,7 +141,7 @@ One may set the following environment variables before executing `pip`: | CUDAToolkit_ROOT | Path | Detected automatically | The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. | | ROCM_ROOT | Path | Detected automatically | The path to the ROCM toolkit directory. | | DP_ENABLE_TENSORFLOW | 0, 1 | 1 | {{ tensorflow_icon }} Enable the TensorFlow backend. | -| DP_ENABLE_PYTORCH | 0, 1 | 1 | {{ pytorch_icon }} Enable customized C++ OPs for the PyTorch backend. PyTorch can still run without customized C++ OPs, but features will be limited. | +| DP_ENABLE_PYTORCH | 0, 1 | 0 | {{ pytorch_icon }} Enable customized C++ OPs for the PyTorch backend. PyTorch can still run without customized C++ OPs, but features will be limited. | | TENSORFLOW_ROOT | Path | Detected automatically | {{ tensorflow_icon }} The path to TensorFlow Python library. By default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against. | | DP_ENABLE_NATIVE_OPTIMIZATION | 0, 1 | 0 | Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. | | CMAKE_ARGS | str | - | Additional CMake arguments | From c85b559b5bf047aa97f9b0fba613eed1e55b7a5c Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 19:39:51 -0400 Subject: [PATCH 14/16] remove duplicated add_library --- source/lib/src/gpu/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/source/lib/src/gpu/CMakeLists.txt b/source/lib/src/gpu/CMakeLists.txt index c50ded29a9..0d176dc320 100644 --- a/source/lib/src/gpu/CMakeLists.txt +++ b/source/lib/src/gpu/CMakeLists.txt @@ -87,7 +87,6 @@ function(create_gpu_lib _suffix) target_link_libraries(${GPU_LIB_NAME_SUFFIX} PRIVATE deepmd_dyn_cudart) elseif(USE_ROCM_TOOLKIT) - add_library(${GPU_LIB_NAME_SUFFIX} SHARED ${SOURCE_FILES}) # -fpic set_property(TARGET ${GPU_LIB_NAME_SUFFIX} PROPERTY POSITION_INDEPENDENT_CODE ON) From 83d71acede77bb774eec9f40a8da44fbadcec7ef Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 19:41:54 -0400 Subject: [PATCH 15/16] revert doc changes --- doc/install/install-from-source.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index 404f3ab114..9b2cf27be2 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -120,6 +120,7 @@ Note that TensorFlow may have specific requirements for the compiler version to :::{tab-item} PyTorch {{ pytorch_icon }} +You can set the environment variable `export DP_ENABLE_PYTORCH=1` to enable customized C++ OPs in the PyTorch backend. Note that PyTorch may have specific requirements for the compiler version to support the C++ standard version and [`_GLIBCXX_USE_CXX11_ABI`](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) used by PyTorch. ::: From ee0f409f13e03b96adc98500431b4cbf00243ad8 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 19:44:07 -0400 Subject: [PATCH 16/16] enable pytorch ops in cuda test --- .github/workflows/test_cuda.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index d97b1f9431..81ec974e33 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -49,11 +49,13 @@ jobs: - run: python -m pip install -U uv - run: python -m uv pip install --system "tensorflow>=2.15.0rc0" "torch>=2.2.0" - run: | + export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])') export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') python -m uv pip install --system -v -e .[gpu,test,lmp,cu12,torch] mpi4py env: DP_VARIANT: cuda DP_ENABLE_NATIVE_OPTIMIZATION: 1 + DP_ENABLE_PYTORCH: 1 - run: dp --version - run: python -m pytest source/tests --durations=0 env: