diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 0d1c179..445f266 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -51,7 +51,8 @@ jobs: -G Ninja -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_UNITY_BUILD=${{ matrix.build_type == 'Debug' || matrix.valgrind }} - -DTAPP_REFERENCE_ENABLE_TBLIS=${{ !matrix.valgrind }} + -DTAPP_REFERENCE_USE_TBLIS=${{ !matrix.valgrind }} + steps: - uses: actions/checkout@v4 @@ -90,6 +91,7 @@ jobs: run: | sudo apt-get update sudo apt-get install ninja-build g++-14 liblapack-dev ccache valgrind + - name: Prepare ccache timestamp id: ccache_cache_timestamp shell: cmake -P {0} @@ -136,8 +138,8 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash run: | - valgrind --error-exitcode=1 --leak-check=full ./tapp-reference-demo - valgrind --error-exitcode=1 --leak-check=full ./tapp-reference-driver + valgrind --error-exitcode=1 --leak-check=full ./test/tapp-reference-demo + valgrind --error-exitcode=1 --leak-check=full ./examples/tapp-reference-driver - name: Consume from build tree if: ${{ !matrix.valgrind && !matrix.sanitize }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 107c6ad..b79ff68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,8 +39,8 @@ project(tapp HOMEPAGE_URL "https://github.com/TAPPOrg/") # TBLIS requires CXX; enable_language must be called at the top level -option(TAPP_REFERENCE_ENABLE_TBLIS "Build and link TBLIS and TBLIS bindings" OFF) -if(TAPP_REFERENCE_ENABLE_TBLIS) +option(TAPP_REFERENCE_USE_TBLIS "TAPP-Reference will use TBLIS to implement TAPP_product" OFF) +if(TAPP_REFERENCE_USE_TBLIS) include(CheckLanguage) check_language(CXX) if(CMAKE_CXX_COMPILER) @@ -65,207 +65,39 @@ set(TAPP_INSTALL_DATADIR "share/tapp/${TAPP_EXT_VERSION}/data" set(TAPP_INSTALL_DOCDIR "share/tapp/${TAPP_EXT_VERSION}/doc" CACHE PATH "TAPP doc install directory") -# this provides tapp-api target +# this provides tapp::api target add_subdirectory(api) -# this 
provides tapp-reference target +# this provides tapp::reference target add_subdirectory(reference_implementation) # ---------------------------------------------------------------------------- -# testing - -include(CTest) - -if(BUILD_TESTING) - - # ---------------------------------------------------------------------------- - # TBLIS test - - if(TAPP_REFERENCE_ENABLE_TBLIS) - add_executable(tapp-reference-test++) - - target_sources( - tapp-reference-test++ - PRIVATE - test/test.cpp - test/test.h - ) - - target_link_libraries( - tapp-reference-test++ - PRIVATE - tapp-reference - tblis-static - ) - - set_property( - TARGET tapp-reference-test++ - PROPERTY - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED YES - CXX_EXTENSIONS NO - ) - - add_test( - NAME tapp-reference-test++ - COMMAND $ - ) - endif() - - # ---------------------------------------------------------------------------- - # demo - - add_executable(tapp-reference-demo) - - target_sources( - tapp-reference-demo - PRIVATE - test/demo.c - test/helpers.c - test/helpers.h - ) - - target_link_libraries( - tapp-reference-demo - PRIVATE - tapp-reference - ) - - add_test( - NAME tapp-reference-demo - COMMAND $ - ) - - # ---------------------------------------------------------------------------- - # driver - - add_executable(tapp-reference-driver) - - target_sources( - tapp-reference-driver - PRIVATE - examples/driver/driver.c - test/helpers.c - test/helpers.h - ) - - target_include_directories( - tapp-reference-driver - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/test - ) - - target_link_libraries( - tapp-reference-driver - PRIVATE - tapp-reference - ) - - add_test( - NAME tapp-reference-driver - COMMAND $ - ) - - # ---------------------------------------------------------------------------- - # exercise: contraction - - if(TAPP_BUILD_EXERCISE) - add_executable(tapp-reference-exercise_contraction) - - target_sources( - tapp-reference-exercise_contraction - PRIVATE - examples/exercise_contraction/exercise_contraction.c - 
test/helpers.c - test/helpers.h - ) - - target_include_directories( - tapp-reference-exercise_contraction - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/test - ) - - target_link_libraries( - tapp-reference-exercise_contraction - PRIVATE - tapp-reference - ) - - add_test( - NAME tapp-reference-exercise_contraction - COMMAND $ - ) +# cutensor bindings +option(TAPP_CUTENSOR "Build cuTensor bindings" OFF) +if (TAPP_CUTENSOR) + # enable_language must be called at the top level + include(CheckLanguage) + check_language(CXX) + if(CMAKE_CXX_COMPILER) + enable_language(CXX) + else() + message(FATAL_ERROR "Cannot build cuTENSOR bindings due to missing CXX language support") endif() + # since CUDAToolkit will be needed in tests/ also, load it here + cmake_minimum_required(VERSION 3.17) # CUDAToolkit + find_package(CUDAToolkit REQUIRED) - # ---------------------------------------------------------------------------- - # exercise: contraction answers - - add_executable(tapp-reference-exercise_contraction_answers) - - target_sources( - tapp-reference-exercise_contraction_answers - PRIVATE - examples/exercise_contraction/answers/exercise_contraction_answers.c - test/helpers.c - test/helpers.h - ) - - target_include_directories( - tapp-reference-exercise_contraction_answers - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/test - ) - - target_link_libraries( - tapp-reference-exercise_contraction_answers - PRIVATE - tapp-reference - ) - - add_test( - NAME tapp-reference-exercise_contraction_answers - COMMAND $ - ) - - # ---------------------------------------------------------------------------- - # exercise: tucker - - add_library(tapp-reference-exercise_tucker SHARED) - - target_sources( - tapp-reference-exercise_tucker - PUBLIC - examples/exercise_tucker/tapp_tucker/exercise_tucker.h - PRIVATE - examples/exercise_tucker/tapp_tucker/exercise_tucker.c - ) - - target_link_libraries( - tapp-reference-exercise_tucker - PRIVATE - tapp-reference - ) - - # 
---------------------------------------------------------------------------- - # exercise: tucker answers - - add_library(tapp-reference-exercise_tucker_answers SHARED) + add_subdirectory(cutensor_bindings) +endif() - target_sources( - tapp-reference-exercise_tucker_answers - PUBLIC - examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.h - PRIVATE - examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c - ) +# ---------------------------------------------------------------------------- +# testing - target_link_libraries( - tapp-reference-exercise_tucker_answers - PRIVATE - tapp-reference - ) +include(CTest) +if(BUILD_TESTING) + add_subdirectory(test) + add_subdirectory(examples) endif() # ============================================================================ diff --git a/api/include/tapp/tensor.h b/api/include/tapp/tensor.h index 68bf287..113022d 100644 --- a/api/include/tapp/tensor.h +++ b/api/include/tapp/tensor.h @@ -3,6 +3,7 @@ #include +#include "handle.h" #include "util.h" #include "error.h" #include "datatype.h" @@ -20,6 +21,7 @@ typedef intptr_t TAPP_tensor_info; */ TAPP_EXPORT TAPP_error TAPP_create_tensor_info(TAPP_tensor_info* info, + TAPP_handle handle, TAPP_datatype type, int nmode, const int64_t* extents,
diff --git a/cutensor_bindings/CMakeLists.txt b/cutensor_bindings/CMakeLists.txt
new file mode 100644
index 0000000..08dbf6f
--- /dev/null
+++ b/cutensor_bindings/CMakeLists.txt
@@ -0,0 +1,75 @@
+# cuTENSOR is not part of the CUDA toolkit; look for it separately
+if(NOT TARGET cutensor::cutensor)
+  find_path(CUTENSOR_INCLUDE_DIR
+    NAMES cutensor.h
+    HINTS ${CUTENSOR_ROOT} ENV CUTENSOR_ROOT
+          ${CUDAToolkit_LIBRARY_ROOT}
+    PATH_SUFFIXES include
+  )
+  find_library(CUTENSOR_LIBRARY
+    NAMES cutensor
+    HINTS ${CUTENSOR_ROOT} ENV CUTENSOR_ROOT
+          ${CUDAToolkit_LIBRARY_ROOT}
+    PATH_SUFFIXES lib lib64 lib/${CMAKE_LIBRARY_ARCHITECTURE}
+  )
+
+  if(NOT CUTENSOR_INCLUDE_DIR OR NOT CUTENSOR_LIBRARY)
+    message(FATAL_ERROR "cuTENSOR not found; set CUTENSOR_ROOT to the cuTENSOR installation prefix")
+  endif()
+  message(STATUS "Found cuTENSOR: ${CUTENSOR_LIBRARY}")
+  message(STATUS "cuTENSOR include: ${CUTENSOR_INCLUDE_DIR}")
+
+  # Wrap the library in an imported target. cudart is recorded as a usage
+  # requirement of the target created here, so a cutensor::cutensor target
+  # that was supplied from elsewhere is never modified by this file.
+  add_library(cutensor::cutensor UNKNOWN IMPORTED GLOBAL)
+  set_target_properties(cutensor::cutensor PROPERTIES
+    IMPORTED_LOCATION "${CUTENSOR_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${CUTENSOR_INCLUDE_DIR}"
+    INTERFACE_LINK_LIBRARIES CUDA::cudart
+  )
+endif()
+
+# TAPP implementation backed by cuTENSOR; exported as tapp::cutensor
+add_library(tapp-cutensor SHARED)
+set_property(TARGET tapp-cutensor PROPERTY EXPORT_NAME cutensor)
+add_library(tapp::cutensor ALIAS tapp-cutensor)
+
+target_sources(tapp-cutensor
+  PRIVATE
+    src/attributes.cpp
+    src/datatype.cpp
+    src/error.cpp
+    src/executor.cpp
+    src/handle.cpp
+    src/product.cpp
+    src/tensor.cpp
+)
+
+set_target_properties(tapp-cutensor PROPERTIES
+  POSITION_INDEPENDENT_CODE ON
+  CXX_STANDARD 20
+  CXX_STANDARD_REQUIRED YES
+)
+
+target_include_directories(tapp-cutensor
+  PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+)
+
+target_link_libraries(tapp-cutensor
+  PUBLIC
+    tapp::api
+  PRIVATE
+    cutensor::cutensor
+    CUDA::cudart
+)
+
+install(TARGETS tapp-cutensor EXPORT tapp
+  COMPONENT cutensor)
+
+# All sources of this target are C++, so the link step is driven by the C++
+# compiler; key the macOS-only "-undefined dynamic_lookup" option on its ID.
+if(APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "^(Clang|AppleClang)$")
+  target_link_options(tapp-cutensor PRIVATE "-undefined;dynamic_lookup")
+endif()
diff --git a/cutensor_bindings/include/attributes.h b/cutensor_bindings/include/attributes.h
new file mode 100644
index 0000000..059d3dc
--- /dev/null
+++ b/cutensor_bindings/include/attributes.h
@@ -0,0 +1,12 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_ATTRIBUTES_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_ATTRIBUTES_H_
+
+#include 
+
+#include 
+
+#include "handle.h"
+
+#define ATTR_KEY_USE_DEVICE_MEMORY 0
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_ATTRIBUTES_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/datatype.h b/cutensor_bindings/include/datatype.h
new file mode 100644
index 
0000000..dbebf13
--- /dev/null
+++ b/cutensor_bindings/include/datatype.h
@@ -0,0 +1,16 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_DATATYPE_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_DATATYPE_H_
+
+#include 
+
+#include 
+
+#include 
+
+cutensorDataType_t translate_datatype(TAPP_datatype type); // TAPP element type -> cuTENSOR data type
+
+cutensorComputeDescriptor_t translate_prectype(TAPP_prectype prec, TAPP_datatype datatype); // TAPP precision request -> cuTENSOR compute descriptor
+
+size_t sizeof_datatype(TAPP_datatype type); // size in bytes of one element of `type`
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_DATATYPE_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/error.h b/cutensor_bindings/include/error.h
new file mode 100644
index 0000000..219195e
--- /dev/null
+++ b/cutensor_bindings/include/error.h
@@ -0,0 +1,15 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_ERROR_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_ERROR_H_
+
+#include 
+
+#include 
+
+#include 
+#include 
+
+int pack_error(int current_value, int tapp_err);        // returns current_value with its TAPP field (lowest bits) replaced
+int pack_error(int current_value, cutensorStatus_t e);  // returns current_value with its cuTENSOR field (middle bits) replaced
+int pack_error(int current_value, cudaError_t e);       // returns current_value with the CUDA code placed in the top bits
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_ERROR_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/executor.h b/cutensor_bindings/include/executor.h
new file mode 100644
index 0000000..3480deb
--- /dev/null
+++ b/cutensor_bindings/include/executor.h
@@ -0,0 +1,8 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_EXECUTOR_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_EXECUTOR_H_
+
+#include 
+
+#include "error.h"
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_EXECUTOR_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/handle.h b/cutensor_bindings/include/handle.h
new file mode 100644
index 0000000..6b70173
--- /dev/null
+++ b/cutensor_bindings/include/handle.h
@@ -0,0 +1,16 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_HANDLE_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_HANDLE_H_
+
+#include 
+
+#include 
+
+#include "error.h"
+
+struct handle // object behind a TAPP_handle (and behind TAPP_attr, which is cast to this type)
+{
+    cutensorHandle_t* libhandle; // heap-allocated cuTENSOR library handle, created in TAPP_create_handle
+    intptr_t* attributes;        // array of pointers stored as intptr_t; slot ATTR_KEY_USE_DEVICE_MEMORY points to a bool
+};
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_HANDLE_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/product.h b/cutensor_bindings/include/product.h
new file mode 100644
index 0000000..c89283c
--- /dev/null
+++ b/cutensor_bindings/include/product.h
@@ -0,0 +1,40 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_PRODUCT_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_PRODUCT_H_
+
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include "error.h"
+#include "handle.h"
+#include "tensor.h"
+#include "attributes.h"
+
+struct product_plan // object behind a TAPP_tensor_product
+{
+    int64_t data_offset_A;          // byte offset added to the A pointer; copied from A's tensor_info
+    size_t copy_size_A;             // bytes allocated/copied for A when host memory is staged on the device
+    int64_t data_offset_B;          // same as data_offset_A, for B
+    size_t copy_size_B;             // same as copy_size_A, for B
+    int64_t data_offset_C;          // same as data_offset_A, for C
+    size_t copy_size_C;             // same as copy_size_A, for C
+    int64_t data_offset_D;          // same as data_offset_A, for D
+    size_t copy_size_D;             // same as copy_size_A, for D
+    int64_t sections_D;             // number of contiguous sections D is split into
+    int64_t section_size_D;         // size of one contiguous section of D, in bytes
+    int64_t sections_nmode_D;       // number of valid entries in section_extents_D / section_strides_D
+    int64_t* section_extents_D;     // extents of the modes that separate sections (new[]-allocated)
+    int64_t* section_strides_D;     // strides of the modes that separate sections (new[]-allocated)
+    TAPP_datatype type_D;           // element type of D
+    TAPP_element_op op_D;           // element-wise operation requested for D
+    cutensorPlan_t* contraction_plan; // heap-allocated cuTENSOR plan for the contraction
+    cutensorPlan_t* permutation_plan; // heap-allocated cuTENSOR plan for the D -> D permutation that applies op_D
+    TAPP_handle handle;             // handle the plan was created with; not freed by TAPP_destroy_tensor_product
+};
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_PRODUCT_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/tensor.h b/cutensor_bindings/include/tensor.h
new file mode 100644
index 0000000..2cb6f7e
--- /dev/null
+++ b/cutensor_bindings/include/tensor.h
@@ -0,0 +1,26 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_TENSOR_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_TENSOR_H_
+
+#include 
+
+#include 
+
+#include 
+
+#include "error.h"
+#include "handle.h"
+#include "datatype.h"
+
+struct tensor_info // object behind a TAPP_tensor_info
+{
+    int nmode;            // number of modes - presumably the length of extents/strides; confirm in tensor.cpp
+    int64_t *extents;     // per-mode extents
+    int64_t *strides;     // per-mode strides; product.cpp compares their absolute values, so they may be negative
+    size_t elements;      // presumably the total element count - set in tensor.cpp, not visible here
+    size_t copy_size;     // byte count used for device allocation and host<->device copies
+    int64_t data_offset;  // byte offset that product.cpp adds to the user pointer
+    TAPP_datatype type;   // element type
+    cutensorTensorDescriptor_t* desc; // pointer to the cuTENSOR descriptor; dereferenced when plans are built
+};
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_TENSOR_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/src/attributes.cpp b/cutensor_bindings/src/attributes.cpp
new file mode 100644
index 0000000..203a2bb
--- /dev/null
+++ 
b/cutensor_bindings/src/attributes.cpp @@ -0,0 +1,49 @@ +#include "../include/attributes.h" + +TAPP_error TAPP_attr_set(TAPP_attr attr, TAPP_key key, void* value) +{ + struct handle* handle_struct = (struct handle*) attr; + switch (key) + { + case 0: + memcpy((void*)handle_struct->attributes[0], value, sizeof(bool)); + break; + + default: + return 15; // Invalid key + } + return 0; +} + +TAPP_error TAPP_attr_get(TAPP_attr attr, TAPP_key key, void** value) +{ + struct handle* handle_struct = (struct handle*) attr; + switch (key) + { + case 0: + memcpy(value, (void*)handle_struct->attributes[0], sizeof(bool)); + break; + + default: + return 15; // Invalid key + } + return 0; +} + +TAPP_error TAPP_attr_clear(TAPP_attr attr, TAPP_key key) +{ + struct handle* handle_struct = (struct handle*) attr; + switch (key) + { + case 0: + { + bool default_value = false; + memcpy((void*)handle_struct->attributes[0], &default_value, sizeof(bool)); + } + break; + + default: + return 15; // Invalid key + } + return 0; +} \ No newline at end of file diff --git a/cutensor_bindings/src/datatype.cpp b/cutensor_bindings/src/datatype.cpp new file mode 100644 index 0000000..2a63229 --- /dev/null +++ b/cutensor_bindings/src/datatype.cpp @@ -0,0 +1,92 @@ +#include "../include/datatype.h" + +cutensorDataType_t translate_datatype(TAPP_datatype type) +{ + switch (type) + { + case TAPP_F32: + return CUTENSOR_R_32F; + break; + case TAPP_F64: + return CUTENSOR_R_64F; + break; + case TAPP_C32: + return CUTENSOR_C_32F; + break; + case TAPP_C64: + return CUTENSOR_C_64F; + break; + case TAPP_F16: + return CUTENSOR_R_16F; + break; + case TAPP_BF16: + return CUTENSOR_R_16BF; + break; + default: // TODO: Default should probably be an error + return CUTENSOR_R_32F; + break; + } +} + +cutensorComputeDescriptor_t translate_prectype(TAPP_prectype prec, TAPP_datatype datatype) +{ + switch (prec) + { + case TAPP_DEFAULT_PREC: + switch (datatype) + { + case TAPP_F32: + case TAPP_C32: + return 
CUTENSOR_COMPUTE_DESC_32F; + break; + case TAPP_F64: + case TAPP_C64: + return CUTENSOR_COMPUTE_DESC_64F; + break; + default: // TODO: Default should probably be an error + return CUTENSOR_COMPUTE_DESC_32F; + break; + } + break; + case TAPP_F32F32_ACCUM_F32: + return CUTENSOR_COMPUTE_DESC_32F; + break; + case TAPP_F64F64_ACCUM_F64: + return CUTENSOR_COMPUTE_DESC_64F; + break; + case TAPP_F16F16_ACCUM_F16: + return CUTENSOR_COMPUTE_DESC_16F; + break; + default: // TODO: Default should probably be an error + return CUTENSOR_COMPUTE_DESC_32F; + break; + } +} + +size_t sizeof_datatype(TAPP_datatype type) +{ + switch (type) + { + case TAPP_F32: + return sizeof(float); + break; + case TAPP_F64: + return sizeof(double); + break; + case TAPP_C32: + return sizeof(std::complex); + break; + case TAPP_C64: + return sizeof(std::complex); + break; + /*case TAPP_F16: // Fix these datatypes + //return _Float16; + break; + case TAPP_BF16: + //return __bf16; + break;*/ + default: // TODO: Default should probably be an error + return sizeof(float); + break; + } +} \ No newline at end of file diff --git a/cutensor_bindings/src/error.cpp b/cutensor_bindings/src/error.cpp new file mode 100644 index 0000000..8c239aa --- /dev/null +++ b/cutensor_bindings/src/error.cpp @@ -0,0 +1,133 @@ +#include "../include/error.h" + +// pack multiple types of error codes into one int +constexpr int TAPP_BITS = 5; +constexpr int CUTENSOR_BITS = 9; +constexpr int CUTENSOR_OFFS = TAPP_BITS; // 5 +constexpr int CUDA_OFFS = CUTENSOR_OFFS + CUTENSOR_BITS; // 14 +constexpr uint64_t TAPP_FIELD_MASK = (1ULL << TAPP_BITS) - 1; // 0x1F +constexpr uint64_t CUTENSOR_FIELD_MASK = ((1ULL << CUTENSOR_BITS) - 1) << CUTENSOR_OFFS; +constexpr uint64_t TAPP_CLEAR_MASK = ~TAPP_FIELD_MASK; +constexpr uint64_t CUTENSOR_CLEAR_MASK = ~CUTENSOR_FIELD_MASK; + + +bool TAPP_check_success(TAPP_error error) { + return error == 0; +} + + +size_t TAPP_explain_error(TAPP_error error, + size_t maxlen, + char* message) { + + std::string 
str = ""; + + if (error == 0) { + str += "Success."; + } + uint64_t code = static_cast(error); + + //1. Extract TAPP (Bottom 5 bits) + uint64_t tappVal = code & TAPP_FIELD_MASK; + if (tappVal != 0) { + str += " [TAPP Error]: "; + switch (tappVal) + { + case 1: + str += "The extents for the indices shared between tensor A and B does not match."; + break; + case 2: + str += "The extents for the indices shared between tensor A and D does not match."; + break; + case 3: + str += "The extents for the indices shared between tensor B and D does not match."; + break; + case 4: + str += "Tensor D has indices not shared with tensor A or B."; + break; + case 5: + str += "The tensors C and D have different amount of dimensions."; + break; + case 6: + str += "The indices of tensor C and D does not line up."; + break; + case 7: + str += "The extents for the indices shared between tensor C and D does not match."; + break; + case 8: + str += "Aliasing found within tensor D."; + break; + case 9: + str += "An idx in tensor A has two different extents."; + break; + case 10: + str += "An idx in tensor B has two different extents."; + break; + case 11: + str += "An idx in tensor D has two different extents."; + break; + case 12: + str += "C should not be NULL while beta is not zero."; + break; + case 13: + str += "Nmode can not be negative."; + break; + case 14: + str += "Extents can not be negative."; + break; + case 15: + str += "Invalid attribute key."; + break; + default: + str += "Unknown TAPP error code."; + break; + } + } + + //2. Extract cuTENSOR (Middle 9 bits) + uint64_t cutensorVal = (code & CUTENSOR_FIELD_MASK) >> CUTENSOR_OFFS; + if (cutensorVal != 0) { + cutensorStatus_t ts = static_cast(cutensorVal); + str += " [cuTENSOR Status]: "; + str += cutensorGetErrorString(ts); + } + + //3. 
Extract CUDA (Top 18 bits) + int cudaVal = (code >> CUDA_OFFS); + if (cudaVal != 0) { + cudaError_t cs = static_cast(cudaVal); + str += " [CUDA Error]: "; + str += cudaGetErrorString(cs); + } + + const char* error_message = str.c_str(); + size_t message_len = strlen(error_message); + if (maxlen == 0) { + return message_len; + } + size_t writelen = maxlen - 1 < message_len ? maxlen - 1 : message_len; + strncpy(message, error_message, writelen); + message[writelen] = '\0'; + return writelen; +} + + +int pack_error(int current_value, int tapp_err) { + uint64_t val = static_cast(current_value); + uint64_t new_tapp_val = static_cast(tapp_err); + return static_cast((val & TAPP_CLEAR_MASK) | new_tapp_val); +} + +int pack_error(int current_value, cutensorStatus_t e) { + uint64_t val = static_cast(current_value); + uint64_t new_tensor_val = static_cast(e) << CUTENSOR_OFFS; + return static_cast((val & CUTENSOR_CLEAR_MASK) | new_tensor_val); +} + +int pack_error(int current_value, cudaError_t e) { + uint64_t val = static_cast(current_value); + uint64_t new_cuda_val = static_cast(e) << CUDA_OFFS; + uint64_t LOW_FIELDS_MASK = TAPP_FIELD_MASK | CUTENSOR_FIELD_MASK; + uint64_t cleared_val = val & (~LOW_FIELDS_MASK); + return static_cast(cleared_val | new_cuda_val); +} diff --git a/cutensor_bindings/src/executor.cpp b/cutensor_bindings/src/executor.cpp new file mode 100644 index 0000000..19c1f41 --- /dev/null +++ b/cutensor_bindings/src/executor.cpp @@ -0,0 +1,21 @@ +#include "../include/executor.h" + +TAPP_error TAPP_create_executor(TAPP_executor* exec) +{ + cudaStream_t* stream = (cudaStream_t*)malloc(sizeof(cudaStream_t)); + cudaError_t cerr; + cerr = cudaStreamCreate(stream); + if (cerr != cudaSuccess) return pack_error(0, cerr); + *exec = (TAPP_executor)stream; + return pack_error(0, cerr); +} + +TAPP_error TAPP_destroy_executor(TAPP_executor exec) +{ + cudaStream_t* stream = (cudaStream_t*)exec; + cudaError_t cerr; + cerr = cudaStreamDestroy(*stream); + if (cerr != 
cudaSuccess) return pack_error(0, cerr); + free(stream); + return pack_error(0, cerr); +} diff --git a/cutensor_bindings/src/handle.cpp b/cutensor_bindings/src/handle.cpp new file mode 100644 index 0000000..c1ea80b --- /dev/null +++ b/cutensor_bindings/src/handle.cpp @@ -0,0 +1,34 @@ +#include "../include/handle.h" + +TAPP_error TAPP_create_handle(TAPP_handle* handle) +{ + cutensorHandle_t* libhandle = new cutensorHandle_t; + cutensorStatus_t err = cutensorCreate(libhandle); + if (err != CUTENSOR_STATUS_SUCCESS) + { + delete libhandle; + return pack_error(0, err); + } + struct handle* handle_struct = new struct handle; + handle_struct->libhandle = libhandle; + bool* use_device_memory = new bool(true); + handle_struct->attributes = new intptr_t[1]; + handle_struct->attributes[0] = (intptr_t) use_device_memory; + *handle = (TAPP_handle) handle_struct; + return 0; +} + +TAPP_error TAPP_destroy_handle(TAPP_handle handle) +{ + struct handle* handle_struct = (struct handle*) handle; + cutensorStatus_t err = cutensorDestroy(*handle_struct->libhandle); + if (err != CUTENSOR_STATUS_SUCCESS) + { + return pack_error(0, err); + } + delete handle_struct->libhandle; + delete (bool*)handle_struct->attributes[0]; + delete[] handle_struct->attributes; + delete handle_struct; + return 0; +} \ No newline at end of file diff --git a/cutensor_bindings/src/product.cpp b/cutensor_bindings/src/product.cpp new file mode 100644 index 0000000..c441e91 --- /dev/null +++ b/cutensor_bindings/src/product.cpp @@ -0,0 +1,385 @@ +#include "../include/product.h" + +int64_t compute_index(const int64_t* coordinates, int nmode, const int64_t* strides); +void increment_coordinates(int64_t* coordinates, int nmode, const int64_t* extents); +cutensorOperator_t translate_operator(TAPP_element_op op); + +TAPP_error TAPP_create_tensor_product(TAPP_tensor_product* plan, + TAPP_handle handle, + TAPP_element_op op_A, + TAPP_tensor_info A, + const int64_t* idx_A, + TAPP_element_op op_B, + TAPP_tensor_info B, + 
const int64_t* idx_B, + TAPP_element_op op_C, + TAPP_tensor_info C, + const int64_t* idx_C, + TAPP_element_op op_D, + TAPP_tensor_info D, + const int64_t* idx_D, + TAPP_prectype prec) +{ + struct product_plan* plan_struct = new struct product_plan; + plan_struct->handle = handle; + struct handle* handle_struct = (struct handle*) plan_struct->handle; + std::vector cuidx_A = std::vector(idx_A, idx_A + TAPP_get_nmodes(A)); + std::vector cuidx_B = std::vector(idx_B, idx_B + TAPP_get_nmodes(B)); + std::vector cuidx_C = std::vector(idx_C, idx_C + TAPP_get_nmodes(C)); + std::vector cuidx_D = std::vector(idx_D, idx_D + TAPP_get_nmodes(D)); + + cutensorStatus_t err; + cutensorOperationDescriptor_t contraction_desc; + err = cutensorCreateContraction(*handle_struct->libhandle, + &contraction_desc, + *((struct tensor_info*)A)->desc, cuidx_A.data(), translate_operator(op_A), + *((struct tensor_info*)B)->desc, cuidx_B.data(), translate_operator(op_B), + *((struct tensor_info*)C)->desc, cuidx_C.data(), translate_operator(op_C), + *((struct tensor_info*)D)->desc, cuidx_D.data(), + translate_prectype(prec, ((struct tensor_info*)D)->type)); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + + cutensorDataType_t scalarType; + err = cutensorOperationDescriptorGetAttribute(*handle_struct->libhandle, + contraction_desc, + CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE, + (void*)&scalarType, + sizeof(scalarType)); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + + assert(scalarType == translate_datatype(((struct tensor_info*)D)->type)); + + cutensorOperationDescriptor_t permutation_desc; + err = cutensorCreatePermutation(*handle_struct->libhandle, + &permutation_desc, + *((struct tensor_info*)D)->desc, cuidx_D.data(), translate_operator(op_D), + *((struct tensor_info*)D)->desc, cuidx_D.data(), + translate_prectype(prec, ((tensor_info*)D)->type)); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + + err = 
cutensorOperationDescriptorGetAttribute(*handle_struct->libhandle, + permutation_desc, + CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE, + (void*)&scalarType, + sizeof(scalarType)); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + + assert(scalarType == translate_datatype(((struct tensor_info*)D)->type)); + + const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT; + + cutensorPlanPreference_t plan_pref; + err = cutensorCreatePlanPreference( + *handle_struct->libhandle, + &plan_pref, + algo, + CUTENSOR_JIT_MODE_NONE); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + + uint64_t workspace_size_estimate = 0; + const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT; + cutensorEstimateWorkspaceSize(*handle_struct->libhandle, + contraction_desc, + plan_pref, + workspacePref, + &workspace_size_estimate); + + plan_struct->contraction_plan = new cutensorPlan_t; + err = cutensorCreatePlan(*handle_struct->libhandle, + plan_struct->contraction_plan, + contraction_desc, + plan_pref, + workspace_size_estimate); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + + plan_struct->permutation_plan = new cutensorPlan_t; + err = cutensorCreatePlan(*handle_struct->libhandle, + plan_struct->permutation_plan, + permutation_desc, + plan_pref, + workspace_size_estimate + ); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + + plan_struct->data_offset_A = ((struct tensor_info*)A)->data_offset; + plan_struct->copy_size_A = ((struct tensor_info*)A)->copy_size; + plan_struct->data_offset_B = ((struct tensor_info*)B)->data_offset; + plan_struct->copy_size_B = ((struct tensor_info*)B)->copy_size; + plan_struct->data_offset_C = ((struct tensor_info*)C)->data_offset; + plan_struct->copy_size_C = ((struct tensor_info*)C)->copy_size; + plan_struct->data_offset_D = ((struct tensor_info*)D)->data_offset; + plan_struct->copy_size_D = ((struct tensor_info*)D)->copy_size; + plan_struct->sections_D = 1; + 
plan_struct->section_size_D = 1; + plan_struct->sections_nmode_D = 0; + plan_struct->section_strides_D = new int64_t[TAPP_get_nmodes(D)]; + plan_struct->section_extents_D = new int64_t[TAPP_get_nmodes(D)]; + plan_struct->type_D = ((struct tensor_info*)D)->type; + plan_struct->op_D = op_D; + int64_t sorted_strides_D[TAPP_get_nmodes(D)]; + memcpy(sorted_strides_D, ((struct tensor_info*)D)->strides, TAPP_get_nmodes(D) * sizeof(int64_t)); + auto compare = [](int64_t a, int64_t b) { return std::abs(a) < std::abs(b); }; + std::sort(sorted_strides_D, sorted_strides_D + TAPP_get_nmodes(D), compare); + for (int i = 0; i < TAPP_get_nmodes(D); i++) + { + for (int j = 0; j < TAPP_get_nmodes(D); j++) + { + if (((struct tensor_info*)D)->strides[j] == sorted_strides_D[i]) + { + if (std::abs(sorted_strides_D[i]) == plan_struct->section_size_D) + { + plan_struct->section_size_D *= std::abs(((struct tensor_info*)D)->extents[i]); + } + else if (((struct tensor_info*)D)->extents[j] != 1) // if extent = 0 then stride will never be used i.e. 
no need for section, even if stride would create section + { + plan_struct->sections_D *= ((struct tensor_info*)D)->extents[j]; + plan_struct->section_extents_D[plan_struct->sections_nmode_D] = ((struct tensor_info*)D)->extents[j]; + plan_struct->section_strides_D[plan_struct->sections_nmode_D] = ((struct tensor_info*)D)->strides[j]; + plan_struct->sections_nmode_D++; + } + break; + } + } + } + plan_struct->section_size_D *= sizeof_datatype(((struct tensor_info*)D)->type); + *plan = (TAPP_tensor_product) plan_struct; + err = cutensorDestroyOperationDescriptor(contraction_desc); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + err = cutensorDestroyOperationDescriptor(permutation_desc); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + cutensorDestroyPlanPreference(plan_pref); + return pack_error(0, err); +} + +TAPP_error TAPP_destroy_tensor_product(TAPP_tensor_product plan) +{ + struct product_plan* plan_struct = (struct product_plan*) plan; + cutensorStatus_t err; + err = cutensorDestroyPlan(*plan_struct->contraction_plan); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + delete plan_struct->contraction_plan; + err = cutensorDestroyPlan(*plan_struct->permutation_plan); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + delete plan_struct->permutation_plan; + delete[] plan_struct->section_strides_D; + delete[] plan_struct->section_extents_D; + delete plan_struct; + return pack_error(0, err); +} + +TAPP_error TAPP_execute_product(TAPP_tensor_product plan, + TAPP_executor exec, + TAPP_status* status, + const void* alpha, + const void* A, + const void* B, + const void* beta, + const void* C, + void* D) +{ + void *A_d, *B_d, *C_d, *D_d; + struct handle* handle_struct = (struct handle*) ((struct product_plan*) plan)->handle; + bool use_device_memory = *(bool*)((handle_struct->attributes)[ATTR_KEY_USE_DEVICE_MEMORY]); + const bool do_permutation = ( ((struct product_plan*)plan)->op_D != TAPP_IDENTITY ); 
+ cudaError_t cerr; + + void *E_d = nullptr; + if (do_permutation) { + cerr = cudaMallocAsync((void**)&E_d, ((struct product_plan*)plan)->copy_size_D, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + } + + if (use_device_memory) + { + A_d = (void*)A; + B_d = (void*)B; + C_d = (void*)C; + D_d = (void*)D; + } + else + { + cerr = cudaMallocAsync((void**)&A_d, ((struct product_plan*)plan)->copy_size_A, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + cerr = cudaMallocAsync((void**)&B_d, ((struct product_plan*)plan)->copy_size_B, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + cerr = cudaMallocAsync((void**)&C_d, ((struct product_plan*)plan)->copy_size_C, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + cerr = cudaMallocAsync((void**)&D_d, ((struct product_plan*)plan)->copy_size_D, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + cerr = cudaMemcpyAsync(A_d, (void*)((intptr_t)A + ((struct product_plan*)plan)->data_offset_A), ((struct product_plan*)plan)->copy_size_A, cudaMemcpyHostToDevice, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + cerr = cudaMemcpyAsync(B_d, (void*)((intptr_t)B + ((struct product_plan*)plan)->data_offset_B), ((struct product_plan*)plan)->copy_size_B, cudaMemcpyHostToDevice, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + cerr = cudaMemcpyAsync(C_d, (void*)((intptr_t)C + ((struct product_plan*)plan)->data_offset_C), ((struct product_plan*)plan)->copy_size_C, cudaMemcpyHostToDevice, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + A_d = (void*)((intptr_t)A_d + ((struct product_plan*)plan)->data_offset_A); + B_d = (void*)((intptr_t)B_d + ((struct product_plan*)plan)->data_offset_B); + C_d = (void*)((intptr_t)C_d + ((struct product_plan*)plan)->data_offset_C); + D_d = (void*)((intptr_t)D_d + ((struct 
product_plan*)plan)->data_offset_D); + if (do_permutation) { + E_d = (void*)((intptr_t)E_d + ((struct product_plan*)plan)->data_offset_D); + } + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + } + cutensorPlan_t* contraction_plan = ((struct product_plan*) plan)->contraction_plan; + uint64_t contraction_actual_workspace_size = 0; + cutensorStatus_t err; + err = cutensorPlanGetAttribute(*handle_struct->libhandle, + *contraction_plan, + CUTENSOR_PLAN_REQUIRED_WORKSPACE, + &contraction_actual_workspace_size, + sizeof(contraction_actual_workspace_size)); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + + contraction_actual_workspace_size = std::max(contraction_actual_workspace_size, uint64_t(128 * 1024 * 1024)); // 128 MiB recomended minimum size https://docs.nvidia.com/cuda/cutensor/latest/api/cutensor.html#cutensorcontract + void *contraction_work = nullptr; + cerr = cudaMallocAsync(&contraction_work, contraction_actual_workspace_size, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + assert(uintptr_t(contraction_work) % 128 == 0); + + void* contraction_output = do_permutation ? 
E_d : D_d; + err = cutensorContract(*handle_struct->libhandle, + *contraction_plan, + alpha, A_d, B_d, + beta, C_d, contraction_output, + contraction_work, contraction_actual_workspace_size, *(cudaStream_t*)exec); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + + if (do_permutation) + { + cutensorPlan_t* permutation_plan = ((struct product_plan*) plan)->permutation_plan; + void* perm_scalar_ptr = NULL; + + if (((struct product_plan*)plan)->type_D == TAPP_F32) + { + perm_scalar_ptr = malloc(sizeof(float)); + *(float*)perm_scalar_ptr = 1.0f; + } + else if (((struct product_plan*)plan)->type_D == TAPP_F64) + { + perm_scalar_ptr = malloc(sizeof(double)); + *(double*)perm_scalar_ptr = 1.0; + } + else if (((struct product_plan*)plan)->type_D == TAPP_C32) + { + perm_scalar_ptr = malloc(sizeof(std::complex)); + *(std::complex*)perm_scalar_ptr = 1.0f; + } + else if (((struct product_plan*)plan)->type_D == TAPP_C64) + { + perm_scalar_ptr = malloc(sizeof(std::complex)); + *(std::complex*)perm_scalar_ptr = 1.0; + } + + err = cutensorPermute(*handle_struct->libhandle, + *permutation_plan, + perm_scalar_ptr, + E_d, + D_d, + *(cudaStream_t*)exec); + if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err); + free(perm_scalar_ptr); + } + + if (!use_device_memory) + { + int64_t section_coordinates_D[((struct product_plan*)plan)->sections_nmode_D]; + for (size_t i = 0; i < ((struct product_plan*)plan)->sections_nmode_D; i++) + { + section_coordinates_D[i] = 0; + } + + for (size_t i = 0; i < ((struct product_plan*)plan)->sections_D; i++) + { + int64_t index = compute_index(section_coordinates_D, ((struct product_plan*)plan)->sections_nmode_D, ((struct product_plan*)plan)->section_strides_D); + cerr = cudaMemcpyAsync((void*)((intptr_t)D + index * sizeof_datatype(((struct product_plan*)plan)->type_D)), + (void*)((intptr_t)D_d + index * sizeof_datatype(((struct product_plan*)plan)->type_D)), + ((struct product_plan*)plan)->section_size_D, cudaMemcpyDeviceToHost, 
*(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + increment_coordinates(section_coordinates_D, ((struct product_plan*)plan)->sections_nmode_D, ((struct product_plan*)plan)->section_extents_D); + } + + A_d = (void*)((intptr_t)A_d - ((struct product_plan*)plan)->data_offset_A); + B_d = (void*)((intptr_t)B_d - ((struct product_plan*)plan)->data_offset_B); + C_d = (void*)((intptr_t)C_d - ((struct product_plan*)plan)->data_offset_C); + D_d = (void*)((intptr_t)D_d - ((struct product_plan*)plan)->data_offset_D); + + if (A_d) { + cerr = cudaFreeAsync(A_d, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + } + if (B_d) { + cerr = cudaFreeAsync(B_d, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + } + if (C_d) { + cerr = cudaFreeAsync(C_d, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + } + if (D_d) { + cerr = cudaFreeAsync(D_d, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + } + } + + if (E_d) + { + if (!use_device_memory) + { + E_d = (void*)((intptr_t)E_d - ((struct product_plan*)plan)->data_offset_D); + } + cerr = cudaFreeAsync(E_d, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + } + if (contraction_work) { + cerr = cudaFreeAsync(contraction_work, *(cudaStream_t*)exec); + if (cerr != cudaSuccess) return pack_error(0, cerr); + } + + return pack_error(0, err); +} + +int64_t compute_index(const int64_t* coordinates, int nmode, const int64_t* strides) +{ + int64_t index = 0; + for (int i = 0; i < nmode; i++) + { + index += coordinates[i] * strides[i]; + } + return index; + +} + +void increment_coordinates(int64_t* coordinates, int nmode, const int64_t* extents) +{ + if (nmode <= 0) + { + return; + } + + int k = 0; + do + { + coordinates[k] = (coordinates[k] + 1) % extents[k]; + k++; + } while (coordinates[k - 1] == 0 && k < nmode); +} + +cutensorOperator_t translate_operator(TAPP_element_op 
op) +{ + switch (op) + { + case TAPP_IDENTITY: + return CUTENSOR_OP_IDENTITY; + break; + case TAPP_CONJUGATE: + return CUTENSOR_OP_CONJ; + break; + default: // TODO: Default should probably be an error + return CUTENSOR_OP_IDENTITY; + break; + } +} diff --git a/cutensor_bindings/src/tensor.cpp b/cutensor_bindings/src/tensor.cpp new file mode 100644 index 0000000..a316380 --- /dev/null +++ b/cutensor_bindings/src/tensor.cpp @@ -0,0 +1,106 @@ +#include "../include/tensor.h" + +TAPP_error TAPP_create_tensor_info(TAPP_tensor_info* info, + TAPP_handle handle, + TAPP_datatype type, + int nmode, + const int64_t* extents, + const int64_t* strides) +{ + struct tensor_info* tensor_info = new struct tensor_info; + tensor_info->desc = new cutensorTensorDescriptor_t; + struct handle* handle_struct = (struct handle*) handle; + + const uint32_t kAlignment = 128; + cutensorStatus_t err = cutensorCreateTensorDescriptor(*handle_struct->libhandle, + tensor_info->desc, + nmode, + extents, + strides, + translate_datatype(type), kAlignment); + if (err != CUTENSOR_STATUS_SUCCESS) + { + delete tensor_info->desc; + delete tensor_info; + return pack_error(0, err); + } + size_t elements = 1; + for (int i = 0; i < nmode; ++i) + elements *= extents[i]; + tensor_info->copy_size = 1; + tensor_info->data_offset = 0; + for (int i = 0; i < nmode; i++) + { + tensor_info->copy_size += (extents[i] - 1)*strides[i]; + if (strides[i] < 0) + { + tensor_info->data_offset += extents[i] * strides[i]; + } + } + tensor_info->copy_size *= sizeof_datatype(type); + tensor_info->data_offset *= sizeof_datatype(type); + tensor_info->type = type; + tensor_info->elements = elements; + tensor_info->nmode = nmode; + tensor_info->extents = new int64_t[nmode]; + tensor_info->strides = new int64_t[nmode]; + for (int i = 0; i < nmode; ++i) + { + tensor_info->extents[i] = extents[i]; + tensor_info->strides[i] = strides[i]; + } + *info = (TAPP_tensor_info) tensor_info; + return 0; +} + +TAPP_error 
TAPP_destroy_tensor_info(TAPP_tensor_info info) +{ + struct tensor_info* tensor_info = (struct tensor_info*) info; + cutensorStatus_t err = cutensorDestroyTensorDescriptor(*tensor_info->desc); + if (err != CUTENSOR_STATUS_SUCCESS) + { + return pack_error(0, err); + } + delete tensor_info->desc; + delete[] tensor_info->extents; + delete[] tensor_info->strides; + delete tensor_info; + return 0; +} + +int TAPP_get_nmodes(TAPP_tensor_info info) +{ + return ((struct tensor_info*) info)->nmode; +} + +TAPP_error TAPP_set_nmodes(TAPP_tensor_info info, + int nmodes) +{ + return -1; // Can for now not be implemented. Cutensor does not support changing the number of modes after creation, so this would require recreating the descriptor, would need handle. +} + +void TAPP_get_extents(TAPP_tensor_info info, + int64_t* extents) +{ + memcpy(extents, ((struct tensor_info*) info)->extents, ((struct tensor_info*) info)->nmode * sizeof(int64_t)); + return; +} + +TAPP_error TAPP_set_extents(TAPP_tensor_info info, + const int64_t* extents) +{ + return -1; // Can for now not be implemented. Cutensor does not support changing the number of modes after creation, so this would require recreating the descriptor, would need handle. +} + +void TAPP_get_strides(TAPP_tensor_info info, + int64_t* strides) +{ + memcpy(strides, ((struct tensor_info*) info)->strides, ((struct tensor_info*) info)->nmode * sizeof(int64_t)); + return; +} + +TAPP_error TAPP_set_strides(TAPP_tensor_info info, + const int64_t* strides) +{ + return -1; // Can for now not be implemented. Cutensor does not support changing the number of modes after creation, so this would require recreating the descriptor, would need handle. 
+} \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000..009b438 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,129 @@ +# ---------------------------------------------------------------------------- +# driver + +add_executable(tapp-reference-driver) + +target_sources( + tapp-reference-driver + PRIVATE + driver/driver.c + ${PROJECT_SOURCE_DIR}/test/helpers.c + ${PROJECT_SOURCE_DIR}/test/helpers.h + ) + +target_include_directories( + tapp-reference-driver + PRIVATE + ${PROJECT_SOURCE_DIR}/test + ) + +target_link_libraries( + tapp-reference-driver + PRIVATE + tapp::reference + ) + +add_test( + NAME tapp-reference-driver + COMMAND $ + ) + +# ---------------------------------------------------------------------------- +# exercise: contraction + +if(TAPP_BUILD_EXERCISE) + add_executable(tapp-reference-exercise_contraction) + + target_sources( + tapp-reference-exercise_contraction + PRIVATE + exercise_contraction/exercise_contraction.c + ${PROJECT_SOURCE_DIR}/test/helpers.c + ${PROJECT_SOURCE_DIR}/test/helpers.h + ) + + target_include_directories( + tapp-reference-exercise_contraction + PRIVATE + ${PROJECT_SOURCE_DIR}/test + ) + + target_link_libraries( + tapp-reference-exercise_contraction + PRIVATE + tapp::reference + ) + + add_test( + NAME tapp-reference-exercise_contraction + COMMAND $ + ) +endif() + +# ---------------------------------------------------------------------------- +# exercise: contraction answers + +add_executable(tapp-reference-exercise_contraction_answers) + +target_sources( + tapp-reference-exercise_contraction_answers + PRIVATE + exercise_contraction/answers/exercise_contraction_answers.c + ${PROJECT_SOURCE_DIR}/test/helpers.c + ${PROJECT_SOURCE_DIR}/test/helpers.h + ) + +target_include_directories( + tapp-reference-exercise_contraction_answers + PRIVATE + ${PROJECT_SOURCE_DIR}/test + ) + +target_link_libraries( + tapp-reference-exercise_contraction_answers + PRIVATE + 
tapp::reference + ) + +add_test( + NAME tapp-reference-exercise_contraction_answers + COMMAND $ + ) + +# ---------------------------------------------------------------------------- +# exercise: tucker + +add_library(tapp-reference-exercise_tucker SHARED) + +target_sources( + tapp-reference-exercise_tucker + PUBLIC + exercise_tucker/tapp_tucker/exercise_tucker.h + PRIVATE + exercise_tucker/tapp_tucker/exercise_tucker.c + ) + +target_link_libraries( + tapp-reference-exercise_tucker + PRIVATE + tapp::reference + ) + +# ---------------------------------------------------------------------------- +# exercise: tucker answers + +add_library(tapp-reference-exercise_tucker_answers SHARED) + +target_sources( + tapp-reference-exercise_tucker_answers + PUBLIC + exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.h + PRIVATE + exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c + ) + +target_link_libraries( + tapp-reference-exercise_tucker_answers + PRIVATE + tapp::reference + ) diff --git a/examples/README.md b/examples/README.md index ae41198..6608ada 100644 --- a/examples/README.md +++ b/examples/README.md @@ -9,7 +9,7 @@ for cmake: (Unix commands) Run CMake from directory: "cmake .." Run make from directory: "make -j" All files are created in the build directory - For use of TBLIS (not needed for the exercise) add: "-DTAPP_REFERENCE_ENABLE_TBLIS=1" after "cmake .." + For use of TBLIS (not needed for the exercise) add: "-DTAPP_REFERENCE_USE_TBLIS=1" after "cmake .." With TBLIS a file called test++ will be compiled 2. 
Exercise contraction (try writing a tensor contraction with tapp) diff --git a/examples/driver/driver.c b/examples/driver/driver.c index 035ff33..c64d8ef 100644 --- a/examples/driver/driver.c +++ b/examples/driver/driver.c @@ -12,12 +12,19 @@ int main(int argc, char const *argv[]) { + TAPP_handle handle; // Declare handle + TAPP_create_handle(&handle); // Create handle + /* * The tensor product looks in a simplified way as follows: D <- a*A*B+b*C. * Where the lowercase letters are constants and uppercase are tensors. * The operation requires four tensors that all needs to be initialized. */ + /* + * Decide who the calculation should be executed, which indices to contract, elemental operations and precision. + */ + // Initialize the structures of the tensors // Tensor A @@ -30,34 +37,28 @@ int main(int argc, char const *argv[]) TAPP_tensor_info info_A; // Declare the variable that holds the tensor structure - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); // Assign the structure to the variable, including datatype + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); // Assign the structure to the variable, including datatype // Tensor B int nmode_B = 4; int64_t extents_B[4] = {3, 2, 2, 3}; int64_t strides_B[4] = {1, 3, 6, 12}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); // Tensor C int nmode_C = 3; int64_t extents_C[3] = {4, 2, 2}; int64_t strides_C[3] = {1, 4, 8}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); // Output tensor D int nmode_D = 3; int64_t extents_D[3] = {4, 2, 2}; int64_t strides_D[3] = {1, 4, 8}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - /* - * 
Decide who the calculation should be executed, which indices to contract, elemental operations and precision. - */ - - TAPP_handle handle; // Declare handle (not yet in use) + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); // Decide elemental operations (conjugate available for complex datatypes) TAPP_element_op op_A = TAPP_IDENTITY; // Decide elemental operation for tensor A @@ -181,6 +182,7 @@ int main(int argc, char const *argv[]) TAPP_destroy_tensor_info(info_C); TAPP_destroy_tensor_info(info_D); TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); return 0; } \ No newline at end of file diff --git a/examples/exercise_contraction/answers/exercise_contraction_answers.c b/examples/exercise_contraction/answers/exercise_contraction_answers.c index 5063b1c..a1258bf 100644 --- a/examples/exercise_contraction/answers/exercise_contraction_answers.c +++ b/examples/exercise_contraction/answers/exercise_contraction_answers.c @@ -17,6 +17,10 @@ int main(int argc, char const *argv[]) { + // Declare handle + TAPP_handle handle; + TAPP_create_handle(&handle); + /* * Create the tensor structures for tensor A, B, C and D. * Tensor A with 3 indices, with the extents 4, 3, 2, and the strides 1, 4, 12. @@ -44,28 +48,28 @@ int main(int argc, char const *argv[]) * Uncomment code. * Fill in: the tensor info object, datatype(float32), structure for tensor A: number of indices, extents, strides. 
*/ - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); // Tensor B int nmode_B = 3; int64_t extents_B[3] = {3, 2, 4}; int64_t strides_B[3] = {1, 3, 6}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); // Tensor C int nmode_C = 2; int64_t extents_C[2] = {3, 3}; int64_t strides_C[2] = {1, 3}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); // Tensor D int nmode_D = 2; int64_t extents_D[2] = {3, 3}; int64_t strides_D[2] = {1, 3}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); /* @@ -78,9 +82,6 @@ int main(int argc, char const *argv[]) * The second index for A and the first index for B are free indices, in that order. */ - // Declare handle (no assignment) - TAPP_handle handle; - // Initialize the precision TAPP_prectype prec = TAPP_DEFAULT_PREC; @@ -225,6 +226,7 @@ int main(int argc, char const *argv[]) TAPP_destroy_tensor_info(info_C); TAPP_destroy_tensor_info(info_D); TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); /* * Expected output: diff --git a/examples/exercise_contraction/exercise_contraction.c b/examples/exercise_contraction/exercise_contraction.c index 2ed5d6c..d913107 100644 --- a/examples/exercise_contraction/exercise_contraction.c +++ b/examples/exercise_contraction/exercise_contraction.c @@ -16,6 +16,10 @@ int main(int argc, char const *argv[]) { + // Declare handle + TAPP_handle handle; + TAPP_create_handle(&handle); + /* * Create the tensor structures for tensor A, B, C and D. 
* Tensor A with 3 indices, with the extents 4, 3, 2, and the strides 1, 4, 12. @@ -41,30 +45,30 @@ int main(int argc, char const *argv[]) /* * TODO 1: Fill in the arguments for creating the tensor info. * Uncomment code. - * Fill in: the tensor info object, datatype(float32), structure for tensor A: number of indices, extents, strides. + * Fill in: the tensor info object, handle, datatype(float32), structure for tensor A: number of indices, extents, strides. */ - //TAPP_create_tensor_info(, , , , ); + //TAPP_create_tensor_info(, , , , , ); // Tensor B int nmode_B = 3; int64_t extents_B[3] = {3, 2, 4}; int64_t strides_B[3] = {1, 3, 6}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); // Tensor C int nmode_C = 2; int64_t extents_C[2] = {3, 3}; int64_t strides_C[2] = {1, 3}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); // Tensor D int nmode_D = 2; int64_t extents_D[2] = {3, 3}; int64_t strides_D[2] = {1, 3}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); /* @@ -77,9 +81,6 @@ int main(int argc, char const *argv[]) * The second index for A and the first index for B are free indices, in that order. 
*/ - // Declare handle (no assignment) - TAPP_handle handle; - // Initialize the precision TAPP_prectype prec = TAPP_DEFAULT_PREC; @@ -223,6 +224,7 @@ int main(int argc, char const *argv[]) TAPP_destroy_tensor_info(info_C); TAPP_destroy_tensor_info(info_D); TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); /* * Expected output: diff --git a/examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c b/examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c index 99f18d2..2221ddd 100644 --- a/examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c +++ b/examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c @@ -12,6 +12,9 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str int nmode_D, int64_t* extents_D, int64_t* strides_D, void* D, int64_t* idx_A, int64_t* idx_B, int64_t* idx_D) { + TAPP_handle handle; // Declare handle + TAPP_create_handle(&handle); // Create handle + /* * The tensor product looks in a simplified way as follows: D <- a*A*B+b*C. * Where the lowercase letters are constants and uppercase are tensors. 
@@ -29,26 +32,24 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str * Uncomment function call * Add: nmode_A, extents_A, and strides_A */ - TAPP_create_tensor_info(&info_A, TAPP_F64, nmode_A, extents_A, strides_A); // Assign the structure to the variable, including datatype + TAPP_create_tensor_info(&info_A, handle, TAPP_F64, nmode_A, extents_A, strides_A); // Assign the structure to the variable, including datatype // Tensor B TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F64, nmode_B, extents_B, strides_B); + TAPP_create_tensor_info(&info_B, handle, TAPP_F64, nmode_B, extents_B, strides_B); // Tensor C TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F64, nmode_D, extents_D, strides_D); + TAPP_create_tensor_info(&info_C, handle, TAPP_F64, nmode_D, extents_D, strides_D); // Output tensor D TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F64, nmode_D, extents_D, strides_D); + TAPP_create_tensor_info(&info_D, handle, TAPP_F64, nmode_D, extents_D, strides_D); /* - * Decide who the calculation should be executed, which indices to contract, elemental operations and precision. + * Decide how the calculation should be executed, which indices to contract, elemental operations and precision. 
*/ - TAPP_handle handle; // Declare handle (not yet in use) - // Decide elemental operations (conjugate available for complex datatypes) TAPP_element_op op_A = TAPP_IDENTITY; // Decide elemental operation for tensor A TAPP_element_op op_B = TAPP_IDENTITY; // Decide elemental operation for tensor B @@ -108,7 +109,7 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str int message_len = TAPP_explain_error(error, 0, NULL); // Get size of error message char *message_buff = malloc((message_len + 1) * sizeof(char)); // Allocate buffer for message, including null terminator TAPP_explain_error(error, message_len + 1, message_buff); // Fetch error message - printf(message_buff); // Print message + printf("%s", message_buff); // Print message free(message_buff); // Free buffer printf("\n"); } @@ -122,6 +123,7 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str TAPP_destroy_tensor_info(info_B); TAPP_destroy_tensor_info(info_D); TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); return D; } \ No newline at end of file diff --git a/examples/exercise_tucker/tapp_tucker/exercise_tucker.c b/examples/exercise_tucker/tapp_tucker/exercise_tucker.c index 9c0c86e..a67ea5d 100644 --- a/examples/exercise_tucker/tapp_tucker/exercise_tucker.c +++ b/examples/exercise_tucker/tapp_tucker/exercise_tucker.c @@ -12,6 +12,9 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str int nmode_D, int64_t* extents_D, int64_t* strides_D, void* D, int64_t* idx_A, int64_t* idx_B, int64_t* idx_D) { + TAPP_handle handle; // Declare handle + TAPP_create_handle(&handle); // Create handle + /* * The tensor product looks in a simplified way as follows: D <- a*A*B+b*C. * Where the lowercase letters are constants and uppercase are tensors. 
@@ -29,26 +32,24 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str * Uncomment function call * Add: nmode_A, extents_A, and strides_A */ - //TAPP_create_tensor_info(&info_A, TAPP_F64, , , ); // Assign the structure to the variable, including datatype + //TAPP_create_tensor_info(&info_A, handle, TAPP_F64, , , ); // Assign the structure to the variable, including datatype // Tensor B TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F64, nmode_B, extents_B, strides_B); + TAPP_create_tensor_info(&info_B, handle, TAPP_F64, nmode_B, extents_B, strides_B); // Tensor C TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F64, nmode_D, extents_D, strides_D); + TAPP_create_tensor_info(&info_C, handle, TAPP_F64, nmode_D, extents_D, strides_D); // Output tensor D TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F64, nmode_D, extents_D, strides_D); + TAPP_create_tensor_info(&info_D, handle, TAPP_F64, nmode_D, extents_D, strides_D); /* - * Decide who the calculation should be executed, which indices to contract, elemental operations and precision. + * Decide how the calculation should be executed, which indices to contract, elemental operations and precision. 
*/ - TAPP_handle handle; // Declare handle (not yet in use) - // Decide elemental operations (conjugate available for complex datatypes) TAPP_element_op op_A = TAPP_IDENTITY; // Decide elemental operation for tensor A TAPP_element_op op_B = TAPP_IDENTITY; // Decide elemental operation for tensor B @@ -108,7 +109,7 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str int message_len = TAPP_explain_error(error, 0, NULL); // Get size of error message char *message_buff = malloc((message_len + 1) * sizeof(char)); // Allocate buffer for message, including null terminator TAPP_explain_error(error, message_len + 1, message_buff); // Fetch error message - printf(message_buff); // Print message + printf("%s", message_buff); // Print message free(message_buff); // Free buffer printf("\n"); } @@ -122,6 +123,7 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str TAPP_destroy_tensor_info(info_B); TAPP_destroy_tensor_info(info_D); TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); return D; } \ No newline at end of file diff --git a/reference_implementation/CMakeLists.txt b/reference_implementation/CMakeLists.txt index 311e44b..a9c13a9 100644 --- a/reference_implementation/CMakeLists.txt +++ b/reference_implementation/CMakeLists.txt @@ -31,7 +31,7 @@ if (APPLE AND CMAKE_C_COMPILER_ID MATCHES "^(Clang|AppleClang)$") target_link_options(tapp-reference PRIVATE "-undefined;dynamic_lookup") endif() -target_link_libraries(tapp-reference PUBLIC tapp-api) +target_link_libraries(tapp-reference PUBLIC tapp::api) option(TAPP_BUILD_EXERCISE "Build contraction exercise with TODOs in it." 
OFF) @@ -46,7 +46,7 @@ if(TAPP_REFERENCE_ENABLE_BF16) target_compile_definitions(tapp-reference PRIVATE TAPP_REFERENCE_ENABLE_BF16=1) endif() -if(TAPP_REFERENCE_ENABLE_TBLIS) +if(TAPP_REFERENCE_USE_TBLIS) set(TBLIS_INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/tblis) @@ -63,7 +63,7 @@ if(TAPP_REFERENCE_ENABLE_TBLIS) FetchContent_MakeAvailable(tblis) - target_compile_definitions(tapp-reference PRIVATE TAPP_REFERENCE_ENABLE_TBLIS=1) + target_compile_definitions(tapp-reference PRIVATE TAPP_REFERENCE_USE_TBLIS=1) target_sources( tapp-reference diff --git a/reference_implementation/src/executor.c b/reference_implementation/src/executor.c index f352ed2..818602a 100644 --- a/reference_implementation/src/executor.c +++ b/reference_implementation/src/executor.c @@ -9,7 +9,7 @@ TAPP_error TAPP_create_executor(TAPP_executor* exec) { *exec = (TAPP_executor)malloc(sizeof(int)); int ex = 1; // the bruteforce reference executor -#ifdef TAPP_REFERENCE_ENABLE_TBLIS +#ifdef TAPP_REFERENCE_USE_TBLIS // ex = 2; // TBLIS used as executor, use 12 for debug mode #endif *((int*)(*exec)) = ex; diff --git a/reference_implementation/src/product.c b/reference_implementation/src/product.c index 1624839..276ac91 100644 --- a/reference_implementation/src/product.c +++ b/reference_implementation/src/product.c @@ -8,7 +8,7 @@ #include #include #include -#ifdef TAPP_REFERENCE_ENABLE_TBLIS +#ifdef TAPP_REFERENCE_USE_TBLIS #include "tblis_bind.h" #endif @@ -251,7 +251,7 @@ TAPP_error TAPP_execute_product(TAPP_tensor_product plan, if((*exec_int_ptr) == 2 || (*exec_int_ptr) == 12 ) { // 1 = bruteforce, 2 = tblis, 12 = tblis + bruteforce check // if((*exec_int_ptr) == 2) printf("tapp used2 \n"); -#ifdef TAPP_REFERENCE_ENABLE_TBLIS +#ifdef TAPP_REFERENCE_USE_TBLIS bind_tblis_execute_product(nmode_A, extents_A, strides_A, A, op_A, idx_A, nmode_B, extents_B, strides_B, B, op_B, idx_B, nmode_C, extents_C, strides_C, C, op_C, idx_D, @@ -423,7 +423,7 @@ TAPP_error TAPP_execute_product(TAPP_tensor_product plan, bool 
comp_ = true; if((*exec_int_ptr) == 12 ) { // 1 = bruteforce, 2 = tblis, 12 = tblis + bruteforce check -#ifdef TAPP_REFERENCE_ENABLE_TBLIS +#ifdef TAPP_REFERENCE_USE_TBLIS comp_ = compare_tensors_(D, E_, (int64_t)size_D, type_D); #endif if(!comp_){ diff --git a/reference_implementation/src/status.c b/reference_implementation/src/status.c new file mode 100644 index 0000000..cc1cf79 --- /dev/null +++ b/reference_implementation/src/status.c @@ -0,0 +1,10 @@ +/* + * Ed Valeev + */ +#include "ref_impl.h" +#include + +TAPP_error TAPP_destroy_status(TAPP_status status) { + return 0; +} + diff --git a/reference_implementation/src/tensor.c b/reference_implementation/src/tensor.c index 56e8234..c55c208 100644 --- a/reference_implementation/src/tensor.c +++ b/reference_implementation/src/tensor.c @@ -9,6 +9,7 @@ #include TAPP_error TAPP_create_tensor_info(TAPP_tensor_info* info, + TAPP_handle handle, TAPP_datatype type, int nmode, const int64_t* extents, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000..4408300 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,155 @@ +# ---------------------------------------------------------------------------- +# TBLIS test + +if(TAPP_REFERENCE_USE_TBLIS) + add_executable(tapp-reference-test) + + target_sources( + tapp-reference-test + PRIVATE + test.cpp + test.h + ) + + target_link_libraries( + tapp-reference-test + PRIVATE + tapp::reference + tblis-static + ) + + set_property( + TARGET tapp-reference-test + PROPERTY + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED YES + CXX_EXTENSIONS NO + ) + + add_test( + NAME tapp-reference-test + COMMAND $ + ) +endif() + +# ---------------------------------------------------------------------------- +# demo + +add_executable(tapp-reference-demo) + +target_sources( + tapp-reference-demo + PRIVATE + demo.c + helpers.c + helpers.h + ) + +target_link_libraries( + tapp-reference-demo + PRIVATE + tapp::reference + ) + +add_test( + NAME tapp-reference-demo + COMMAND $ + ) + 
+# ---------------------------------------------------------------------------- +# cutensor specific code + +if (TAPP_CUTENSOR) + # ---------------------------------------------------------------------------- + # cutensor demo + + add_executable(tapp-cutensor-demo) + + target_sources( + tapp-cutensor-demo + PRIVATE + cutensor_demo.cpp + helpers.c + helpers.h + ) + + target_link_libraries( + tapp-cutensor-demo + PRIVATE + tapp::cutensor + CUDA::cudart + ) + + target_include_directories( + tapp-cutensor-demo + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ) + + add_test( + NAME tapp-cutensor-demo + COMMAND $ + ) + + # ---------------------------------------------------------------------------- + # demo using dynamic library + + add_executable(tapp-reference-demo-dynamic) + + target_compile_definitions( + tapp-reference-demo-dynamic + PRIVATE + TAPP_DYNAMIC_LAUNCH + ) + + target_sources( + tapp-reference-demo-dynamic + PRIVATE + demo.c + helpers.c + helpers.h + ) + + target_link_libraries( + tapp-reference-demo-dynamic + PRIVATE + tapp::api + ${CMAKE_DL_LIBS} + ) + + add_test( + NAME tapp-reference-demo-dynamic + COMMAND $ + ) + + # ---------------------------------------------------------------------------- + # test using dynamic library + + add_executable(tapp-reference-test-dynamic) + + target_compile_definitions( + tapp-reference-test-dynamic + PRIVATE + TAPP_DYNAMIC_LAUNCH + ) + + target_sources( + tapp-reference-test-dynamic + PRIVATE + test.cpp + test.h + ) + + target_link_libraries( + tapp-reference-test-dynamic + PRIVATE + tapp::api + ${CMAKE_DL_LIBS} + ) + + add_test( + NAME tapp-reference-test-dynamic + COMMAND $ + ) + +endif() diff --git a/test/cutensor_demo.cpp b/test/cutensor_demo.cpp new file mode 100644 index 0000000..87d3ab8 --- /dev/null +++ b/test/cutensor_demo.cpp @@ -0,0 +1,1518 @@ +/* + * Niklas Hörnblad + * Paolo Bientinesi + * Umeå University - December 2025 + */ + +#include + +#include + +#include +#include +#include +#include + +extern "C" { + 
#include "helpers.h" +} + +void contraction(); +void hadamard(); +void complex_num(); +void conjugate(); +void zero_dim(); +void one_ext_contracted(); +void one_ext_transfered(); +void chained_diff_op(); +void chained_same_op(); +void negative_str(); +void subtensors(); +void print_tensor_c_cpp(int nmode, const int64_t *extents, const int64_t *strides, const std::complex<float> *data); + +int main(int argc, char const *argv[]) +{ + printf("Contraction: \n"); + contraction(); + printf("Hadamard: \n"); + hadamard(); + printf("Complex: \n"); + complex_num(); + printf("Conjugate: \n"); // Incorrect result with CuTensor, CuTensor applies opC in another way + conjugate(); + printf("Zero dim: \n"); + zero_dim(); + printf("One ext contracted: \n"); + one_ext_contracted(); + printf("One ext transfered: \n"); + one_ext_transfered(); + printf("Chained diff op: \n"); + chained_diff_op(); + printf("Chained same op: \n"); + chained_same_op(); + /*printf("Negative str: \n"); // Doesn't work with CuTensor, does not allow negative strides + negative_str();*/ + printf("Subtensors: \n"); + subtensors(); + return 0; +} + +void contraction() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 3; + int64_t extents_A[3] = {4, 3, 3}; + int64_t strides_A[3] = {1, 4, 12}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); + + int nmode_B = 4; + int64_t extents_B[4] = {3, 2, 2, 3}; + int64_t strides_B[4] = {1, 3, 6, 12}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); + + int nmode_C = 3; + int64_t extents_C[3] = {4, 2, 2}; + int64_t strides_C[3] = {1, 4, 8}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); + + int nmode_D = 3; + int64_t extents_D[3] = {4, 2, 2}; + int64_t strides_D[3] = {1, 4, 8}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, 
extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[3] = {'a', 'b', 'c'}; + int64_t idx_B[4] = {'c', 'd', 'e', 'b'}; + int64_t idx_C[3] = {'a', 'd', 'e'}; + int64_t idx_D[3] = {'a', 'd', 'e'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + // int exec_id = 1; + // exec = (intptr_t)&exec_id; + TAPP_status status; + + float alpha = 1; + + float A[36] = { + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1}; + + float B[36] = { + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6, + + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6, + + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6}; + + float beta = 0; + + float C[16] = { + 2, 4, 6, 8, + 2, 4, 6, 8, + + 2, 4, 6, 8, + 2, 4, 6, 8}; + + float D[16] = { + 1, 2, 3, 4, + 5, 6, 7, 8, + + 1, 2, 3, 4, + 5, 6, 7, 8}; + + void *A_d, *B_d, *C_d, *D_d; // Device pointers + cudaMalloc((void**)&A_d, 36 * sizeof(float)); + cudaMalloc((void**)&B_d, 36 * sizeof(float)); + cudaMalloc((void**)&C_d, 16 * sizeof(float)); + cudaMalloc((void**)&D_d, 16 * sizeof(float)); + + cudaMemcpy(A_d, (void*)A, 36 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B, 36 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_error error = TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d); + 
printf(TAPP_check_success(error) ? "Success\n" : "Fail\n"); + int message_len = TAPP_explain_error(error, 0, NULL); + char *message_buff = (char*)malloc((message_len + 1) * sizeof(char)); + TAPP_explain_error(error, message_len + 1, message_buff); + printf("%s", message_buff); + free(message_buff); + + cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost); + + print_tensor_s(nmode_D, extents_D, strides_D, D); + + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +void hadamard() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 2; + int64_t extents_A[2] = {4, 4}; + int64_t strides_A[2] = {1, 4}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); + + int nmode_B = 2; + int64_t extents_B[2] = {4, 4}; + int64_t strides_B[2] = {1, 4}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); + + int nmode_C = 2; + int64_t extents_C[2] = {4, 4}; + int64_t strides_C[2] = {1, 4}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); + + int nmode_D = 2; + int64_t extents_D[2] = {4, 4}; + int64_t strides_D[2] = {1, 4}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[2] = {'a', 'b'}; + int64_t idx_B[2] = {'a', 'b'}; + int64_t idx_C[2] = {'a', 'b'}; + int64_t idx_D[2] = {'a', 'b'}; + TAPP_prectype 
prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + float alpha = 3; + + float A[16] = { + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4}; + + float B[16] = { + 1, 1, 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, + 4, 4, 4, 4}; + + float beta = 2; + + float C[16] = { + 1, 2, 1, 2, + 1, 2, 1, 2, + 1, 2, 1, 2, + 1, 2, 1, 2}; + + float D[16] = { + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + }; + + void *A_d, *B_d, *C_d, *D_d; // Device pointers + cudaMalloc((void**)&A_d, 16 * sizeof(float)); + cudaMalloc((void**)&B_d, 16 * sizeof(float)); + cudaMalloc((void**)&C_d, 16 * sizeof(float)); + cudaMalloc((void**)&D_d, 16 * sizeof(float)); + + cudaMemcpy(A_d, (void*)A, 16 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B, 16 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d); + + cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost); + + print_tensor_s(nmode_D, extents_D, strides_D, D); + + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +void complex_num() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 2; + int64_t extents_A[2] = {3, 3}; + int64_t 
strides_A[2] = {1, 3}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_C32, nmode_A, extents_A, strides_A); + + int nmode_B = 2; + int64_t extents_B[2] = {3, 3}; + int64_t strides_B[2] = {1, 3}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_C32, nmode_B, extents_B, strides_B); + + int nmode_C = 2; + int64_t extents_C[2] = {3, 3}; + int64_t strides_C[2] = {1, 3}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_C32, nmode_C, extents_C, strides_C); + + int nmode_D = 2; + int64_t extents_D[2] = {3, 3}; + int64_t strides_D[2] = {1, 3}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_C32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[2] = {'a', 'b'}; + int64_t idx_B[2] = {'b', 'c'}; + int64_t idx_C[2] = {'a', 'c'}; + int64_t idx_D[2] = {'a', 'c'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + std::complex<float> alpha = 1; + + std::complex<float> A[9] = { + {1, 1}, {3, 2}, {5, 3}, + {1, 1}, {3, 2}, {5, 3}, + {1, 1}, {3, 2}, {5, 3}}; + + std::complex<float> B[9] = { + {1, 1}, {1, 1}, {1, 1}, + {2, 2}, {2, 2}, {2, 2}, + {3, 3}, {3, 3}, {3, 3}}; + + std::complex<float> beta = {0, 1}; + + std::complex<float> C[9] = { + {1, 2}, {2, 1}, {3, 1}, + {1, 2}, {2, 1}, {3, 1}, + {1, 2}, {2, 1}, {3, 1}}; + + std::complex<float> D[9] = { + {1, 1}, {2, 2}, {3, 3}, + {4, 4}, {5, 5}, {6, 6}, + {7, 7}, {8, 8}, {9, 2}}; + + void *A_d, *B_d, *C_d, *D_d; // Device pointers + cudaMalloc((void**)&A_d, 9 * sizeof(std::complex<float>)); + cudaMalloc((void**)&B_d, 9 * sizeof(std::complex<float>)); + cudaMalloc((void**)&C_d, 9 * 
sizeof(std::complex<float>)); + cudaMalloc((void**)&D_d, 9 * sizeof(std::complex<float>)); + + cudaMemcpy(A_d, (void*)A, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d); + + cudaMemcpy((void*)D, (void*)D_d, 9 * sizeof(std::complex<float>), cudaMemcpyDeviceToHost); + + print_tensor_c_cpp(nmode_D, extents_D, strides_D, D); + + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +void conjugate() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 2; + int64_t extents_A[2] = {3, 3}; + int64_t strides_A[2] = {1, 3}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_C32, nmode_A, extents_A, strides_A); + + int nmode_B = 2; + int64_t extents_B[2] = {3, 3}; + int64_t strides_B[2] = {1, 3}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_C32, nmode_B, extents_B, strides_B); + + int nmode_C = 2; + int64_t extents_C[2] = {3, 3}; + int64_t strides_C[2] = {1, 3}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_C32, nmode_C, extents_C, strides_C); + + int nmode_D = 2; + int64_t extents_D[2] = {3, 3}; + int64_t strides_D[2] = {1, 3}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_C32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + 
TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_CONJUGATE; + TAPP_element_op op_C = TAPP_CONJUGATE; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[2] = {'a', 'b'}; + int64_t idx_B[2] = {'b', 'c'}; + int64_t idx_C[2] = {'a', 'c'}; + int64_t idx_D[2] = {'a', 'c'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + std::complex<float> alpha = 1; + + std::complex<float> A[9] = { + {1, 1}, {3, 2}, {5, 3}, + {1, 1}, {3, 2}, {5, 3}, + {1, 1}, {3, 2}, {5, 3}}; + + std::complex<float> B[9] = { + {1, 1}, {1, 1}, {1, 1}, + {2, 2}, {2, 2}, {2, 2}, + {3, 3}, {3, 3}, {3, 3}}; + + std::complex<float> beta = {0, 1}; + + std::complex<float> C[9] = { + {1, 2}, {2, 1}, {3, 1}, + {1, 2}, {2, 1}, {3, 1}, + {1, 2}, {2, 1}, {3, 1}}; + + std::complex<float> D[9] = { + {1, 1}, {2, 2}, {3, 3}, + {4, 4}, {5, 5}, {6, 6}, + {7, 7}, {8, 8}, {9, 2}}; + + void *A_d, *B_d, *C_d, *D_d; // Device pointers + cudaMalloc((void**)&A_d, 9 * sizeof(std::complex<float>)); + cudaMalloc((void**)&B_d, 9 * sizeof(std::complex<float>)); + cudaMalloc((void**)&C_d, 9 * sizeof(std::complex<float>)); + cudaMalloc((void**)&D_d, 9 * sizeof(std::complex<float>)); + + cudaMemcpy(A_d, (void*)A, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d); + + cudaMemcpy((void*)D, (void*)D_d, 9 * sizeof(std::complex<float>), cudaMemcpyDeviceToHost); + + print_tensor_c_cpp(nmode_D, extents_D, strides_D, D); + + if (A_d) cudaFree(A_d); + if (B_d) 
cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +void zero_dim() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 0; + int64_t extents_A[0] = {}; + int64_t strides_A[0] = {}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); + + int nmode_B = 2; + int64_t extents_B[2] = {3, 3}; + int64_t strides_B[2] = {1, 3}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); + + int nmode_C = 2; + int64_t extents_C[2] = {3, 3}; + int64_t strides_C[2] = {1, 3}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); + + int nmode_D = 2; + int64_t extents_D[2] = {3, 3}; + int64_t strides_D[2] = {1, 3}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[0] = {}; + int64_t idx_B[2] = {'a', 'b'}; + int64_t idx_C[2] = {'a', 'b'}; + int64_t idx_D[2] = {'a', 'b'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + float alpha = 1; + + float A[1] = { + 5}; + + float B[9] = { + 1, 2, 3, + 4, 5, 6, + 7, 8, 9}; + + float beta = 0; + + float C[9] = { + 1, 1, 1, + 1, 1, 1, + 1, 1, 1}; + + float D[9] = { + 2, 2, 2, + 2, 2, 2, + 2, 2, 2}; + + void *A_d, *B_d, *C_d, 
*D_d; // Device pointers + cudaMalloc((void**)&A_d, 1 * sizeof(float)); + cudaMalloc((void**)&B_d, 9 * sizeof(float)); + cudaMalloc((void**)&C_d, 9 * sizeof(float)); + cudaMalloc((void**)&D_d, 9 * sizeof(float)); + + cudaMemcpy(A_d, (void*)A, 1 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B, 9 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 9 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d); + + cudaMemcpy((void*)D, (void*)D_d, 9 * sizeof(float), cudaMemcpyDeviceToHost); + + print_tensor_s(nmode_D, extents_D, strides_D, D); + + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +void one_ext_contracted() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 4; + int64_t extents_A[4] = {4, 1, 3, 3}; + int64_t strides_A[4] = {1, 4, 4, 12}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); + + int nmode_B = 5; + int64_t extents_B[5] = {3, 2, 1, 2, 3}; + int64_t strides_B[5] = {1, 3, 6, 6, 12}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); + + int nmode_C = 3; + int64_t extents_C[3] = {4, 2, 2}; + int64_t strides_C[3] = {1, 4, 8}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); + + int nmode_D = 3; + int64_t extents_D[3] = {4, 2, 2}; + int64_t strides_D[3] = 
{1, 4, 8}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[4] = {'a', 'b', 'c', 'd'}; + int64_t idx_B[5] = {'d', 'e', 'b', 'f', 'c'}; + int64_t idx_C[3] = {'a', 'e', 'f'}; + int64_t idx_D[3] = {'a', 'e', 'f'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + float alpha = 1; + + float A[36] = { + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1}; + + float B[36] = { + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6, + + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6, + + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6}; + + float beta = 0; + + float C[16] = { + 2, 4, 6, 8, + 2, 4, 6, 8, + + 2, 4, 6, 8, + 2, 4, 6, 8}; + + float D[16] = { + 1, 2, 3, 4, + 5, 6, 7, 8, + + 1, 2, 3, 4, + 5, 6, 7, 8}; + + void *A_d, *B_d, *C_d, *D_d; // Device pointers + cudaMalloc((void**)&A_d, 36 * sizeof(float)); + cudaMalloc((void**)&B_d, 36 * sizeof(float)); + cudaMalloc((void**)&C_d, 16 * sizeof(float)); + cudaMalloc((void**)&D_d, 16 * sizeof(float)); + + cudaMemcpy(A_d, (void*)A, 36 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B, 36 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void 
*)&beta, (void *)C_d, (void *)D_d); + + cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost); + + print_tensor_s(nmode_D, extents_D, strides_D, D); + + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +void one_ext_transfered() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 4; + int64_t extents_A[4] = {4, 1, 3, 3}; + int64_t strides_A[4] = {1, 4, 4, 12}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); + + int nmode_B = 4; + int64_t extents_B[4] = {3, 2, 2, 3}; + int64_t strides_B[4] = {1, 3, 6, 12}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); + + int nmode_C = 4; + int64_t extents_C[4] = {4, 1, 2, 2}; + int64_t strides_C[4] = {1, 4, 4, 8}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); + + int nmode_D = 4; + int64_t extents_D[4] = {4, 1, 2, 2}; + int64_t strides_D[4] = {1, 4, 4, 8}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[4] = {'a', 'b', 'c', 'd'}; + int64_t idx_B[4] = {'d', 'e', 'f', 'c'}; + int64_t idx_C[4] = {'a', 'b', 'e', 'f'}; + int64_t idx_D[4] = {'a', 'b', 'e', 'f'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + 
TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + float alpha = 1; + + float A[36] = { + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1}; + + float B[36] = { + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6, + + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6, + + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6}; + + float beta = 0; + + float C[16] = { + 2, 4, 6, 8, + 2, 4, 6, 8, + + 2, 4, 6, 8, + 2, 4, 6, 8}; + + float D[16] = { + 1, 2, 3, 4, + 5, 6, 7, 8, + + 1, 2, 3, 4, + 5, 6, 7, 8}; + + void *A_d, *B_d, *C_d, *D_d; // Device pointers + cudaMalloc((void**)&A_d, 36 * sizeof(float)); + cudaMalloc((void**)&B_d, 36 * sizeof(float)); + cudaMalloc((void**)&C_d, 16 * sizeof(float)); + cudaMalloc((void**)&D_d, 16 * sizeof(float)); + + cudaMemcpy(A_d, (void*)A, 36 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B, 36 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d); + + cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost); + + print_tensor_s(nmode_D, extents_D, strides_D, D); + + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +void chained_diff_op() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 3; + int64_t extents_A[3] = {4, 3, 3}; + 
int64_t strides_A[3] = {1, 4, 12}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); + + int nmode_B = 4; + int64_t extents_B[4] = {3, 2, 2, 3}; + int64_t strides_B[4] = {1, 3, 6, 12}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); + + int nmode_C = 3; + int64_t extents_C[3] = {4, 2, 2}; + int64_t strides_C[3] = {1, 4, 8}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); + + int nmode_D = 3; + int64_t extents_D[3] = {4, 2, 2}; + int64_t strides_D[3] = {1, 4, 8}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[3] = {'a', 'b', 'c'}; + int64_t idx_B[4] = {'c', 'd', 'e', 'b'}; + int64_t idx_C[3] = {'a', 'd', 'e'}; + int64_t idx_D[3] = {'a', 'd', 'e'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + float alpha = 2; + + float A[36] = { + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + + 1, 2, 1.01, -1, + 1, 2, 1.01, -1, + 1, 2, 1.01, -1}; + + float B[36] = { + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6, + + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6, + + 1, 1, 1, + 2, 2, 2, + + 3, 3, 3, + 6, 6, 6}; + + float beta = 0; + + float C[16] = { + 2, 4, 6, 8, + 2, 4, 6, 8, + + 2, 4, 6, 8, + 2, 4, 6, 8}; + + float D[16] = { + 1, 2, 3, 4, + 5, 6, 7, 8, + + 1, 2, 3, 4, + 5, 6, 7, 8}; + + void *A_d, *B_d, *C_d, *D_d; // Device pointers + cudaMalloc((void**)&A_d, 
36 * sizeof(float)); + cudaMalloc((void**)&B_d, 36 * sizeof(float)); + cudaMalloc((void**)&C_d, 16 * sizeof(float)); + cudaMalloc((void**)&D_d, 16 * sizeof(float)); + + cudaMemcpy(A_d, (void*)A, 36 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B, 36 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d); + + cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost); + + printf("\tOperation 1:\n"); + print_tensor_s(nmode_D, extents_D, strides_D, D); + + alpha = 0.5; + + int nmode_E = 3; + int64_t extents_E[3] = {4, 2, 2}; + int64_t strides_E[3] = {1, 4, 8}; + TAPP_tensor_info info_E; + TAPP_create_tensor_info(&info_E, handle, TAPP_F32, nmode_E, extents_E, strides_E); + + TAPP_tensor_product plan2; + TAPP_element_op op_E = TAPP_IDENTITY; + int64_t idx_E[3] = {'a', 'd', 'e'}; + TAPP_create_tensor_product(&plan2, handle, op_D, info_D, idx_D, op_C, info_C, idx_C, op_C, info_C, idx_C, op_E, info_E, idx_E, prec); + + float E[16] = { + 1, 2, 3, 4, + 5, 6, 7, 8, + + 1, 2, 3, 4, + 5, 6, 7, 8}; + + void* E_d; // Device pointer + cudaMalloc((void**)&E_d, 16 * sizeof(float)); + + cudaMemcpy(E_d, (void*)E, 16 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(E_d) % 128 == 0); + + TAPP_execute_product(plan2, exec, &status, (void *)&alpha, (void *)D_d, (void *)C_d, (void *)&beta, (void *)C_d, (void *)E_d); + + cudaMemcpy((void*)E, (void*)E_d, 16 * sizeof(float), cudaMemcpyDeviceToHost); + + printf("\tOperation 2:\n"); + print_tensor_s(nmode_E, extents_E, strides_E, E); + + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + if (E_d) 
cudaFree(E_d); + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_product(plan2); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_tensor_info(info_E); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +void chained_same_op() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 2; + int64_t extents_A[2] = {4, 4}; + int64_t strides_A[2] = {1, 4}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); + + int nmode_B = 2; + int64_t extents_B[2] = {4, 4}; + int64_t strides_B[2] = {1, 4}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); + + int nmode_C = 2; + int64_t extents_C[2] = {4, 4}; + int64_t strides_C[2] = {1, 4}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); + + int nmode_D = 2; + int64_t extents_D[2] = {4, 4}; + int64_t strides_D[2] = {1, 4}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[2] = {'a', 'b'}; + int64_t idx_B[2] = {'a', 'b'}; + int64_t idx_C[2] = {'a', 'b'}; + int64_t idx_D[2] = {'a', 'b'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + float alpha = 3; + + float A[16] = { + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4, + 1, 2, 3, 4}; + + float B[16] = { + 1, 1, 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, + 4, 4, 4, 4}; + + float beta = 2; + + float C[16] 
= { + 1, 2, 1, 2, + 1, 2, 1, 2, + 1, 2, 1, 2, + 1, 2, 1, 2}; + + float D[16] = { + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16}; + + void *A_d, *B_d, *C_d, *D_d; // Device pointers + cudaMalloc((void**)&A_d, 16 * sizeof(float)); + cudaMalloc((void**)&B_d, 16 * sizeof(float)); + cudaMalloc((void**)&C_d, 16 * sizeof(float)); + cudaMalloc((void**)&D_d, 16 * sizeof(float)); + + cudaMemcpy(A_d, (void*)A, 16 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B, 16 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d); + + cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost); + + printf("\tOperation 1:\n"); + print_tensor_s(nmode_D, extents_D, strides_D, D); + + alpha = 1; + beta = 2; + float E[16] = { + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + }; + + void* E_d; // Device pointer + cudaMalloc((void**)&E_d, 16 * sizeof(float)); + + cudaMemcpy(E_d, (void*)E, 16 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(E_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)D_d, (void *)&beta, (void *)C_d, (void *)E_d); + + cudaMemcpy((void*)E, (void*)E_d, 16 * sizeof(float), cudaMemcpyDeviceToHost); + + printf("\tOperation 2:\n"); + print_tensor_s(nmode_D, extents_D, strides_D, E); + + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + if (E_d) cudaFree(E_d); + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + 
TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +/*void negative_str() //cutensor does not support negative strides +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 3; + int64_t extents_A[3] = {4, 3, 3}; + int64_t strides_A[3] = {-1, -4, -12}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); + + int nmode_B = 4; + int64_t extents_B[4] = {3, 2, 2, 3}; + int64_t strides_B[4] = {-1, -3, -6, -12}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); + + int nmode_C = 3; + int64_t extents_C[3] = {4, 2, 2}; + int64_t strides_C[3] = {1, 4, 8}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); + + int nmode_D = 3; + int64_t extents_D[3] = {4, 2, 2}; + int64_t strides_D[3] = {1, 4, 8}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[3] = {'a', 'b', 'c'}; + int64_t idx_B[4] = {'c', 'd', 'e', 'b'}; + int64_t idx_C[3] = {'a', 'd', 'e'}; + int64_t idx_D[3] = {'a', 'd', 'e'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + float alpha = 1; + + float A[36] = { + -1, 1.01, 2, 1, + -1, 1.01, 2, 1, + -1, 1.01, 2, 1, + + -1, 1.01, 2, 1, + -1, 1.01, 2, 1, + -1, 1.01, 2, 1, + + -1, 1.01, 2, 1, + -1, 1.01, 2, 1, + -1, 1.01, 2, 1}; + + float B[36] = { + 6, 6, 6, + 3, 3, 3, + + 2, 2, 2, + 1, 1, 1, + + 6, 6, 6, + 3, 3, 3, + + 2, 2, 2, + 1, 1, 1, + + 6, 6, 6, + 3, 3, 3, + + 2, 2, 2, + 1, 1, 1}; + + 
float beta = 0; + + float C[16] = { + 2, 4, 6, 8, + 2, 4, 6, 8, + + 2, 4, 6, 8, + 2, 4, 6, 8}; + + float D[16] = { + 1, 2, 3, 4, + 5, 6, 7, 8, + + 1, 2, 3, 4, + 5, 6, 7, 8}; + + float *A_ptr = &A[35]; + float *B_ptr = &B[35]; + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D); + + print_tensor_s(nmode_D, extents_D, strides_D, D); + + TAPP_destroy_tensor_product(plan); + TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +}*/ + +void subtensors() +{ + TAPP_handle handle; + TAPP_create_handle(&handle); + + int nmode_A = 3; + int64_t extents_A[3] = {3, 2, 2}; + int64_t strides_A[3] = {1, 12, 24}; + TAPP_tensor_info info_A; + TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); + + int nmode_B = 3; + int64_t extents_B[3] = {2, 2, 3}; + int64_t strides_B[3] = {3, 6, 12}; + TAPP_tensor_info info_B; + TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); + + int nmode_C = 2; + int64_t extents_C[2] = {3, 3}; + int64_t strides_C[2] = {1, 3}; + TAPP_tensor_info info_C; + TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); + + int nmode_D = 2; + int64_t extents_D[2] = {3, 3}; + int64_t strides_D[2] = {1, 3}; + TAPP_tensor_info info_D; + TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); + + TAPP_tensor_product plan; + TAPP_element_op op_A = TAPP_IDENTITY; + TAPP_element_op op_B = TAPP_IDENTITY; + TAPP_element_op op_C = TAPP_IDENTITY; + TAPP_element_op op_D = TAPP_IDENTITY; + int64_t idx_A[3] = {'a', 'b', 'c'}; + int64_t idx_B[3] = {'b', 'c', 'd'}; + int64_t idx_C[2] = {'a', 'd'}; + int64_t idx_D[2] = {'a', 'd'}; + TAPP_prectype prec = TAPP_DEFAULT_PREC; + TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, 
info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + + TAPP_executor exec; + TAPP_create_executor(&exec); + TAPP_status status; + + float alpha = 1; + + float A[48] = { + 0, + 0, + 0, + 0, + 0, + 2, + 1.01, + -1, + 0, + 0, + 0, + 0, + + 0, + 0, + 0, + 0, + 0, + 2, + 1.01, + -1, + 0, + 0, + 0, + 0, + + 0, + 0, + 0, + 0, + 0, + 2, + 1.01, + -1, + 0, + 0, + 0, + 0, + + 0, + 0, + 0, + 0, + 0, + 2, + 1.01, + -1, + 0, + 0, + 0, + 0, + }; + + float B[36] = { + 0, 1, 0, + 0, 2, 0, + + 0, 3, 0, + 0, 4, 0, + + 0, 2, 0, + 0, 4, 0, + + 0, 6, 0, + 0, 8, 0, + + 0, 3, 0, + 0, 6, 0, + + 0, 9, 0, + 0, 12, 0}; + + float beta = 0.5; + + float C[9] = { + 2, 4, 6, + 2, 4, 6, + 2, 4, 6}; + + float D[12] = { + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12}; + + float *A_ptr = &A[5]; + + float *B_ptr = &B[1]; + + void *A_d, *B_d, *C_d, *D_d; // Device pointers + cudaMalloc((void**)&A_d, 43 * sizeof(float)); + cudaMalloc((void**)&B_d, 35 * sizeof(float)); + cudaMalloc((void**)&C_d, 9 * sizeof(float)); + cudaMalloc((void**)&D_d, 12 * sizeof(float)); + + cudaMemcpy(A_d, (void*)A_ptr, 43 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(B_d, (void*)B_ptr, 35 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(C_d, (void*)C, 9 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(D_d, (void*)D, 12 * sizeof(float), cudaMemcpyHostToDevice); + + assert(uintptr_t(A_d) % 128 == 0); + assert(uintptr_t(B_d) % 128 == 0); + assert(uintptr_t(C_d) % 128 == 0); + assert(uintptr_t(D_d) % 128 == 0); + + TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d); + + int64_t super_extents_D[2] = {4, 3}; + int64_t super_strides_D[2] = {1, 4}; + + cudaMemcpy((void*)D, (void*)D_d, 12 * sizeof(float), cudaMemcpyDeviceToHost); + print_tensor_s(nmode_D, super_extents_D, super_strides_D, D); + + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (D_d) cudaFree(D_d); + TAPP_destroy_tensor_product(plan); 
+ TAPP_destroy_tensor_info(info_A); + TAPP_destroy_tensor_info(info_B); + TAPP_destroy_tensor_info(info_C); + TAPP_destroy_tensor_info(info_D); + TAPP_destroy_executor(exec); + TAPP_destroy_handle(handle); +} + +void print_tensor_c_cpp(int nmode, const int64_t *extents, const int64_t *strides, const std::complex *data) +{ + int64_t *coords = (int64_t *)malloc(nmode * sizeof(int64_t)); + int64_t size = 1; + for (size_t i = 0; i < nmode; i++) + { + coords[i] = 0; + size *= extents[i]; + } + printf("\t"); + for (size_t j = 0; j < size; j++) + { + int64_t index = 0; + for (size_t i = 0; i < nmode; i++) + { + index += coords[i] * strides[i]; + } + printf("%.3f+%.3fi", data[index].real(), data[index].imag()); + + if (nmode <= 0) + continue; + + int k = 0; + do + { + if (k != 0) + { + printf("\n"); + if (j < size - 1) + { + printf("\t"); + } + } + else + { + printf(" "); + } + coords[k] = (coords[k] + 1) % extents[k]; + k++; + } while (coords[k - 1] == 0 && k < nmode); + } + free(coords); +} \ No newline at end of file diff --git a/test/demo.c b/test/demo.c index 3f26335..6cd6a42 100644 --- a/test/demo.c +++ b/test/demo.c @@ -10,6 +10,74 @@ #include #include #include +#ifdef TAPP_DYNAMIC_LAUNCH +#include // POSIX dynamic loading, TODO: fix for windows +#include +#endif + +#ifdef TAPP_DYNAMIC_LAUNCH +const char* path = "./cutensor_bindings/libtapp-cutensor.so"; +#endif + +void* dlhandle; +TAPP_error (*fn_TAPP_attr_set)(TAPP_attr attr, TAPP_key key, void* value); +TAPP_error (*fn_TAPP_attr_get)(TAPP_attr attr, TAPP_key key, void** value); +TAPP_error (*fn_TAPP_attr_clear)(TAPP_attr attr, TAPP_key key); +bool (*fn_TAPP_check_success)(TAPP_error error); +size_t (*fn_TAPP_explain_error)(TAPP_error error, size_t maxlen, char* message); +TAPP_error (*fn_TAPP_create_executor)(TAPP_executor* exec); +TAPP_error (*fn_TAPP_destroy_executor)(TAPP_executor exec); +TAPP_error (*fn_TAPP_create_handle)(TAPP_handle* handle); +TAPP_error (*fn_TAPP_destroy_handle)(TAPP_handle handle); 
+TAPP_error (*fn_TAPP_create_tensor_product)(TAPP_tensor_product* plan, + TAPP_handle handle, + TAPP_element_op op_A, + TAPP_tensor_info A, + const int64_t* idx_A, + TAPP_element_op op_B, + TAPP_tensor_info B, + const int64_t* idx_B, + TAPP_element_op op_C, + TAPP_tensor_info C, + const int64_t* idx_C, + TAPP_element_op op_D, + TAPP_tensor_info D, + const int64_t* idx_D, + TAPP_prectype prec); +TAPP_error (*fn_TAPP_destroy_tensor_product)(TAPP_tensor_product plan); +TAPP_error (*fn_TAPP_execute_product)(TAPP_tensor_product plan, + TAPP_executor exec, + TAPP_status* status, + const void* alpha, + const void* A, + const void* B, + const void* beta, + const void* C, + void* D); +TAPP_error (*fn_TAPP_execute_batched_product)(TAPP_tensor_product plan, + TAPP_executor exec, + TAPP_status* status, + int num_batches, + const void* alpha, + const void** A, + const void** B, + const void* beta, + const void** C, + void** D); +TAPP_error (*fn_TAPP_destroy_status)(TAPP_status status); +TAPP_error (*fn_TAPP_create_tensor_info)(TAPP_tensor_info* info, + TAPP_handle handle, + TAPP_datatype type, + int nmode, + const int64_t* extents, + const int64_t* strides); +TAPP_error (*fn_TAPP_destroy_tensor_info)(TAPP_tensor_info info); +int (*fn_TAPP_get_nmodes)(TAPP_tensor_info info); +TAPP_error (*fn_TAPP_set_nmodes)(TAPP_tensor_info info, int nmodes); +void (*fn_TAPP_get_extents)(TAPP_tensor_info info, int64_t* extents); +TAPP_error (*fn_TAPP_set_extents)(TAPP_tensor_info info, const int64_t* extents); +void (*fn_TAPP_get_strides)(TAPP_tensor_info info, int64_t* strides); +TAPP_error (*fn_TAPP_set_strides)(TAPP_tensor_info info, const int64_t* strides); void contraction(); void hadamard(); @@ -23,15 +91,88 @@ void chained_same_op(); void negative_str(); void subtensors(); +void load_implementation() { +#ifdef TAPP_DYNAMIC_LAUNCH + dlhandle = dlopen(path, RTLD_LAZY); + if (!dlhandle) { + fprintf(stderr, "dlopen failed: %s\n", dlerror()); + return; + } + dlerror(); + 
*(void**)(&fn_TAPP_attr_set) = dlsym(dlhandle, "TAPP_attr_set"); + *(void**)(&fn_TAPP_attr_get) = dlsym(dlhandle, "TAPP_attr_get"); + *(void**)(&fn_TAPP_attr_clear) = dlsym(dlhandle, "TAPP_attr_clear"); + *(void**)(&fn_TAPP_check_success) = dlsym(dlhandle, "TAPP_check_success"); + *(void**)(&fn_TAPP_explain_error) = dlsym(dlhandle, "TAPP_explain_error"); + *(void**)(&fn_TAPP_create_executor) = dlsym(dlhandle, "TAPP_create_executor"); + *(void**)(&fn_TAPP_destroy_executor) = dlsym(dlhandle, "TAPP_destroy_executor"); + *(void**)(&fn_TAPP_create_handle) = dlsym(dlhandle, "TAPP_create_handle"); + *(void**)(&fn_TAPP_destroy_handle) = dlsym(dlhandle, "TAPP_destroy_handle"); + *(void**)(&fn_TAPP_create_tensor_product) = dlsym(dlhandle, "TAPP_create_tensor_product"); + *(void**)(&fn_TAPP_destroy_tensor_product) = dlsym(dlhandle, "TAPP_destroy_tensor_product"); + *(void**)(&fn_TAPP_execute_product) = dlsym(dlhandle, "TAPP_execute_product"); + *(void**)(&fn_TAPP_execute_batched_product) = dlsym(dlhandle, "TAPP_execute_batched_product"); + *(void**)(&fn_TAPP_destroy_status) = dlsym(dlhandle, "TAPP_destroy_status"); + *(void**)(&fn_TAPP_create_tensor_info) = dlsym(dlhandle, "TAPP_create_tensor_info"); + *(void**)(&fn_TAPP_destroy_tensor_info) = dlsym(dlhandle, "TAPP_destroy_tensor_info"); + *(void**)(&fn_TAPP_get_nmodes) = dlsym(dlhandle, "TAPP_get_nmodes"); + *(void**)(&fn_TAPP_set_nmodes) = dlsym(dlhandle, "TAPP_set_nmodes"); + *(void**)(&fn_TAPP_get_extents) = dlsym(dlhandle, "TAPP_get_extents"); + *(void**)(&fn_TAPP_set_extents) = dlsym(dlhandle, "TAPP_set_extents"); + *(void**)(&fn_TAPP_get_strides) = dlsym(dlhandle, "TAPP_get_strides"); + *(void**)(&fn_TAPP_set_strides) = dlsym(dlhandle, "TAPP_set_strides"); + const char* error = dlerror(); + if (error != NULL) { + fprintf(stderr, "dlsym failed: %s\n", error); + dlclose(dlhandle); + return; + } +#else + //fn_TAPP_attr_set = TAPP_attr_set; Not implemented in the reference implementation + //fn_TAPP_attr_get = 
TAPP_attr_get; Not implemented in the reference implementation + //fn_TAPP_attr_clear = TAPP_attr_clear; Not implemented in the reference implementation + fn_TAPP_check_success = TAPP_check_success; + fn_TAPP_explain_error = TAPP_explain_error; + fn_TAPP_create_executor = TAPP_create_executor; + fn_TAPP_destroy_executor = TAPP_destroy_executor; + fn_TAPP_create_handle = TAPP_create_handle; + fn_TAPP_destroy_handle = TAPP_destroy_handle; + fn_TAPP_create_tensor_product = TAPP_create_tensor_product; + fn_TAPP_destroy_tensor_product = TAPP_destroy_tensor_product; + fn_TAPP_execute_product = TAPP_execute_product; + //fn_TAPP_execute_batched_product = TAPP_execute_batched_product; Not implemented in the reference implementation + //fn_TAPP_destroy_status = TAPP_destroy_status; Not implemented in the reference implementation + fn_TAPP_create_tensor_info = TAPP_create_tensor_info; + fn_TAPP_destroy_tensor_info = TAPP_destroy_tensor_info; + fn_TAPP_get_nmodes = TAPP_get_nmodes; + fn_TAPP_set_nmodes = TAPP_set_nmodes; + fn_TAPP_get_extents = TAPP_get_extents; + fn_TAPP_set_extents = TAPP_set_extents; + fn_TAPP_get_strides = TAPP_get_strides; + fn_TAPP_set_strides = TAPP_set_strides; +#endif +} + +#ifdef TAPP_DYNAMIC_LAUNCH +void unload_implementation() { + if (dlhandle) { + dlclose(dlhandle); + dlhandle = NULL; + } +} +#endif + int main(int argc, char const *argv[]) { + load_implementation(); + printf("Contraction: \n"); contraction(); printf("Hadamard: \n"); hadamard(); printf("Complex: \n"); complex_num(); - printf("Conjugate: \n"); + printf("Conjugate: \n"); // Incorrect result with CuTensor, CuTensor applies opC in another way conjugate(); printf("Zero dim: \n"); zero_dim(); @@ -43,40 +184,51 @@ int main(int argc, char const *argv[]) chained_diff_op(); printf("Chained same op: \n"); chained_same_op(); - printf("Negative str: \n"); + printf("Negative str: \n"); // Doesn't work with CuTensor, does not allow negative strides negative_str(); printf("Subtensors: \n"); 
subtensors(); + +#ifdef TAPP_DYNAMIC_LAUNCH + unload_implementation(); +#endif + return 0; } void contraction() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 3; int64_t extents_A[3] = {4, 3, 3}; int64_t strides_A[3] = {1, 4, 12}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); int nmode_B = 4; int64_t extents_B[4] = {3, 2, 2, 3}; int64_t strides_B[4] = {1, 3, 6, 12}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); int nmode_C = 3; int64_t extents_C[3] = {4, 2, 2}; int64_t strides_C[3] = {1, 4, 8}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); int nmode_D = 3; int64_t extents_D[3] = {4, 2, 2}; int64_t strides_D[3] = {1, 4, 8}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_handle handle; + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -87,10 +239,10 @@ void contraction() int64_t idx_C[3] = {'a', 'd', 'e'}; int64_t idx_D[3] = {'a', 'd', 'e'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, 
op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); // int exec_id = 1; // exec = (intptr_t)&exec_id; TAPP_status status; @@ -145,51 +297,59 @@ void contraction() 1, 2, 3, 4, 5, 6, 7, 8}; - TAPP_error error = TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); - printf(TAPP_check_success(error) ? "Success\n" : "Fail\n"); - int message_len = TAPP_explain_error(error, 0, NULL); + TAPP_error error = fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); + printf(fn_TAPP_check_success(error) ? "Success\n" : "Fail\n"); + int message_len = fn_TAPP_explain_error(error, 0, NULL); char *message_buff = malloc((message_len + 1) * sizeof(char)); - TAPP_explain_error(error, message_len + 1, message_buff); - printf(message_buff); + fn_TAPP_explain_error(error, message_len + 1, message_buff); + printf("%s", message_buff); free(message_buff); print_tensor_s(nmode_D, extents_D, strides_D, D); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void hadamard() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 2; int64_t extents_A[2] = {4, 4}; int64_t strides_A[2] = {1, 4}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, 
strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); int nmode_B = 2; int64_t extents_B[2] = {4, 4}; int64_t strides_B[2] = {1, 4}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); int nmode_C = 2; int64_t extents_C[2] = {4, 4}; int64_t strides_C[2] = {1, 4}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); int nmode_D = 2; int64_t extents_D[2] = {4, 4}; int64_t strides_D[2] = {1, 4}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -200,10 +360,10 @@ void hadamard() int64_t idx_C[2] = {'a', 'b'}; int64_t idx_D[2] = {'a', 'b'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); TAPP_status status; float alpha = 3; @@ -247,45 +407,53 @@ void hadamard() 16, }; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); print_tensor_s(nmode_D, extents_D, strides_D, D); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - 
TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void complex_num() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 2; int64_t extents_A[2] = {3, 3}; int64_t strides_A[2] = {1, 3}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_C32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_C32, nmode_A, extents_A, strides_A); int nmode_B = 2; int64_t extents_B[2] = {3, 3}; int64_t strides_B[2] = {1, 3}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_C32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_C32, nmode_B, extents_B, strides_B); int nmode_C = 2; int64_t extents_C[2] = {3, 3}; int64_t strides_C[2] = {1, 3}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_C32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_C32, nmode_C, extents_C, strides_C); int nmode_D = 2; int64_t extents_D[2] = {3, 3}; int64_t strides_D[2] = {1, 3}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_C32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_C32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -296,10 +464,10 @@ void complex_num() int64_t idx_C[2] = {'a', 'c'}; int64_t idx_D[2] = {'a', 'c'}; TAPP_prectype prec = 
TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); TAPP_status status; float complex alpha = 1; @@ -326,45 +494,53 @@ void complex_num() 4 + 4 * I, 5 + 5 * I, 6 + 6 * I, 7 + 7 * I, 8 + 8 * I, 9 + 2 * I}; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); print_tensor_c(nmode_D, extents_D, strides_D, D); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void conjugate() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 2; int64_t extents_A[2] = {3, 3}; int64_t strides_A[2] = {1, 3}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_C32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_C32, nmode_A, extents_A, strides_A); int nmode_B = 2; int64_t extents_B[2] = {3, 3}; int64_t strides_B[2] = {1, 3}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_C32, nmode_B, 
extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_C32, nmode_B, extents_B, strides_B); int nmode_C = 2; int64_t extents_C[2] = {3, 3}; int64_t strides_C[2] = {1, 3}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_C32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_C32, nmode_C, extents_C, strides_C); int nmode_D = 2; int64_t extents_D[2] = {3, 3}; int64_t strides_D[2] = {1, 3}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_C32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_C32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_CONJUGATE; @@ -375,10 +551,10 @@ void conjugate() int64_t idx_C[2] = {'a', 'c'}; int64_t idx_D[2] = {'a', 'c'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); TAPP_status status; float complex alpha = 1; @@ -405,45 +581,53 @@ void conjugate() 4 + 4 * I, 5 + 5 * I, 6 + 6 * I, 7 + 7 * I, 8 + 8 * I, 9 + 2 * I}; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); print_tensor_c(nmode_D, extents_D, strides_D, D); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + 
fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void zero_dim() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 0; int64_t extents_A[0] = {}; int64_t strides_A[0] = {}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); int nmode_B = 2; int64_t extents_B[2] = {3, 3}; int64_t strides_B[2] = {1, 3}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); int nmode_C = 2; int64_t extents_C[2] = {3, 3}; int64_t strides_C[2] = {1, 3}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); int nmode_D = 2; int64_t extents_D[2] = {3, 3}; int64_t strides_D[2] = {1, 3}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -454,10 +638,10 @@ void zero_dim() int64_t idx_C[2] = {'a', 'b'}; int64_t idx_D[2] = {'a', 'b'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, 
handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); TAPP_status status; float alpha = 1; @@ -482,45 +666,53 @@ void zero_dim() 2, 2, 2, 2, 2, 2}; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); print_tensor_s(nmode_D, extents_D, strides_D, D); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void one_ext_contracted() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 4; int64_t extents_A[4] = {4, 1, 3, 3}; int64_t strides_A[4] = {1, 4, 4, 12}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); int nmode_B = 5; int64_t extents_B[5] = {3, 2, 1, 2, 3}; int64_t strides_B[5] = {1, 3, 6, 6, 12}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); int nmode_C = 3; int64_t extents_C[3] = {4, 2, 2}; int64_t strides_C[3] = {1, 4, 8}; TAPP_tensor_info 
info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); int nmode_D = 3; int64_t extents_D[3] = {4, 2, 2}; int64_t strides_D[3] = {1, 4, 8}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -531,10 +723,10 @@ void one_ext_contracted() int64_t idx_C[3] = {'a', 'e', 'f'}; int64_t idx_D[3] = {'a', 'e', 'f'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); TAPP_status status; float alpha = 1; @@ -587,45 +779,53 @@ void one_ext_contracted() 1, 2, 3, 4, 5, 6, 7, 8}; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); print_tensor_s(nmode_D, extents_D, strides_D, D); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void one_ext_transfered() { 
+ TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 4; int64_t extents_A[4] = {4, 1, 3, 3}; int64_t strides_A[4] = {1, 4, 4, 12}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); int nmode_B = 4; int64_t extents_B[4] = {3, 2, 2, 3}; int64_t strides_B[4] = {1, 3, 6, 12}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); int nmode_C = 4; int64_t extents_C[4] = {4, 1, 2, 2}; int64_t strides_C[4] = {1, 4, 4, 8}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); int nmode_D = 4; int64_t extents_D[4] = {4, 1, 2, 2}; int64_t strides_D[4] = {1, 4, 4, 8}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -636,10 +836,10 @@ void one_ext_transfered() int64_t idx_C[4] = {'a', 'b', 'e', 'f'}; int64_t idx_D[4] = {'a', 'b', 'e', 'f'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + 
fn_TAPP_create_executor(&exec); TAPP_status status; float alpha = 1; @@ -692,45 +892,53 @@ void one_ext_transfered() 1, 2, 3, 4, 5, 6, 7, 8}; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); print_tensor_s(nmode_D, extents_D, strides_D, D); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void chained_diff_op() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 3; int64_t extents_A[3] = {4, 3, 3}; int64_t strides_A[3] = {1, 4, 12}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); int nmode_B = 4; int64_t extents_B[4] = {3, 2, 2, 3}; int64_t strides_B[4] = {1, 3, 6, 12}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); int nmode_C = 3; int64_t extents_C[3] = {4, 2, 2}; int64_t strides_C[3] = {1, 4, 8}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, 
extents_C, strides_C); int nmode_D = 3; int64_t extents_D[3] = {4, 2, 2}; int64_t strides_D[3] = {1, 4, 8}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -741,10 +949,10 @@ void chained_diff_op() int64_t idx_C[3] = {'a', 'd', 'e'}; int64_t idx_D[3] = {'a', 'd', 'e'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); TAPP_status status; float alpha = 2; @@ -797,7 +1005,7 @@ void chained_diff_op() 1, 2, 3, 4, 5, 6, 7, 8}; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); printf("\tOperation 1:\n"); print_tensor_s(nmode_D, extents_D, strides_D, D); @@ -808,12 +1016,12 @@ void chained_diff_op() int64_t extents_E[3] = {4, 2, 2}; int64_t strides_E[3] = {1, 4, 8}; TAPP_tensor_info info_E; - TAPP_create_tensor_info(&info_E, TAPP_F32, nmode_E, extents_E, strides_E); + fn_TAPP_create_tensor_info(&info_E, handle, TAPP_F32, nmode_E, extents_E, strides_E); TAPP_tensor_product plan2; TAPP_element_op op_E = TAPP_IDENTITY; int64_t idx_E[3] = {'a', 'd', 'e'}; - TAPP_create_tensor_product(&plan2, handle, op_D, info_D, idx_D, op_C, info_C, idx_C, op_C, info_C, idx_C, op_E, info_E, idx_E, prec); + fn_TAPP_create_tensor_product(&plan2, handle, op_D, info_D, idx_D, op_C, info_C, idx_C, 
op_C, info_C, idx_C, op_E, info_E, idx_E, prec); float E[16] = { 1, 2, 3, 4, @@ -821,48 +1029,56 @@ void chained_diff_op() 1, 2, 3, 4, 5, 6, 7, 8}; - TAPP_execute_product(plan2, exec, &status, (void *)&alpha, (void *)D, (void *)C, (void *)&beta, (void *)C, (void *)E); + fn_TAPP_execute_product(plan2, exec, &status, (void *)&alpha, (void *)D, (void *)C, (void *)&beta, (void *)C, (void *)E); printf("\tOperation 2:\n"); print_tensor_s(nmode_E, extents_E, strides_E, E); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_product(plan2); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_tensor_info(info_E); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_product(plan2); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_tensor_info(info_E); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void chained_same_op() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 2; int64_t extents_A[2] = {4, 4}; int64_t strides_A[2] = {1, 4}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); int nmode_B = 2; int64_t extents_B[2] = {4, 4}; int64_t strides_B[2] = {1, 4}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); int nmode_C = 2; int64_t extents_C[2] = {4, 4}; int64_t strides_C[2] = {1, 4}; 
TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); int nmode_D = 2; int64_t extents_D[2] = {4, 4}; int64_t strides_D[2] = {1, 4}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -873,10 +1089,10 @@ void chained_same_op() int64_t idx_C[2] = {'a', 'b'}; int64_t idx_D[2] = {'a', 'b'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); TAPP_status status; float alpha = 3; @@ -907,7 +1123,7 @@ void chained_same_op() 9, 10, 11, 12, 13, 14, 15, 16}; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); printf("\tOperation 1:\n"); print_tensor_s(nmode_D, extents_D, strides_D, D); @@ -932,46 +1148,54 @@ void chained_same_op() 15, 16, }; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)D, (void *)&beta, (void *)C, (void *)E); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)D, (void *)&beta, (void *)C, (void *)E); printf("\tOperation 2:\n"); print_tensor_s(nmode_D, extents_D, strides_D, E); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - 
TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void negative_str() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 3; int64_t extents_A[3] = {4, 3, 3}; int64_t strides_A[3] = {-1, -4, -12}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); int nmode_B = 4; int64_t extents_B[4] = {3, 2, 2, 3}; int64_t strides_B[4] = {-1, -3, -6, -12}; TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); int nmode_C = 3; int64_t extents_C[3] = {4, 2, 2}; int64_t strides_C[3] = {1, 4, 8}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); int nmode_D = 3; int64_t extents_D[3] = {4, 2, 2}; int64_t strides_D[3] = {1, 4, 8}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -982,10 +1206,10 @@ void negative_str() int64_t idx_C[3] = {'a', 'd', 'e'}; 
int64_t idx_D[3] = {'a', 'd', 'e'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); TAPP_status status; float alpha = 1; @@ -1041,45 +1265,53 @@ void negative_str() float *A_ptr = &A[35]; float *B_ptr = &B[35]; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D); print_tensor_s(nmode_D, extents_D, strides_D, D); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } void subtensors() { + TAPP_handle handle; + fn_TAPP_create_handle(&handle); + +#ifdef TAPP_DYNAMIC_LAUNCH + bool use_device_memory = false; // CuTensor specific attribute + fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute +#endif + int nmode_A = 3; int64_t extents_A[3] = {3, 2, 2}; int64_t strides_A[3] = {1, 12, 24}; TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); + fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); int nmode_B = 3; int64_t extents_B[3] = {2, 2, 3}; int64_t strides_B[3] = {3, 6, 12}; TAPP_tensor_info 
info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); + fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B); int nmode_C = 2; int64_t extents_C[2] = {3, 3}; int64_t strides_C[2] = {1, 3}; TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); + fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C); int nmode_D = 2; int64_t extents_D[2] = {3, 3}; - int64_t strides_D[2] = {1, 4}; + int64_t strides_D[2] = {1, 3}; TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_handle handle; TAPP_tensor_product plan; TAPP_element_op op_A = TAPP_IDENTITY; TAPP_element_op op_B = TAPP_IDENTITY; @@ -1090,10 +1322,10 @@ void subtensors() int64_t idx_C[2] = {'a', 'd'}; int64_t idx_D[2] = {'a', 'd'}; TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); + fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); TAPP_executor exec; - TAPP_create_executor(&exec); + fn_TAPP_create_executor(&exec); TAPP_status status; float alpha = 1; @@ -1187,16 +1419,17 @@ void subtensors() float *B_ptr = &B[1]; - TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D); + fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D); int64_t super_extents_D[2] = {4, 3}; int64_t super_strides_D[2] = {1, 4}; print_tensor_s(nmode_D, super_extents_D, super_strides_D, D); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - 
TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); + fn_TAPP_destroy_tensor_product(plan); + fn_TAPP_destroy_tensor_info(info_A); + fn_TAPP_destroy_tensor_info(info_B); + fn_TAPP_destroy_tensor_info(info_C); + fn_TAPP_destroy_tensor_info(info_D); + fn_TAPP_destroy_executor(exec); + fn_TAPP_destroy_handle(handle); } \ No newline at end of file diff --git a/test/exercise.c b/test/exercise.c deleted file mode 100644 index 31a5baa..0000000 --- a/test/exercise.c +++ /dev/null @@ -1,207 +0,0 @@ -#include - -#include "helpers.h" -#include -#include -#include - -int main(int argc, char const *argv[]) -{ - /* - * Create the tensor structures for tensor A, B, C and D. - * Tensor A 3 dimensional tensor with the extents 4, 3, 2, and the strides 1, 4, 12. - * Tensor B 3 dimensional tensor with the extents 3, 2, 4, and the strides 1, 3, 6. - * Tensor C 2 dimensional tensor with the extents 3, 3, and the strides 1, 3. - * Tensor D 2 dimensional tensor with the extents 3, 3, and the strides 1, 3. 
- */ - - // Tensor A - // Assign the number of indices - /* Remove */ int nmode_A = 3; - - // Assign the extents - /* Remove */ int64_t extents_A[3] = {4, 3, 2}; - - // Assign the strides - /* Remove */ int64_t strides_A[3] = {1, 4, 12}; - - // Declare the tensor structure variable - /* Remove */ TAPP_tensor_info info_A; - - // Assign the structure to the variable - /* Remove */ TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - - // Tensor B - /* Remove */ int nmode_B = 3; - /* Remove */ int64_t extents_B[3] = {3, 2, 4}; - /* Remove */ int64_t strides_B[3] = {1, 3, 6}; - /* Remove */ TAPP_tensor_info info_B; - /* Remove */ TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - - // Tensor C - /* Remove */ int nmode_C = 2; - /* Remove */ int64_t extents_C[2] = {3, 3}; - /* Remove */ int64_t strides_C[2] = {1, 3}; - /* Remove */ TAPP_tensor_info info_C; - /* Remove */ TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - - // Tensor D - /* Remove */ int nmode_D = 2; - /* Remove */ int64_t extents_D[2] = {3, 3}; - /* Remove */ int64_t strides_D[2] = {1, 3}; - /* Remove */ TAPP_tensor_info info_D; - /* Remove */ TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - - /* - * Assign the options for the calculation. - * The precision used will be the default precision. - * The elemental operations should be the identity one (doesn't really matter since this exercise doesn't use complex numbers). - * The operation that should be executed is: - * Contraction between the first index for tensor A and third index for tensor B. - * Contraction between the third index for tensor A and second index for tensor B. - * The second index for A and the first index for B are free indices, in that order. 
- */ - - // Declare handle (no assignment) - /* Remove */ TAPP_handle handle; - - // Initialize the precision - /* Remove */ TAPP_prectype prec = TAPP_DEFAULT_PREC; - - // Initialize the elemental operations for each of the tensors - /* Remove */ TAPP_element_op op_A = TAPP_IDENTITY; - /* Remove */ TAPP_element_op op_B = TAPP_IDENTITY; - /* Remove */ TAPP_element_op op_C = TAPP_IDENTITY; - /* Remove */ TAPP_element_op op_D = TAPP_IDENTITY; - - // Create ths indicies arrays for each of the tensor - /* Remove */ int64_t idx_A[3] = {'a', 'b', 'c'}; - /* Remove */ int64_t idx_B[3] = {'d', 'c', 'a'}; - /* Remove */ int64_t idx_C[2] = {'b', 'd'}; - /* Remove */ int64_t idx_D[2] = {'b', 'd'}; - - // Declare plan - /* Remove */ TAPP_tensor_product plan; - - // Create plan/Assign the options to the plan - /* Remove */ TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); - - // Declare executor - /* Remove */ TAPP_executor exec; - - // Create executor - TAPP_create_executor(&exec); - - // Declare status object - /* Remove */ TAPP_status status; - - - /* - * Assign data for the execution - */ - - // Initialize alpha - float alpha = 3; - - // Initialize data for tensor A - float A[24] = { - 1, 2, 1.01, -1, - 1, 2, 1.01, -1, - 1, 2, 1.01, -1, - - 1, 2, 1.01, -1, - 1, 2, 1.01, -1, - 1, 2, 1.01, -1}; - - // Initialize data for tensor B - float B[24] = { - 1, 1, 1, - 2, 2, 2, - - 3, 3, 3, - 6, 6, 6, - - 1, 1, 1, - 2, 2, 2, - - 3, 3, 3, - 6, 6, 6}; - - // Initialize beta - float beta = 2; - - // Initialize data for tensor C - float C[9] = { - 4, 4, 8, - 4, 8, 8, - 8, 8, 8}; - - // Initialize data for tensor D - float D[9] = { - 2, 3, 4, - 5, 6, 7, - 9, 1, 2}; - - - /* - * Run the execution - */ - - // Call the execution function - /* Remove */TAPP_error error = TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D); - - - /* - * Print results 
- */ - - // Check if the execution was successful - bool success = /* Remove */ TAPP_check_success(error); - - // Print if the execution was successful - printf(success ? "Success\n" : "Fail\n"); - - // Get the length of the error message - /* Remove */ int message_len = TAPP_explain_error(error, 0, NULL); - - // Create a buffer to hold the message + 1 character for null terminator - /* Remove */ char* message_buff = malloc((message_len + 1) * sizeof(char)); - - // Fetch error message - /* Remove */ TAPP_explain_error(error, message_len + 1, message_buff); - - // Print error message - printf("%s", message_buff); - printf("\n"); - - // Print the output - print_tensor_s(nmode_D, extents_D, strides_D, D); - - - /* - * Free data - */ - - // Free buffer - free(message_buff); - - // Destroy structures - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); - - /* - * Expected output: - Success - Success. 
- 53.090 53.090 61.090 - 53.090 61.090 61.090 - 61.090 61.090 61.090 - */ - - return 0; -} diff --git a/test/helpers.h b/test/helpers.h index 0e6cbc8..eb062e2 100644 --- a/test/helpers.h +++ b/test/helpers.h @@ -8,4 +8,4 @@ #include void print_tensor_s(int nmode, const int64_t *extents, const int64_t *strides, const float *data); -void print_tensor_c(int nmode, const int64_t *extents, const int64_t *strides, const float complex *data); +void print_tensor_c(int nmode, const int64_t *extents, const int64_t *strides, const float _Complex *data); diff --git a/test/test.c b/test/test.c deleted file mode 100644 index d8c0134..0000000 --- a/test/test.c +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Niklas Hörnblad - * Paolo Bientinesi - * Umeå University - June 2024 - */ - -#include - -#include -#include - -int main(int argc, char const *argv[]) -{ - int nmode_A = 3; - int64_t extents_A[3] = {4, 3, 3}; - int64_t strides_A[3] = {1, 4, 12}; - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - - int nmode_B = 4; - int64_t extents_B[4] = {3, 2, 2, 3}; - int64_t strides_B[4] = {1, 3, 6, 12}; - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - - int nmode_C = 2; - int64_t extents_C[2] = {4, 2}; - int64_t strides_C[2] = {1, 4}; - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - - int nmode_D = 2; - int64_t extents_D[2] = {4, 2}; - int64_t strides_D[2] = {1, 4}; - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_handle handle; - TAPP_tensor_product plan; - TAPP_element_op op_A = TAPP_IDENTITY; - TAPP_element_op op_B = TAPP_IDENTITY; - TAPP_element_op op_C = TAPP_IDENTITY; - TAPP_element_op op_D = TAPP_IDENTITY; - int64_t idx_A[3] = {'a', 'b', 'c'}; - int64_t idx_B[4] = {'c', 'd', 'e', 'b'}; - int64_t idx_C[2] = {'a', 'd'}; - int64_t idx_D[3] = {'a', 'd'}; - 
TAPP_prectype prec = TAPP_DEFAULT_PREC; - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec); - - TAPP_executor exec; - TAPP_create_executor(&exec); - TAPP_status status; - - float alpha = 1; - - float A[36] = { - 1, 2, 1.01, -1, - 1, 2, 1.01, -1, - 1, 2, 1.01, -1, - - 1, 2, 1.01, -1, - 1, 2, 1.01, -1, - 1, 2, 1.01, -1, - - 1, 2, 1.01, -1, - 1, 2, 1.01, -1, - 1, 2, 1.01, -1 - }; - - float B[36] = { - 1, 1, 1, - 2, 2, 2, - - 3, 3, 3, - 6, 6, 6, - - - 1, 1, 1, - 2, 2, 2, - - 3, 3, 3, - 6, 6, 6, - - - 1, 1, 1, - 2, 2, 2, - - 3, 3, 3, - 6, 6, 6 - }; - - float beta = 0; - - float C[16] = { - 2, 4, 6, 8, - 2, 4, 6, 8, - - 2, 4, 6, 8, - 2, 4, 6, 8 - }; - - float D[16] = { - 1, 2, 3, 4, - 5, 6, 7, 8, - - 1, 2, 3, 4, - 5, 6, 7, 8 - }; - - TAPP_error error = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - printf(TAPP_check_success(error) ? "Success\n" : "Fail\n"); - int message_len = TAPP_explain_error(error, 0, NULL); - char* message_buff = malloc((message_len + 1) * sizeof(char)); - TAPP_explain_error(error, message_len + 1, message_buff); - printf(message_buff); - free(message_buff); - - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_executor(exec); - return 0; -} diff --git a/test/test.cpp b/test/test.cpp index e28b3d8..31d9e2f 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -6,13 +6,16 @@ #include "test.h" -// TODO replace by #include of when possible -extern "C" { - extern void bli_init(); - extern void bli_finalize(); +unsigned int current_rand_seed = 1; + +#ifdef TAPP_DYNAMIC_LAUNCH +// TODO include ATTR_KEY_USE_DEVICE_MEMORY from cutensor_bindings attributes header +bool use_device_memory = false; // Global variable to control device memory usage in tests +inline void 
set_use_device_memory(struct impl& impl, TAPP_handle handle) { + impl.TAPP_attr_set(handle, 0, (void*)&use_device_memory); } +#endif -unsigned int current_rand_seed = 0; auto& rand_engine() { static std::mt19937 engine(current_rand_seed); return engine; @@ -20,50 +23,217 @@ auto& rand_engine() { int main(int argc, char const *argv[]) { +#ifdef TAPP_DYNAMIC_LAUNCH + if (argc >= 3) + { + pathA = argv[1]; + pathB = argv[2]; + } + + struct impl implA; + if (load_implementation(&implA, pathA) == -1) return -1; + struct impl implB; + if (load_implementation(&implB, pathB) != 0) return -1; + std::cout << "NOTE: CuTensor does not support negative nor 0 strides" << std::endl; +#endif + if (argc >= 2) current_rand_seed = std::atoi(argv[1]); // now ready to generate random numbers - bli_init(); std::cout << std::boolalpha; std::cout << "Starting seed for random numbers = " << current_rand_seed << std::endl; - std::cout << "Hadamard Product: " << test_hadamard_product() << std::endl; - std::cout << "Contraction: " << test_contraction() << std::endl; - std::cout << "Commutativity: " << test_commutativity() << std::endl; - std::cout << "Permutations: " << test_permutations() << std::endl; - std::cout << "Equal Extents: " << test_equal_extents() << std::endl; - std::cout << "Outer Product: " << test_outer_product() << std::endl; - std::cout << "Full Contraction: " << test_full_contraction() << std::endl; + std::cout << "Hadamard Product: " << test_hadamard_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Contraction: " << test_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Commutativity: " << test_commutativity( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Permutations: " << test_permutations( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Equal Extents: " << test_equal_extents( +#ifdef TAPP_DYNAMIC_LAUNCH + 
implA, implB +#endif + ) << std::endl; + std::cout << "Outer Product: " << test_outer_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Full Contraction: " << test_full_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; //for(int i=0;i<0;i++) - std::cout << "Zero Dim Tensor Contraction: " << test_zero_dim_tensor_contraction() << std::endl; - std::cout << "One Dim Tensor Contraction: " << test_one_dim_tensor_contraction() << std::endl; - std::cout << "Subtensor Same Nmode: " << test_subtensor_unchanged_nmode() << std::endl; - std::cout << "Subtensor Lower Nmode: " << test_subtensor_lower_nmode() << std::endl; - std::cout << "Negative Strides: " << test_negative_strides() << std::endl; - std::cout << "Negative Strides Subtensor Same Nmode: " << test_negative_strides_subtensor_unchanged_nmode() << std::endl; - std::cout << "Negative Strides Subtensor Lower Nmode: " << test_negative_strides_subtensor_lower_nmode() << std::endl; - std::cout << "Mixed Strides: " << test_mixed_strides() << std::endl; - std::cout << "Mixed Strides Subtensor Same Nmode: " << test_mixed_strides_subtensor_unchanged_nmode() << std::endl; - std::cout << "Mixed Strides Subtensor Lower Nmode: " << test_mixed_strides_subtensor_lower_nmode() << std::endl; - std::cout << "Contraction Double Precision: " << test_contraction_double_precision() << std::endl; - std::cout << "Contraction Complex: " << test_contraction_complex() << std::endl; + std::cout << "Zero Dim Tensor Contraction: " << test_zero_dim_tensor_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "One Dim Tensor Contraction: " << test_one_dim_tensor_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Subtensor Same Index: " << test_subtensor_same_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Subtensor Lower Index: " << 
test_subtensor_lower_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Negative Strides: " << test_negative_strides( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; // Cutensor doesn't support negative strides + std::cout << "Negative Strides Subtensor Same Index: " << test_negative_strides_subtensor_same_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Negative Strides Subtensor Lower Index: " << test_negative_strides_subtensor_lower_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Mixed Strides: " << test_mixed_strides( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; // Cutensor doesn't support negative strides + std::cout << "Mixed Strides Subtensor Same Index: " << test_mixed_strides_subtensor_same_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Mixed Strides Subtensor Lower Index: " << test_mixed_strides_subtensor_lower_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Contraction Double Precision: " << test_contraction_double_precision( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Contraction Complex: " << test_contraction_complex( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; //for(int i=0;i<1;i++) - std::cout << "Contraction Complex Double Precision: " << test_contraction_complex_double_precision() << std::endl; - std::cout << "Zero stride: " << test_zero_stride() << std::endl; - std::cout << "Isolated Indices: " << test_isolated_idx() << std::endl; - std::cout << "Repeated Indices: " << test_repeated_idx() << std::endl; - std::cout << "Hadamard And Free: " << test_hadamard_and_free() << std::endl; - std::cout << "Hadamard And Contraction: " << test_hadamard_and_contraction() << std::endl; - std::cout << "Error: Non Matching Extents: " << 
test_error_non_matching_ext() << std::endl; + std::cout << "Contraction Complex Double Precision: " << test_contraction_complex_double_precision( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Zero stride: " << test_zero_stride( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; // Cutensor doesn't support zero strides + std::cout << "Unique Index: " << test_unique_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Repeated Index: " << test_repeated_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Hadamard And Free: " << test_hadamard_and_free( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + std::cout << "Hadamard And Contraction: " << test_hadamard_and_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, implB +#endif + ) << std::endl; + +#ifndef TAPP_DYNAMIC_LAUNCH + std::cout << "Error: Non Matching Extents: " << test_error_non_matching_ext() << std::endl; //TODO CuTensor bindings should comply to a TAPP error handling std::cout << "Error: C Other Structure: " << test_error_C_other_structure() << std::endl; std::cout << "Error: Aliasing Within D: " << test_error_aliasing_within_D() << std::endl; - bli_finalize(); +#endif + +#ifdef TAPP_DYNAMIC_LAUNCH + unload_implementation(&implA); + unload_implementation(&implB); +#endif + return 0; } +#ifdef TAPP_DYNAMIC_LAUNCH +int load_implementation(struct impl* impl, const char* path) { + impl->handle = dlopen(path, RTLD_LAZY); + if (!impl->handle) { + fprintf(stderr, "dlopen failed: %s\n", dlerror()); + return -1; + } + dlerror(); + *(void**)(&impl->TAPP_attr_set) = dlsym(impl->handle, "TAPP_attr_set"); + *(void**)(&impl->TAPP_attr_get) = dlsym(impl->handle, "TAPP_attr_get"); + *(void**)(&impl->TAPP_attr_clear) = dlsym(impl->handle, "TAPP_attr_clear"); + *(void**)(&impl->TAPP_check_success) = dlsym(impl->handle, "TAPP_check_success"); + 
*(void**)(&impl->TAPP_explain_error) = dlsym(impl->handle, "TAPP_explain_error"); + *(void**)(&impl->TAPP_create_executor) = dlsym(impl->handle, "TAPP_create_executor"); + *(void**)(&impl->TAPP_destroy_executor) = dlsym(impl->handle, "TAPP_destroy_executor"); + *(void**)(&impl->TAPP_create_handle) = dlsym(impl->handle, "TAPP_create_handle"); + *(void**)(&impl->TAPP_destroy_handle) = dlsym(impl->handle, "TAPP_destroy_handle"); + *(void**)(&impl->TAPP_create_tensor_product) = dlsym(impl->handle, "TAPP_create_tensor_product"); + *(void**)(&impl->TAPP_destroy_tensor_product) = dlsym(impl->handle, "TAPP_destroy_tensor_product"); + *(void**)(&impl->TAPP_execute_product) = dlsym(impl->handle, "TAPP_execute_product"); + *(void**)(&impl->TAPP_execute_batched_product) = dlsym(impl->handle, "TAPP_execute_batched_product"); + *(void**)(&impl->TAPP_destroy_status) = dlsym(impl->handle, "TAPP_destroy_status"); + *(void**)(&impl->TAPP_create_tensor_info) = dlsym(impl->handle, "TAPP_create_tensor_info"); + *(void**)(&impl->TAPP_destroy_tensor_info) = dlsym(impl->handle, "TAPP_destroy_tensor_info"); + *(void**)(&impl->TAPP_get_nmodes) = dlsym(impl->handle, "TAPP_get_nmodes"); + *(void**)(&impl->TAPP_set_nmodes) = dlsym(impl->handle, "TAPP_set_nmodes"); + *(void**)(&impl->TAPP_get_extents) = dlsym(impl->handle, "TAPP_get_extents"); + *(void**)(&impl->TAPP_set_extents) = dlsym(impl->handle, "TAPP_set_extents"); + *(void**)(&impl->TAPP_get_strides) = dlsym(impl->handle, "TAPP_get_strides"); + *(void**)(&impl->TAPP_set_strides) = dlsym(impl->handle, "TAPP_set_strides"); + const char* error = dlerror(); + if (error != NULL) { + fprintf(stderr, "dlsym failed: %s\n", error); + dlclose(impl->handle); + return -1; + } + return 0; +} + +void unload_implementation(struct impl* impl) { + if (impl->handle) { + dlclose(impl->handle); + impl->handle = NULL; + } +} +#else template -void run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A, - int 
nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B, - int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C, - int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D, - T alpha, T beta) +T* run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A, + int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B, + int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C, + int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D, + T alpha, T beta) { tblis::len_type* tblis_len_A = change_array_type(extents_A, nmode_A); tblis::stride_type* tblis_stride_A = change_array_type(strides_A, nmode_A); @@ -118,9 +288,9 @@ void run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, i } } - auto [tblis_A_reduced, tblis_idx_A_reduced, tblis_len_A_reduced, tblis_stride_A_reduced, tblis_data_A_reduced] = contract_unique_idx(&tblis_A, tblis_idx_A, nmode_B, tblis_idx_B, nmode_D, tblis_idx_D); + auto [tblis_A_reduced, tblis_idx_A_reduced, tblis_len_A_reduced, tblis_stride_A_reduced, tblis_data_A_reduced] = reduce_isolated_indices(&tblis_A, tblis_idx_A, nmode_B, tblis_idx_B, nmode_D, tblis_idx_D); - auto [tblis_B_reduced, tblis_idx_B_reduced, tblis_len_B_reduced, tblis_stride_B_reduced, tblis_data_B_reduced] = contract_unique_idx(&tblis_B, tblis_idx_B, nmode_A, tblis_idx_A, nmode_D, tblis_idx_D); + auto [tblis_B_reduced, tblis_idx_B_reduced, tblis_len_B_reduced, tblis_stride_B_reduced, tblis_data_B_reduced] = reduce_isolated_indices(&tblis_B, tblis_idx_B, nmode_A, tblis_idx_A, nmode_D, tblis_idx_D); tblis_tensor_mult(tblis_single, NULL, tblis_A_reduced, tblis_idx_A_reduced, tblis_B_reduced, tblis_idx_B_reduced, &tblis_D, tblis_idx_D); @@ -142,41 +312,49 @@ void run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, i delete[] tblis_len_D; 
delete[] tblis_stride_D; - delete[] tblis_idx_A_reduced; - delete[] tblis_len_A_reduced; - delete[] tblis_stride_A_reduced; - delete[] tblis_data_A_reduced; - delete tblis_A_reduced; + if (tblis_A_reduced != &tblis_A) + { + delete[] tblis_idx_A_reduced; + delete[] tblis_len_A_reduced; + delete[] tblis_stride_A_reduced; + delete[] tblis_data_A_reduced; + delete tblis_A_reduced; + } - delete[] tblis_idx_B_reduced; - delete[] tblis_len_B_reduced; - delete[] tblis_stride_B_reduced; - delete[] tblis_data_B_reduced; - delete tblis_B_reduced; + if (tblis_B_reduced != &tblis_B) + { + delete[] tblis_idx_B_reduced; + delete[] tblis_len_B_reduced; + delete[] tblis_stride_B_reduced; + delete[] tblis_data_B_reduced; + delete tblis_B_reduced; + } + + return D; } template -std::tuple contract_unique_idx(tblis::tblis_tensor* tensor, tblis::label_type* idx, int nmode_1, tblis::label_type* idx_1, int nmode_2, tblis::label_type* idx_2) -{ - int nmode_reduced = 0; - int64_t size_reduced = 1; - tblis::tblis_tensor* tblis_reduced = new tblis::tblis_tensor; - tblis::len_type* len_reduced = new tblis::len_type[tensor->ndim]; - tblis::stride_type* stride_reduced = new tblis::stride_type[tensor->ndim]; - tblis::label_type* idx_reduced = new tblis::label_type[tensor->ndim+1]; +std::tuple reduce_isolated_indices(tblis::tblis_tensor* tensor, tblis::label_type* idx, int nmode_X, tblis::label_type* idx_X, int nmode_Y, tblis::label_type* idx_Y) +{ + int new_nmode = 0; + int64_t new_size = 1; + tblis::tblis_tensor* new_tensor = new tblis::tblis_tensor; + tblis::len_type* new_len = new tblis::len_type[tensor->ndim]; + tblis::stride_type* new_stride = new tblis::stride_type[tensor->ndim]; + tblis::label_type* new_idx = new tblis::label_type[tensor->ndim+1]; for (size_t i = 0; i < tensor->ndim; i++) { bool found = false; - for (size_t j = 0; j < nmode_1; j++) + for (size_t j = 0; j < nmode_X; j++) { - if (idx[i] == idx_1[j]) + if (idx[i] == idx_X[j]) { found = true; } } - for (size_t j = 0; j < 
nmode_2; j++) + for (size_t j = 0; j < nmode_Y; j++) { - if (idx[i] == idx_2[j]) + if (idx[i] == idx_Y[j]) { found = true; } @@ -184,45 +362,175 @@ std::tuplelen[i]; - stride_reduced[nmode_reduced] = nmode_reduced == 0 ? 1 : stride_reduced[nmode_reduced - 1] * len_reduced[nmode_reduced - 1]; - idx_reduced[nmode_reduced] = idx[i]; - size_reduced *= len_reduced[nmode_reduced]; - nmode_reduced++; + new_len[new_nmode] = tensor->len[i]; + new_stride[new_nmode] = new_nmode == 0 ? 1 : new_stride[new_nmode - 1] * new_len[new_nmode - 1]; + new_idx[new_nmode] = idx[i]; + new_size *= new_len[new_nmode]; + new_nmode++; + } + } + new_idx[new_nmode] = '\0'; + + if (new_nmode == tensor->ndim) + { + delete new_tensor; + delete[] new_len; + delete[] new_stride; + delete[] new_idx; + return {tensor, idx, (tblis::len_type*)NULL, (tblis::stride_type*)NULL, (T*)NULL}; + } + T* new_data = new T[new_size]; + for (size_t i = 0; i < new_size; i++) + { + new_data[i] = 0; + } + + if constexpr (std::is_same_v) + { + tblis_init_tensor_s(new_tensor, new_nmode, new_len, new_data, new_stride); + } + else if constexpr (std::is_same_v) + { + tblis_init_tensor_d(new_tensor, new_nmode, new_len, new_data, new_stride); + } + else if constexpr (is_complex_v) + { + using value_type = typename T::value_type; + if constexpr (std::is_same_v) + { + tblis_init_tensor_c(new_tensor, new_nmode, new_len, new_data, new_stride); + } + else if constexpr (std::is_same_v) + { + tblis_init_tensor_z(new_tensor, new_nmode, new_len, new_data, new_stride); } } - idx_reduced[nmode_reduced] = '\0'; + tblis_tensor_add(tblis_single, NULL, tensor, idx, new_tensor, new_idx); + return {new_tensor, new_idx, new_len, new_stride, new_data}; +} +#endif + +template +TAPP_error run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl impl, bool use_device_memory, +#else + bool use_tblis, +#endif + int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A, + int nmode_B, int64_t* extents_B, int64_t* strides_B, T* 
B, int op_B, int64_t* idx_B, + int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C, + int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D, + T alpha, T beta + ) +{ +#ifndef TAPP_DYNAMIC_LAUNCH + if (use_tblis) + { + run_tblis_mult(nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_D, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, beta); + } + auto fn_create_handle = TAPP_create_handle; + auto fn_destroy_handle = TAPP_destroy_handle; + auto fn_create_tensor_info = TAPP_create_tensor_info; + auto fn_destroy_tensor_info = TAPP_destroy_tensor_info; + auto fn_create_tensor_product = TAPP_create_tensor_product; + auto fn_destroy_tensor_product = TAPP_destroy_tensor_product; + auto fn_create_executor = TAPP_create_executor; + auto fn_destroy_executor = TAPP_destroy_executor; + auto fn_execute_product = TAPP_execute_product; +#else + auto fn_create_handle = impl.TAPP_create_handle; + auto fn_destroy_handle = impl.TAPP_destroy_handle; + auto fn_create_tensor_info = impl.TAPP_create_tensor_info; + auto fn_destroy_tensor_info = impl.TAPP_destroy_tensor_info; + auto fn_create_tensor_product = impl.TAPP_create_tensor_product; + auto fn_destroy_tensor_product = impl.TAPP_destroy_tensor_product; + auto fn_create_executor = impl.TAPP_create_executor; + auto fn_destroy_executor = impl.TAPP_destroy_executor; + auto fn_execute_product = impl.TAPP_execute_product; +#endif + + TAPP_error error_status; - T* data_reduced = new T[size_reduced]; - for (size_t i = 0; i < size_reduced; i++) + TAPP_handle handle; + error_status = fn_create_handle(&handle); + if (error_status != 0) goto at_return; +#ifdef TAPP_DYNAMIC_LAUNCH + if (use_device_memory) { - data_reduced[i] = 0; + set_use_device_memory(impl, handle); } +#endif + TAPP_datatype datatype; if constexpr (std::is_same_v) { - tblis_init_tensor_s(tblis_reduced, nmode_reduced, 
len_reduced, data_reduced, stride_reduced); + datatype = TAPP_FLOAT; } else if constexpr (std::is_same_v) { - tblis_init_tensor_d(tblis_reduced, nmode_reduced, len_reduced, data_reduced, stride_reduced); + datatype = TAPP_DOUBLE; } else if constexpr (is_complex_v) { using value_type = typename T::value_type; if constexpr (std::is_same_v) { - tblis_init_tensor_c(tblis_reduced, nmode_reduced, len_reduced, data_reduced, stride_reduced); + datatype = TAPP_SCOMPLEX; } else if constexpr (std::is_same_v) { - tblis_init_tensor_z(tblis_reduced, nmode_reduced, len_reduced, data_reduced, stride_reduced); + datatype = TAPP_DCOMPLEX; } } - tblis_tensor_add(tblis_single, NULL, tensor, idx, tblis_reduced, idx_reduced); - return {tblis_reduced, idx_reduced, len_reduced, stride_reduced, data_reduced}; + + TAPP_tensor_info info_A; + error_status = fn_create_tensor_info(&info_A, handle, datatype, nmode_A, extents_A, strides_A); + if (error_status != 0) goto at_free_handle; + TAPP_tensor_info info_B; + error_status = fn_create_tensor_info(&info_B, handle, datatype, nmode_B, extents_B, strides_B); + if (error_status != 0) goto at_free_info_A; + TAPP_tensor_info info_C; + error_status = fn_create_tensor_info(&info_C, handle, datatype, nmode_C, extents_C, strides_C); + if (error_status != 0) goto at_free_info_B; + TAPP_tensor_info info_D; + error_status = fn_create_tensor_info(&info_D, handle, datatype, nmode_D, extents_D, strides_D); + if (error_status != 0) goto at_free_info_C; + + TAPP_tensor_product plan; + error_status = fn_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, TAPP_DEFAULT_PREC); + if (error_status != 0) goto at_free_info_D; + TAPP_status status; + + TAPP_executor exec; + error_status = fn_create_executor(&exec); + if (error_status != 0) goto at_free_plan; + + error_status = fn_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); + + 
fn_destroy_executor(exec); + at_free_plan: + fn_destroy_tensor_product(plan); + at_free_info_D: + fn_destroy_tensor_info(info_D); + at_free_info_C: + fn_destroy_tensor_info(info_C); + at_free_info_B: + fn_destroy_tensor_info(info_B); + at_free_info_A: + fn_destroy_tensor_info(info_A); + at_free_handle: + fn_destroy_handle(handle); + at_return: + + return error_status; } + template U* change_array_type(T* array, int size) { @@ -282,9 +590,9 @@ std::tuple index_extent_map = generate_index_extent_map(min_extent, 4, equal_extents_only, total_unique_indices, unique_indices); - auto [extents_A, extents_B, extents_C, extents_D] = assign_extents(index_extent_map, nmode_A, idx_A, nmode_B, idx_B, nmode_D, idx_D); + auto [extents_A, extents_B, extents_D] = assign_extents(index_extent_map, nmode_A, idx_A, nmode_B, idx_B, nmode_D, idx_D); + int64_t* extents_C = new int64_t[nmode_C]; + std::copy(extents_D, extents_D + nmode_D, extents_C); int outer_nmode_A = subtensor_on_nmode ? nmode_A + rand(1, 4) : nmode_A; int outer_nmode_B = subtensor_on_nmode ? nmode_B + rand(1, 4) : nmode_B; - int outer_nmode_C = subtensor_on_nmode ? nmode_C + rand(1, 4) : nmode_C; int outer_nmode_D = subtensor_on_nmode ? 
nmode_D + rand(1, 4) : nmode_D; int* stride_signs_A = choose_stride_signs(nmode_A, negative_strides_enabled, mixed_strides_enabled); int* stride_signs_B = choose_stride_signs(nmode_B, negative_strides_enabled, mixed_strides_enabled); - int* stride_signs_C = choose_stride_signs(nmode_C, negative_strides_enabled, mixed_strides_enabled); int* stride_signs_D = choose_stride_signs(nmode_D, negative_strides_enabled, mixed_strides_enabled); bool* subtensor_dims_A = choose_subtensor_dims(nmode_A, outer_nmode_A); bool* subtensor_dims_B = choose_subtensor_dims(nmode_B, outer_nmode_B); - bool* subtensor_dims_C = choose_subtensor_dims(nmode_C, outer_nmode_C); bool* subtensor_dims_D = choose_subtensor_dims(nmode_D, outer_nmode_D); int64_t* outer_extents_A = calculate_outer_extents(outer_nmode_A, extents_A, subtensor_dims_A, subtensor_on_extents); int64_t* outer_extents_B = calculate_outer_extents(outer_nmode_B, extents_B, subtensor_dims_B, subtensor_on_extents); - int64_t* outer_extents_C = calculate_outer_extents(outer_nmode_C, extents_C, subtensor_dims_C, subtensor_on_extents); int64_t* outer_extents_D = calculate_outer_extents(outer_nmode_D, extents_D, subtensor_dims_D, subtensor_on_extents); int64_t* offsets_A = calculate_offsets(nmode_A, outer_nmode_A, extents_A, outer_extents_A, subtensor_dims_A, subtensor_on_extents); int64_t* offsets_B = calculate_offsets(nmode_B, outer_nmode_B, extents_B, outer_extents_B, subtensor_dims_B, subtensor_on_extents); - int64_t* offsets_C = calculate_offsets(nmode_C, outer_nmode_C, extents_C, outer_extents_C, subtensor_dims_C, subtensor_on_extents); int64_t* offsets_D = calculate_offsets(nmode_D, outer_nmode_D, extents_D, outer_extents_D, subtensor_dims_D, subtensor_on_extents); int64_t* strides_A = calculate_strides(nmode_A, outer_nmode_A, outer_extents_A, stride_signs_A, subtensor_dims_A); int64_t* strides_B = calculate_strides(nmode_B, outer_nmode_B, outer_extents_B, stride_signs_B, subtensor_dims_B); - int64_t* strides_C = 
calculate_strides(nmode_C, outer_nmode_C, outer_extents_C, stride_signs_C, subtensor_dims_C); int64_t* strides_D = calculate_strides(nmode_D, outer_nmode_D, outer_extents_D, stride_signs_D, subtensor_dims_D); + int64_t* strides_C = new int64_t[nmode_C]; + std::copy(strides_D, strides_D + nmode_D, strides_C); int64_t size_A = calculate_size(outer_nmode_A, outer_extents_A); int64_t size_B = calculate_size(outer_nmode_B, outer_extents_B); - int64_t size_C = calculate_size(outer_nmode_C, outer_extents_C); int64_t size_D = calculate_size(outer_nmode_D, outer_extents_D); + int64_t size_C = size_D; - T* data_A = create_tensor_data(size_A); - T* data_B = create_tensor_data(size_B); - T* data_C = create_tensor_data(size_C); - T* data_D = create_tensor_data(size_D); + T* data_A = create_tensor_data(size_A, -10, 10); + T* data_B = create_tensor_data(size_B, -10, 10); + T* data_C = create_tensor_data(size_C, -10, 10); + T* data_D = create_tensor_data(size_D, -10, 10); T* A = calculate_tensor_pointer(data_A, nmode_A, extents_A, offsets_A, strides_A); T* B = calculate_tensor_pointer(data_B, nmode_B, extents_B, offsets_B, strides_B); - T* C = calculate_tensor_pointer(data_C, nmode_C, extents_C, offsets_C, strides_C); + T* C = calculate_tensor_pointer(data_C, nmode_C, extents_C, offsets_D, strides_C); T* D = calculate_tensor_pointer(data_D, nmode_D, extents_D, offsets_D, strides_D); - T alpha = rand(); - T beta = rand(); + T alpha = rand(-10, 10); + T beta = rand(-10, 10); delete[] unique_indices; delete[] subtensor_dims_A; delete[] subtensor_dims_B; - delete[] subtensor_dims_C; delete[] subtensor_dims_D; delete[] outer_extents_A; delete[] outer_extents_B; - delete[] outer_extents_C; delete[] outer_extents_D; delete[] stride_signs_A; delete[] stride_signs_B; - delete[] stride_signs_C; delete[] stride_signs_D; delete[] offsets_A; delete[] offsets_B; - delete[] offsets_C; delete[] offsets_D; return {nmode_A, extents_A, strides_A, A, idx_A, @@ -391,7 +696,7 @@ std::tuple 
generate_index_configuration(int nmode_A, int nmode_B, int nmode_D, int contracted_indices, int hadamard_indices, @@ -741,7 +1046,7 @@ std::tuple assign_indices(int* unique_indices, - int contracted_indices, int hadamard_indices, - int free_indices_A, int free_indices_B, - int isolated_indices_A, int isolated_indices_B, - int repeated_indices_A, int repeated_indices_B) +std::tuple assign_indices(int* unique_indices, + int contracted_indices, int hadamard_indices, + int free_indices_A, int free_indices_B, + int isolated_indices_A, int isolated_indices_B, + int repeated_indices_A, int repeated_indices_B) { // Create index arrays int64_t* idx_A = new int64_t[repeated_indices_A + isolated_indices_A + free_indices_A + hadamard_indices + contracted_indices]; int64_t* idx_B = new int64_t[repeated_indices_B + isolated_indices_B + free_indices_B + hadamard_indices + contracted_indices]; - int64_t* idx_C = new int64_t[free_indices_A + hadamard_indices + free_indices_B]; int64_t* idx_D = new int64_t[free_indices_A + hadamard_indices + free_indices_B]; /* @@ -792,10 +1096,6 @@ std::tuple assign_indices(int* unique_in std::shuffle(idx_D, idx_D + (free_indices_A + hadamard_indices + free_indices_B), rand_engine()); // Shuffle indices for D - std::copy(idx_D, - idx_D + free_indices_A + hadamard_indices + free_indices_B, - idx_C); // C has the same indices as D - for (int i = 0; i < repeated_indices_A; i++) // Add repeated indices to A { idx_A[i + isolated_indices_A + free_indices_A + hadamard_indices + contracted_indices] = idx_A[rand(0, isolated_indices_A + free_indices_A + hadamard_indices + contracted_indices - 1)]; @@ -810,7 +1110,7 @@ std::tuple assign_indices(int* unique_in std::shuffle(idx_B, idx_B + repeated_indices_B + isolated_indices_B + free_indices_B + hadamard_indices + contracted_indices, rand_engine()); // Shuffle final indices for B - return {idx_A, idx_B, idx_C, idx_D}; + return {idx_A, idx_B, idx_D}; } std::unordered_map generate_index_extent_map(int64_t 
min_extent, int64_t max_extent, @@ -827,7 +1127,7 @@ std::unordered_map generate_index_extent_map(int64_t min_extent, i return index_to_extent; } -std::tuple assign_extents(std::unordered_map index_extent_map, +std::tuple assign_extents(std::unordered_map index_extent_map, int nmode_A, int64_t* idx_A, int nmode_B, int64_t* idx_B, int nmode_D, int64_t* idx_D) @@ -835,7 +1135,6 @@ std::tuple assign_extents(std::unordered // Create extent arrays int64_t* extents_A = new int64_t[nmode_A]; int64_t* extents_B = new int64_t[nmode_B]; - int64_t* extents_C = new int64_t[nmode_D]; int64_t* extents_D = new int64_t[nmode_D]; // Map extents to tensors based on their indices @@ -852,9 +1151,7 @@ std::tuple assign_extents(std::unordered extents_D[i] = index_extent_map[idx_D[i]]; // Assign extents to D } - std::copy(extents_D, extents_D + nmode_D, extents_C); - - return {extents_A, extents_B, extents_C, extents_D}; + return {extents_A, extents_B, extents_D}; } int* choose_stride_signs(int nmode, bool negative_strides_enabled, bool mixed_strides_enabled) @@ -1089,11 +1386,11 @@ T rand() { if constexpr (is_complex_v) { using value_type = typename T::value_type; - return rand(-std::numeric_limits::max(), std::numeric_limits::max()); + return rand(-std::numeric_limits::min(), std::numeric_limits::max()); } else { - return rand(-std::numeric_limits::max(), std::numeric_limits::max()); + return rand(-std::numeric_limits::min(), std::numeric_limits::max()); } } @@ -1282,7 +1579,11 @@ void add_idx(int* nmode, int64_t** idx, int64_t** extents, int64_t** strides, in *strides = strides_tmp; } -bool test_hadamard_product() +bool test_hadamard_product( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -1294,46 +1595,41 @@ bool test_hadamard_product() auto [E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, 
TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - int op_A = 0; - int op_B = 0; - int op_C = 0; - int op_D = 0; - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, op_A, idx_A, - nmode_B, extents_B, strides_B, B, op_B, idx_B, - nmode_C, extents_C, strides_C, C, op_C, idx_D, - nmode_D, extents_D, strides_D, E, op_D, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(D, E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - 
TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] strides_A; delete[] extents_B; @@ -1355,7 +1651,11 @@ bool test_hadamard_product() return result; } -bool test_contraction() +bool test_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -1367,41 +1667,41 @@ bool test_contraction() auto [E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + 
alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -1423,7 +1723,11 @@ bool test_contraction() return result; } -bool test_commutativity() +bool test_commutativity( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -1439,52 +1743,69 @@ bool test_commutativity() auto [G, data_G] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_tensor_product planAB; - TAPP_create_tensor_product(&planAB, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_tensor_product planBA; - TAPP_create_tensor_product(&planBA, handle, 0, info_B, idx_B, 0, info_A, idx_A, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(planAB, exec, 
&status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); - - TAPP_execute_product(planBA, exec, &status, (void*)&alpha, (void*)B, (void*)A, (void*)&beta, (void*)C, (void*)F); - - run_tblis_mult(nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, G, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, F, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, G, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, 
size_D) && compare_tensors(data_F, data_G, size_D) && compare_tensors(data_D, data_F, size_D); - - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(planAB); - TAPP_destroy_tensor_product(planBA); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); + delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -1508,7 +1829,11 @@ bool test_commutativity() return result; } -bool test_permutations() +bool test_permutations( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -1519,49 +1844,50 @@ bool test_permutations() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, rand(2, 4)); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); bool result = true; for (int i = 0; i < nmode_D; i++) { - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, 
idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); result = result && compare_tensors(data_D, data_E, size_D); rotate_indices(idx_C, nmode_C, extents_C, strides_C); rotate_indices(idx_D, nmode_D, extents_D, strides_D); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); - TAPP_destroy_tensor_product(plan); } - - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); + delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -1583,7 +1909,11 @@ bool test_permutations() return result; } -bool test_equal_extents() +bool test_equal_extents( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -1595,41 +1925,41 @@ bool test_equal_extents() auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, 
extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -1651,7 +1981,11 @@ bool test_equal_extents() return result; } -bool test_outer_product() +bool test_outer_product( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, 
strides_B, B, idx_B, @@ -1662,42 +1996,41 @@ bool test_outer_product() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, 0); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = 
compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -1719,7 +2052,11 @@ bool test_outer_product() return result; } -bool test_full_contraction() +bool test_full_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -1730,42 +2067,42 @@ bool test_full_contraction() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, 0); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( 
+#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -1787,7 +2124,11 @@ bool test_full_contraction() return result; } -bool test_zero_dim_tensor_contraction() +bool test_zero_dim_tensor_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -1798,42 +2139,42 @@ bool test_zero_dim_tensor_contraction() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(0);//2,2,0,2); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, 
handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -1855,7 +2196,11 @@ bool test_zero_dim_tensor_contraction() return result; } -bool test_one_dim_tensor_contraction() +bool test_one_dim_tensor_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -1866,42 +2211,42 @@ bool test_one_dim_tensor_contraction() size_A, size_B, size_C, 
size_D] = generate_pseudorandom_contraction(1); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - 
TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -1923,7 +2268,11 @@ bool test_one_dim_tensor_contraction() return result; } -bool test_subtensor_unchanged_nmode() +bool test_subtensor_same_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -1934,42 +2283,42 @@ bool test_subtensor_unchanged_nmode() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, true); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, 
+#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -1991,7 +2340,11 @@ bool test_subtensor_unchanged_nmode() return result; } -bool test_subtensor_lower_nmode() +bool test_subtensor_lower_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2002,42 +2355,42 @@ bool test_subtensor_lower_nmode() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, true, true); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 
0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2059,7 +2412,11 @@ bool test_subtensor_lower_nmode() return result; } -bool test_negative_strides() +bool test_negative_strides( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2070,41 +2427,42 @@ bool test_negative_strides() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 
1, false, false, false, true); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - TAPP_executor exec; - TAPP_create_executor(&exec); - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - 
TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2123,10 +2481,14 @@ bool test_negative_strides() delete[] data_D; delete[] data_E; - return true; + return result; } -bool test_negative_strides_subtensor_unchanged_nmode() +bool test_negative_strides_subtensor_same_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2137,42 +2499,42 @@ bool test_negative_strides_subtensor_unchanged_nmode() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, true, false, true); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = 
TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2194,7 +2556,11 @@ bool test_negative_strides_subtensor_unchanged_nmode() return result; } -bool test_negative_strides_subtensor_lower_nmode() +bool test_negative_strides_subtensor_lower_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2205,42 +2571,42 @@ bool test_negative_strides_subtensor_lower_nmode() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, true, true, true); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, 
strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2262,7 +2628,11 @@ bool test_negative_strides_subtensor_lower_nmode() return result; } -bool test_mixed_strides() +bool test_mixed_strides( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, 
extents_B, strides_B, B, idx_B, @@ -2270,44 +2640,45 @@ bool test_mixed_strides() nmode_D, extents_D, strides_D, D, idx_D, alpha, beta, data_A, data_B, data_C, data_D, - size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, false, false, false, true); + size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, false, false, true, true); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - TAPP_executor exec; - TAPP_create_executor(&exec); - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, 
+#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2326,10 +2697,14 @@ bool test_mixed_strides() delete[] data_D; delete[] data_E; - return true; + return result; } -bool test_mixed_strides_subtensor_unchanged_nmode() +bool test_mixed_strides_subtensor_same_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2337,45 +2712,45 @@ bool test_mixed_strides_subtensor_unchanged_nmode() nmode_D, extents_D, strides_D, D, idx_D, alpha, beta, data_A, data_B, data_C, data_D, - size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, true, false, false, true); + size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, true, false, true, true); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 
0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2397,7 +2772,11 @@ bool test_mixed_strides_subtensor_unchanged_nmode() return result; } -bool test_mixed_strides_subtensor_lower_nmode() +bool test_mixed_strides_subtensor_lower_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2405,45 +2784,45 @@ bool test_mixed_strides_subtensor_lower_nmode() 
nmode_D, extents_D, strides_D, D, idx_D, alpha, beta, data_A, data_B, data_C, data_D, - size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, true, true, false, true); + size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(-1, -1, -1, -1, -1, 1, false, true, true, true, true); auto[E, data_E] = copy_tensor_data(size_D, data_D, D); - - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, 
extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2465,7 +2844,11 @@ bool test_mixed_strides_subtensor_lower_nmode() return result; } -bool test_contraction_double_precision() +bool test_contraction_double_precision( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2477,41 +2860,41 @@ bool test_contraction_double_precision() auto [E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F64, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F64, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F64, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F64, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, 
idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2533,7 +2916,11 @@ bool test_contraction_double_precision() return result; } -bool test_contraction_complex() +bool test_contraction_complex( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2545,46 +2932,41 @@ bool test_contraction_complex() auto [E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_C32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_C32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_C32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_C32, nmode_D, extents_D, strides_D); - - int op_A = rand(0, 1); - int op_B = rand(0, 1); - int 
op_C = rand(0, 1); - int op_D = rand(0, 1); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, op_A, idx_A, - nmode_B, extents_B, strides_B, B, op_B, idx_B, - nmode_C, extents_C, strides_C, C, op_C, idx_D, - nmode_D, extents_D, strides_D, E, op_D, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2606,7 +2988,11 @@ bool test_contraction_complex() return result; } -bool test_contraction_complex_double_precision() +bool test_contraction_complex_double_precision( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct 
impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2618,47 +3004,41 @@ bool test_contraction_complex_double_precision() auto [E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_C64, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_C64, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_C64, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_C64, nmode_D, extents_D, strides_D); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); - int op_A = rand(0, 1); - int op_B = rand(0, 1); - int op_C = rand(0, 1); - int op_D = rand(0, 1); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - int terr = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, op_A, idx_A, - nmode_B, extents_B, 
strides_B, B, op_B, idx_B, - nmode_C, extents_C, strides_C, C, op_C, idx_D, - nmode_D, extents_D, strides_D, E, op_D, idx_D, - alpha, beta); - // std::complex zma = 1.0+1.0e-12; - // data_D[0] = data_D[0]*zma; bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2680,7 +3060,11 @@ bool test_contraction_complex_double_precision() return result; } -bool test_zero_stride() +bool test_zero_stride( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2700,41 +3084,41 @@ bool test_zero_stride() strides_B[0] = 0; } - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A 
= TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2756,7 +3140,11 @@ bool test_zero_stride() return result; } -bool test_isolated_idx() +bool test_unique_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2768,41 +3156,41 @@ bool test_isolated_idx() auto [E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, 
info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2824,7 +3212,11 @@ bool test_isolated_idx() return result; } -bool test_repeated_idx() +bool test_repeated_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2836,41 +3228,41 @@ bool test_repeated_idx() auto [E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - 
TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A, - nmode_B, extents_B, strides_B, B, 0, idx_B, - nmode_C, extents_C, strides_C, C, 0, idx_D, - nmode_D, extents_D, strides_D, E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - 
TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2892,7 +3284,11 @@ bool test_repeated_idx() return result; } -bool test_hadamard_and_free() +bool test_hadamard_and_free( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { auto [nmode_A, extents_A, strides_A, A, idx_A, nmode_B, extents_B, strides_B, B, idx_B, @@ -2904,41 +3300,41 @@ bool test_hadamard_and_free() auto [E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)data_A, (void*)data_B, (void*)&beta, (void*)data_C, (void*)data_D); - - run_tblis_mult(nmode_A, extents_A, strides_A, data_A, 0, idx_A, - nmode_B, extents_B, strides_B, data_B, 0, idx_B, - nmode_C, extents_C, strides_C, data_C, 0, idx_D, - nmode_D, extents_D, strides_D, data_E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + 
+ run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -2960,7 +3356,11 @@ bool test_hadamard_and_free() return result; } -bool test_hadamard_and_contraction() +bool test_hadamard_and_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ) { int input_nmode = rand(0, 4); auto [nmode_A, extents_A, strides_A, A, idx_A, @@ -2973,41 +3373,41 @@ bool test_hadamard_and_contraction() auto [E, data_E] = copy_tensor_data(size_D, data_D, D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)data_A, (void*)data_B, (void*)&beta, (void*)data_C, (void*)data_D); - - run_tblis_mult(nmode_A, extents_A, strides_A, data_A, 0, idx_A, - 
nmode_B, extents_B, strides_B, data_B, 0, idx_B, - nmode_C, extents_C, strides_C, data_C, 0, idx_D, - nmode_D, extents_D, strides_D, data_E, 0, idx_D, - alpha, beta); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implA, false, +#else + false, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); + + run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + implB, true, +#else + true, +#endif + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, E, op_D, idx_D, + alpha, + beta + ); bool result = compare_tensors(data_D, data_E, size_D); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -3029,6 +3429,7 @@ bool test_hadamard_and_contraction() return result; } +#ifndef TAPP_DYNAMIC_LAUNCH bool test_error_too_many_idx_D() { auto [nmode_A, extents_A, strides_A, A, idx_A, @@ -3040,21 +3441,21 @@ bool test_error_too_many_idx_D() size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction(); int64_t max_idx = 0; - for (size_t i = 0; i < nmode_A; i++) + for (int i = 0; i < nmode_A; i++) { if (max_idx < idx_A[i]) { max_idx = idx_A[i]; } } - for (size_t i = 0; i < nmode_B; i++) + for (int i = 0; i < nmode_B; i++) { if (max_idx < idx_B[i]) { max_idx = idx_B[i]; } } - for (size_t i = 0; i < nmode_D; i++) + for (int i = 0; i < nmode_D; i++) { if (max_idx < idx_D[i]) { @@ -3064,33 +3465,20 @@ bool test_error_too_many_idx_D() 
add_incorrect_idx(max_idx, &nmode_D, &idx_D, &extents_D, &strides_D); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; - int error_status = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); + TAPP_error error_status = run_product(false, + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -3108,7 +3496,7 @@ bool test_error_too_many_idx_D() delete[] data_C; delete[] data_D; - return error_status == 7; + return error_status == 7; // && error_status_B == 7; Error status isn't the same for CuTensor and reference imp } bool test_error_non_matching_ext() @@ -3155,33 +3543,19 @@ bool test_error_non_matching_ext() break; } - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, 
TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); - - int error_status = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); + TAPP_error error_status = run_product(false, + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -3199,7 +3573,7 @@ bool test_error_non_matching_ext() delete[] data_C; delete[] data_D; - return error_status == 1 || error_status == 2 || error_status == 3; + return (error_status == 1 || error_status == 2 || error_status == 3); // && (error_status_B == 1 || error_status_B == 2 || error_status_B == 3); Error status isn't the same for CuTensor and reference imp } bool test_error_C_other_structure() @@ -3247,33 +3621,20 @@ bool test_error_C_other_structure() break; } - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, 
TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; + TAPP_error error_status = run_product(false, + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); - TAPP_executor exec; - TAPP_create_executor(&exec); - - int error_status = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); - - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -3291,7 +3652,7 @@ bool test_error_C_other_structure() delete[] data_C; delete[] data_D; - return error_status == 5 || error_status == 6 || error_status == 7; + return (error_status == 5 || error_status == 6 || error_status == 7); // && (error_status_B == 5 || error_status_B == 6 || error_status_B == 7); Error status isn't the same for CuTensor and reference imp } bool test_error_aliasing_within_D() @@ -3308,33 +3669,20 @@ bool test_error_aliasing_within_D() int signs[2] = {-1, 1}; strides_D[scewed_index] = random_choice(2, 
signs) * (strides_D[scewed_index - 1] * extents_D[scewed_index - 1] - rand((int64_t)1, strides_D[scewed_index - 1] * extents_D[scewed_index - 1] - 1)); - TAPP_tensor_info info_A; - TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); - TAPP_tensor_info info_B; - TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B); - TAPP_tensor_info info_C; - TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C); - TAPP_tensor_info info_D; - TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D); - - TAPP_tensor_product plan; - TAPP_handle handle; - TAPP_create_handle(&handle); - TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC); - TAPP_status status; - - TAPP_executor exec; - TAPP_create_executor(&exec); + int op_A = TAPP_IDENTITY; + int op_B = TAPP_IDENTITY; + int op_C = TAPP_IDENTITY; + int op_D = TAPP_IDENTITY; - int error_status = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D); + TAPP_error error_status = run_product(false, + nmode_A, extents_A, strides_A, A, op_A, idx_A, + nmode_B, extents_B, strides_B, B, op_B, idx_B, + nmode_C, extents_C, strides_C, C, op_C, idx_C, + nmode_D, extents_D, strides_D, D, op_D, idx_D, + alpha, + beta + ); - TAPP_destroy_executor(exec); - TAPP_destroy_handle(handle); - TAPP_destroy_tensor_product(plan); - TAPP_destroy_tensor_info(info_A); - TAPP_destroy_tensor_info(info_B); - TAPP_destroy_tensor_info(info_C); - TAPP_destroy_tensor_info(info_D); delete[] extents_A; delete[] extents_B; delete[] extents_C; @@ -3352,5 +3700,6 @@ bool test_error_aliasing_within_D() delete[] data_C; delete[] data_D; - return error_status == 8; + return error_status == 8; // && error_status_B == 8; Error status isn't the same for CuTensor and reference imp } +#endif \ No newline at end of file diff --git a/test/test.h b/test/test.h index 
bfcc50e..36019b3 100644 --- a/test/test.h +++ b/test/test.h @@ -1,8 +1,3 @@ -/* - * Niklas Hörnblad - * Paolo Bientinesi - * Umeå University - November 2024 - */ #include #include #include @@ -11,14 +6,109 @@ #include #include #include -#include +#include // POSIX dynamic loading, TODO: fix for windows +#ifndef TAPP_DYNAMIC_LAUNCH #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #include "tblis.h" #pragma GCC diagnostic pop +#endif #include +#ifdef TAPP_DYNAMIC_LAUNCH +const char* pathA = "./reference_implementation/libtapp-reference.so"; +const char* pathB = "./cutensor_bindings/libtapp-cutensor.so"; +struct impl +{ + void* handle; + TAPP_error (*TAPP_attr_set)(TAPP_attr attr, TAPP_key key, void* value); + TAPP_error (*TAPP_attr_get)(TAPP_attr attr, TAPP_key key, void** value); + TAPP_error (*TAPP_attr_clear)(TAPP_attr attr, TAPP_key key); + bool (*TAPP_check_success)(TAPP_error error); + size_t (*TAPP_explain_error)(TAPP_error error, size_t maxlen, char* message); + TAPP_error (*TAPP_create_executor)(TAPP_executor* exec); + TAPP_error (*TAPP_destroy_executor)(TAPP_executor exec); + TAPP_error (*TAPP_create_handle)(TAPP_handle* handle); + TAPP_error (*TAPP_destroy_handle)(TAPP_handle handle); + TAPP_error (*TAPP_create_tensor_product)(TAPP_tensor_product* plan, + TAPP_handle handle, + TAPP_element_op op_A, + TAPP_tensor_info A, + const int64_t* idx_A, + TAPP_element_op op_B, + TAPP_tensor_info B, + const int64_t* idx_B, + TAPP_element_op op_C, + TAPP_tensor_info C, + const int64_t* idx_C, + TAPP_element_op op_D, + TAPP_tensor_info D, + const int64_t* idx_D, + TAPP_prectype prec); + TAPP_error (*TAPP_destroy_tensor_product)(TAPP_tensor_product plan); + TAPP_error (*TAPP_execute_product)(TAPP_tensor_product plan, + TAPP_executor exec, + TAPP_status* status, + const void* alpha, + const void* A, + const void* B, + const void* beta, + const void* C, + void* D); + TAPP_error (*TAPP_execute_batched_product)(TAPP_tensor_product plan, 
+ TAPP_executor exec, + TAPP_status* status, + int num_batches, + const void* alpha, + const void** A, + const void** B, + const void* beta, + const void** C, + void** D); + TAPP_error (*TAPP_destroy_status)(TAPP_status status); + TAPP_error (*TAPP_create_tensor_info)(TAPP_tensor_info* info, + TAPP_handle handle, + TAPP_datatype type, + int nmode, + const int64_t* extents, + const int64_t* strides); + TAPP_error (*TAPP_destroy_tensor_info)(TAPP_tensor_info info); + int (*TAPP_get_nmodes)(TAPP_tensor_info info); + TAPP_error (*TAPP_set_nmodes)(TAPP_tensor_info info, int nmodes); + void (*TAPP_get_extents)(TAPP_tensor_info info, int64_t* extents); + TAPP_error (*TAPP_set_extents)(TAPP_tensor_info info, const int64_t* extents); + void (*TAPP_get_strides)(TAPP_tensor_info info, int64_t* strides); + TAPP_error (*TAPP_set_strides)(TAPP_tensor_info info, const int64_t* strides); +}; + +int load_implementation(struct impl* impl, const char* path); +void unload_implementation(struct impl* impl); +#else +template +T* run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A, + int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B, + int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C, + int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D, + T alpha, T beta); +template +std::tuple reduce_isolated_indices(tblis::tblis_tensor* tensor, tblis::label_type* idx, int nmode_X, tblis::label_type* idx_X, int nmode_Y, tblis::label_type* idx_Y); +#endif + +template +TAPP_error run_product( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl impl, bool use_device_memory, +#else + bool use_tblis, +#endif + int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A, + int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B, + int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C, + int 
nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D, + T alpha, T beta + ); + template struct is_complex : std::false_type {}; template @@ -30,14 +120,7 @@ template T rand(T min, T max); template T rand(); -template -void run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A, - int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B, - int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C, - int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D, - T alpha, T beta); -template -std::tuple contract_unique_idx(tblis::tblis_tensor* tensor, tblis::label_type* idx, int nmode_1, tblis::label_type* idx_1, int nmode_2, tblis::label_type* idx_2); + template U* change_array_type(T* array, int size); template @@ -57,25 +140,25 @@ std::tuple generate_index_configuration(int nmode_A = -1, int nmode_B = -1, int nmode_D = -1, int contracted_indices = -1, int hadamard_indices = -1, bool hadamard_only = false, bool hadamard_indices_enabled = false, bool isolated_indices_enabled = false, bool repeated_indices_enabled = false); int* generate_unique_indices(int64_t total_unique_indices); -std::tuple assign_indices(int* unique_indices, - int contracted_modes, int hadamard_modes, - int free_indices_A, int free_indices_B, - int isolated_indices_A, int isolated_indices_B, - int repeated_indices_A, int repeated_indices_B); +std::tuple assign_indices(int* unique_indices, + int contracted_modes, int hadamard_modes, + int free_indices_A, int free_indices_B, + int isolated_indices_A, int isolated_indices_B, + int repeated_indices_A, int repeated_indices_B); std::unordered_map generate_index_extent_map(int64_t min_extent, int64_t max_extent, bool equal_extents_only, int64_t total_unique_indices, int* unique_indices); -std::tuple assign_extents(std::unordered_map index_extent_map, - int nmode_A, int64_t* idx_A, - int nmode_B, int64_t* 
idx_B, - int nmode_D, int64_t* idx_D); +std::tuple assign_extents(std::unordered_map index_extent_map, + int nmode_A, int64_t* idx_A, + int nmode_B, int64_t* idx_B, + int nmode_D, int64_t* idx_D); int* choose_stride_signs(int nmode, bool negative_str, bool mixed_str); bool* choose_subtensor_dims(int nmode, int outer_nmode); int64_t* calculate_outer_extents(int outer_nmode, int64_t* extents, bool* subtensor_dims, bool lower_extents); @@ -106,31 +189,134 @@ void add_incorrect_idx(int64_t max_idx, int* nmode, int64_t** idx, int64_t** ext void add_idx(int* nmode, int64_t** idx, int64_t** extents, int64_t** strides, int64_t additional_idx, int64_t additional_extents, int64_t additional_strides); // Tests -bool test_hadamard_product(); -bool test_contraction(); -bool test_commutativity(); -bool test_permutations(); -bool test_equal_extents(); -bool test_outer_product(); -bool test_full_contraction(); -bool test_zero_dim_tensor_contraction(); -bool test_one_dim_tensor_contraction(); -bool test_subtensor_unchanged_nmode(); -bool test_subtensor_lower_nmode(); -bool test_negative_strides(); -bool test_negative_strides_subtensor_unchanged_nmode(); -bool test_negative_strides_subtensor_lower_nmode(); -bool test_mixed_strides(); -bool test_mixed_strides_subtensor_unchanged_nmode(); -bool test_mixed_strides_subtensor_lower_nmode(); -bool test_contraction_double_precision(); -bool test_contraction_complex(); -bool test_contraction_complex_double_precision(); -bool test_zero_stride(); -bool test_isolated_idx(); -bool test_repeated_idx(); -bool test_hadamard_and_free(); -bool test_hadamard_and_contraction(); +bool test_hadamard_product( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_commutativity( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_permutations( +#ifdef TAPP_DYNAMIC_LAUNCH + struct 
impl implA, struct impl implB +#endif + ); +bool test_equal_extents( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_outer_product( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_full_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_zero_dim_tensor_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_one_dim_tensor_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_subtensor_same_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_subtensor_lower_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_negative_strides( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_negative_strides_subtensor_same_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_negative_strides_subtensor_lower_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_mixed_strides( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_mixed_strides_subtensor_same_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_mixed_strides_subtensor_lower_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_contraction_double_precision( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_contraction_complex( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_contraction_complex_double_precision( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_zero_stride( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl 
implB +#endif + ); +bool test_unique_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_repeated_idx( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_hadamard_and_free( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); +bool test_hadamard_and_contraction( +#ifdef TAPP_DYNAMIC_LAUNCH + struct impl implA, struct impl implB +#endif + ); + +#ifndef TAPP_DYNAMIC_LAUNCH // These test does not make sense for other implementations than the reference bool test_error_non_matching_ext(); bool test_error_C_other_structure(); bool test_error_aliasing_within_D(); +#endif