diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 0d1c179..445f266 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -51,7 +51,8 @@ jobs:
         -G Ninja
         -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
         -DCMAKE_UNITY_BUILD=${{ matrix.build_type == 'Debug' || matrix.valgrind }}
-        -DTAPP_REFERENCE_ENABLE_TBLIS=${{ !matrix.valgrind }}
+        -DTAPP_REFERENCE_USE_TBLIS=${{ !matrix.valgrind }}
+
     steps:
     - uses: actions/checkout@v4
 
@@ -90,6 +91,7 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install ninja-build g++-14 liblapack-dev ccache valgrind
+
     - name: Prepare ccache timestamp
       id: ccache_cache_timestamp
       shell: cmake -P {0}
@@ -136,8 +138,8 @@ jobs:
       working-directory: ${{github.workspace}}/build
       shell: bash
       run: |
-        valgrind --error-exitcode=1 --leak-check=full ./tapp-reference-demo
-        valgrind --error-exitcode=1 --leak-check=full ./tapp-reference-driver
+        valgrind --error-exitcode=1 --leak-check=full ./test/tapp-reference-demo
+        valgrind --error-exitcode=1 --leak-check=full ./examples/tapp-reference-driver
 
     - name: Consume from build tree
       if: ${{ !matrix.valgrind && !matrix.sanitize }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 107c6ad..b79ff68 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,8 +39,8 @@ project(tapp
         HOMEPAGE_URL "https://github.com/TAPPOrg/")
 
 # TBLIS requires CXX; enable_language must be called at the top level
-option(TAPP_REFERENCE_ENABLE_TBLIS "Build and link TBLIS and TBLIS bindings" OFF)
-if(TAPP_REFERENCE_ENABLE_TBLIS)
+option(TAPP_REFERENCE_USE_TBLIS "TAPP-Reference will use TBLIS to implement TAPP_product" OFF)
+if(TAPP_REFERENCE_USE_TBLIS)
   include(CheckLanguage)
   check_language(CXX)
   if(CMAKE_CXX_COMPILER)
@@ -65,207 +65,39 @@ set(TAPP_INSTALL_DATADIR "share/tapp/${TAPP_EXT_VERSION}/data"
 set(TAPP_INSTALL_DOCDIR "share/tapp/${TAPP_EXT_VERSION}/doc"
         CACHE PATH "TAPP doc install directory")
 
-# this provides tapp-api target
+# this provides tapp::api target
 add_subdirectory(api)
 
-# this provides tapp-reference target
+# this provides tapp::reference target
 add_subdirectory(reference_implementation)
 
 # ----------------------------------------------------------------------------
-# testing
-
-include(CTest)
-
-if(BUILD_TESTING)
-
-  # ----------------------------------------------------------------------------
-  # TBLIS test
-
-  if(TAPP_REFERENCE_ENABLE_TBLIS)
-    add_executable(tapp-reference-test++)
-
-    target_sources(
-      tapp-reference-test++
-      PRIVATE
-        test/test.cpp
-        test/test.h
-      )
-
-    target_link_libraries(
-      tapp-reference-test++
-      PRIVATE
-        tapp-reference
-        tblis-static
-      )
-
-    set_property(
-      TARGET tapp-reference-test++
-      PROPERTY
-        CXX_STANDARD 20
-        CXX_STANDARD_REQUIRED YES
-        CXX_EXTENSIONS NO
-    )
-
-    add_test(
-      NAME tapp-reference-test++
-      COMMAND $<TARGET_FILE:tapp-reference-test++>
-      )
-  endif()
-
-  # ----------------------------------------------------------------------------
-  # demo
-
-  add_executable(tapp-reference-demo)
-
-  target_sources(
-    tapp-reference-demo
-    PRIVATE
-      test/demo.c
-      test/helpers.c
-      test/helpers.h
-    )
-
-  target_link_libraries(
-    tapp-reference-demo
-    PRIVATE
-      tapp-reference
-    )
-
-  add_test(
-    NAME tapp-reference-demo
-    COMMAND $<TARGET_FILE:tapp-reference-demo>
-    )
-
-  # ----------------------------------------------------------------------------
-  # driver
-
-  add_executable(tapp-reference-driver)
-
-  target_sources(
-    tapp-reference-driver
-    PRIVATE
-      examples/driver/driver.c
-      test/helpers.c
-      test/helpers.h
-    )
-
-  target_include_directories(
-    tapp-reference-driver
-    PRIVATE
-      ${CMAKE_CURRENT_SOURCE_DIR}/test
-    )
-
-  target_link_libraries(
-    tapp-reference-driver
-    PRIVATE
-      tapp-reference
-    )
-
-  add_test(
-    NAME tapp-reference-driver
-    COMMAND $<TARGET_FILE:tapp-reference-driver>
-    )
-
-  # ----------------------------------------------------------------------------
-  # exercise: contraction
-
-  if(TAPP_BUILD_EXERCISE)
-    add_executable(tapp-reference-exercise_contraction)
-
-    target_sources(
-      tapp-reference-exercise_contraction
-      PRIVATE
-        examples/exercise_contraction/exercise_contraction.c
-        test/helpers.c
-        test/helpers.h
-      )
-
-    target_include_directories(
-      tapp-reference-exercise_contraction
-      PRIVATE
-        ${CMAKE_CURRENT_SOURCE_DIR}/test
-      )
-
-    target_link_libraries(
-      tapp-reference-exercise_contraction
-      PRIVATE
-        tapp-reference
-      )
-
-    add_test(
-      NAME tapp-reference-exercise_contraction
-      COMMAND $<TARGET_FILE:tapp-reference-exercise_contraction>
-      )
+# cutensor bindings
+option(TAPP_CUTENSOR "Build cuTensor bindings" OFF)
+if (TAPP_CUTENSOR)
+  # enable_language must be called at the top level
+  include(CheckLanguage)
+  check_language(CXX)
+  if(CMAKE_CXX_COMPILER)
+    enable_language(CXX)
+  else()
+    message(FATAL_ERROR "Cannot build cuTENSOR bindings due to missing CXX language support")
   endif()
+  # since CUDAToolkit will be needed in tests/ also, load it here
+  cmake_minimum_required(VERSION 3.17) # CUDAToolkit
+  find_package(CUDAToolkit REQUIRED)
 
-  # ----------------------------------------------------------------------------
-  # exercise: contraction answers
-
-  add_executable(tapp-reference-exercise_contraction_answers)
-
-  target_sources(
-    tapp-reference-exercise_contraction_answers
-    PRIVATE
-      examples/exercise_contraction/answers/exercise_contraction_answers.c
-      test/helpers.c
-      test/helpers.h
-    )
-
-  target_include_directories(
-    tapp-reference-exercise_contraction_answers
-    PRIVATE
-      ${CMAKE_CURRENT_SOURCE_DIR}/test
-    )
-
-  target_link_libraries(
-    tapp-reference-exercise_contraction_answers
-    PRIVATE
-      tapp-reference
-    )
-
-  add_test(
-    NAME tapp-reference-exercise_contraction_answers
-    COMMAND $<TARGET_FILE:tapp-reference-exercise_contraction_answers>
-    )
-
-  # ----------------------------------------------------------------------------
-  # exercise: tucker
-
-  add_library(tapp-reference-exercise_tucker SHARED)
-
-  target_sources(
-    tapp-reference-exercise_tucker
-    PUBLIC
-      examples/exercise_tucker/tapp_tucker/exercise_tucker.h
-    PRIVATE
-      examples/exercise_tucker/tapp_tucker/exercise_tucker.c
-    )
-
-  target_link_libraries(
-    tapp-reference-exercise_tucker
-    PRIVATE
-      tapp-reference
-    )
-
-  # ----------------------------------------------------------------------------
-  # exercise: tucker answers
-
-  add_library(tapp-reference-exercise_tucker_answers SHARED)
+  add_subdirectory(cutensor_bindings)
+endif()
 
-  target_sources(
-    tapp-reference-exercise_tucker_answers
-    PUBLIC
-      examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.h
-    PRIVATE
-      examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c
-    )
+# ----------------------------------------------------------------------------
+# testing
 
-  target_link_libraries(
-    tapp-reference-exercise_tucker_answers
-    PRIVATE
-      tapp-reference
-    )
+include(CTest)
 
+if(BUILD_TESTING)
+  add_subdirectory(test)
+  add_subdirectory(examples)
 endif()
 
 # ============================================================================
diff --git a/api/include/tapp/tensor.h b/api/include/tapp/tensor.h
index 68bf287..113022d 100644
--- a/api/include/tapp/tensor.h
+++ b/api/include/tapp/tensor.h
@@ -3,6 +3,7 @@
 
 #include <stdint.h>
 
+#include "handle.h"
 #include "util.h"
 #include "error.h"
 #include "datatype.h"
@@ -20,6 +21,7 @@ typedef intptr_t TAPP_tensor_info;
  */ 
 
 TAPP_EXPORT TAPP_error TAPP_create_tensor_info(TAPP_tensor_info* info,
+                                               TAPP_handle handle,
                                                TAPP_datatype type,
                                                int nmode,
                                                const int64_t* extents,
diff --git a/cutensor_bindings/CMakeLists.txt b/cutensor_bindings/CMakeLists.txt
new file mode 100644
index 0000000..08dbf6f
--- /dev/null
+++ b/cutensor_bindings/CMakeLists.txt
@@ -0,0 +1,73 @@
+# cuTENSOR is not part of the CUDA toolkit; look for it separately
+if(NOT TARGET cutensor::cutensor)
+  find_path(CUTENSOR_INCLUDE_DIR
+    NAMES cutensor.h
+    HINTS ${CUTENSOR_ROOT} ENV CUTENSOR_ROOT
+          ${CUDAToolkit_LIBRARY_ROOT}
+    PATH_SUFFIXES include
+  )
+  find_library(CUTENSOR_LIBRARY
+    NAMES cutensor
+    HINTS ${CUTENSOR_ROOT} ENV CUTENSOR_ROOT
+          ${CUDAToolkit_LIBRARY_ROOT}
+    PATH_SUFFIXES lib lib64 lib/${CMAKE_LIBRARY_ARCHITECTURE}
+  )
+
+  if(NOT CUTENSOR_INCLUDE_DIR OR NOT CUTENSOR_LIBRARY)
+    message(FATAL_ERROR "cuTENSOR not found; set CUTENSOR_ROOT to the cuTENSOR installation prefix")
+  endif()
+  message(STATUS "Found cuTENSOR: ${CUTENSOR_LIBRARY}")
+  message(STATUS "cuTENSOR include: ${CUTENSOR_INCLUDE_DIR}")
+
+  add_library(cutensor::cutensor UNKNOWN IMPORTED GLOBAL)
+  set_target_properties(cutensor::cutensor PROPERTIES
+    IMPORTED_LOCATION "${CUTENSOR_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${CUTENSOR_INCLUDE_DIR}"
+  )
+endif()
+
+add_library(tapp-cutensor SHARED)
+set_property(TARGET tapp-cutensor PROPERTY EXPORT_NAME cutensor)
+add_library(tapp::cutensor ALIAS tapp-cutensor)
+target_link_libraries(
+        cutensor::cutensor
+        INTERFACE
+        CUDA::cudart
+)
+
+target_sources(tapp-cutensor
+  PRIVATE
+    src/attributes.cpp
+    src/datatype.cpp
+    src/error.cpp
+    src/executor.cpp
+    src/handle.cpp
+    src/product.cpp
+    src/tensor.cpp
+)
+
+set_target_properties(tapp-cutensor PROPERTIES
+  POSITION_INDEPENDENT_CODE ON
+  CXX_STANDARD 20
+  CXX_STANDARD_REQUIRED YES
+)
+
+target_include_directories(tapp-cutensor
+  PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+)
+
+target_link_libraries(tapp-cutensor
+  PUBLIC
+    tapp::api
+  PRIVATE
+    cutensor::cutensor
+    CUDA::cudart
+)
+
+install(TARGETS tapp-cutensor EXPORT tapp
+  COMPONENT cutensor)
+
+if(APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "^(Clang|AppleClang)$") # target is C++-only, so test the CXX compiler
+  target_link_options(tapp-cutensor PRIVATE "-undefined;dynamic_lookup") # defer unresolved symbols to load time
+endif()
diff --git a/cutensor_bindings/include/attributes.h b/cutensor_bindings/include/attributes.h
new file mode 100644
index 0000000..059d3dc
--- /dev/null
+++ b/cutensor_bindings/include/attributes.h
@@ -0,0 +1,12 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_ATTRIBUTES_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_ATTRIBUTES_H_
+
+#include <tapp/attributes.h>
+
+#include <cstring>
+
+#include "handle.h"
+
+#define ATTR_KEY_USE_DEVICE_MEMORY 0
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_ATTRIBUTES_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/datatype.h b/cutensor_bindings/include/datatype.h
new file mode 100644
index 0000000..dbebf13
--- /dev/null
+++ b/cutensor_bindings/include/datatype.h
@@ -0,0 +1,16 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_DATATYPE_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_DATATYPE_H_
+
+#include <tapp/datatype.h>
+
+#include <cutensor.h>
+
+#include <complex>
+
+cutensorDataType_t translate_datatype(TAPP_datatype type);
+
+cutensorComputeDescriptor_t translate_prectype(TAPP_prectype prec, TAPP_datatype datatype);
+
+size_t sizeof_datatype(TAPP_datatype type);
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_DATATYPE_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/error.h b/cutensor_bindings/include/error.h
new file mode 100644
index 0000000..219195e
--- /dev/null
+++ b/cutensor_bindings/include/error.h
@@ -0,0 +1,15 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_ERROR_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_ERROR_H_
+
+#include <tapp/error.h>
+
+#include <cutensor.h>
+
+#include <cstring>
+#include <string>
+
+int pack_error(int current_value, int tapp_err);
+int pack_error(int current_value, cutensorStatus_t e); 
+int pack_error(int current_value, cudaError_t e);
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_ERROR_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/executor.h b/cutensor_bindings/include/executor.h
new file mode 100644
index 0000000..3480deb
--- /dev/null
+++ b/cutensor_bindings/include/executor.h
@@ -0,0 +1,8 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_EXECUTOR_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_EXECUTOR_H_
+
+#include <tapp/executor.h>
+
+#include "error.h"
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_EXECUTOR_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/handle.h b/cutensor_bindings/include/handle.h
new file mode 100644
index 0000000..6b70173
--- /dev/null
+++ b/cutensor_bindings/include/handle.h
@@ -0,0 +1,16 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_HANDLE_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_HANDLE_H_
+
+#include <tapp/handle.h>
+
+#include <cutensor.h>
+
+#include "error.h"
+
+struct handle // backend object that a TAPP_handle is cast to/from (see TAPP_create_handle)
+{
+    cutensorHandle_t* libhandle; // heap-allocated cuTENSOR handle, initialised with cutensorCreate
+    intptr_t* attributes; // attribute table; slot 0 stores the address of a heap-allocated bool
+};
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_HANDLE_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/product.h b/cutensor_bindings/include/product.h
new file mode 100644
index 0000000..c89283c
--- /dev/null
+++ b/cutensor_bindings/include/product.h
@@ -0,0 +1,40 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_PRODUCT_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_PRODUCT_H_
+
+#include <tapp/product.h>
+
+#include <cutensor.h>
+
+#include <vector>
+#include <algorithm>
+#include <assert.h>
+#include <cstring>
+
+#include "error.h"
+#include "handle.h"
+#include "tensor.h"
+#include "attributes.h"
+
+struct product_plan
+{
+    int64_t data_offset_A;
+    size_t copy_size_A;
+    int64_t data_offset_B;
+    size_t copy_size_B;
+    int64_t data_offset_C;
+    size_t copy_size_C;
+    int64_t data_offset_D;
+    size_t copy_size_D;
+    int64_t sections_D;
+    int64_t section_size_D;
+    int64_t sections_nmode_D;
+    int64_t* section_extents_D;
+    int64_t* section_strides_D;
+    TAPP_datatype type_D;
+    TAPP_element_op op_D;
+    cutensorPlan_t* contraction_plan;
+    cutensorPlan_t* permutation_plan;
+    TAPP_handle handle;
+};
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_PRODUCT_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/include/tensor.h b/cutensor_bindings/include/tensor.h
new file mode 100644
index 0000000..2cb6f7e
--- /dev/null
+++ b/cutensor_bindings/include/tensor.h
@@ -0,0 +1,26 @@
+#ifndef TAPP_REF_IMPL_CUTENSOR_BINDINGS_TENSOR_H_
+#define TAPP_REF_IMPL_CUTENSOR_BINDINGS_TENSOR_H_
+
+#include <tapp/tensor.h>
+
+#include <cutensor.h>
+
+#include <cstring>
+
+#include "error.h"
+#include "handle.h"
+#include "datatype.h"
+
+struct tensor_info
+{
+    int nmode;
+    int64_t *extents;
+    int64_t *strides;
+    size_t elements;
+    size_t copy_size;
+    int64_t data_offset;
+    TAPP_datatype type;
+    cutensorTensorDescriptor_t* desc;
+};
+
+#endif /* TAPP_REF_IMPL_CUTENSOR_BINDINGS_TENSOR_H_ */
\ No newline at end of file
diff --git a/cutensor_bindings/src/attributes.cpp b/cutensor_bindings/src/attributes.cpp
new file mode 100644
index 0000000..203a2bb
--- /dev/null
+++ b/cutensor_bindings/src/attributes.cpp
@@ -0,0 +1,49 @@
+#include "../include/attributes.h"
+
+TAPP_error TAPP_attr_set(TAPP_attr attr, TAPP_key key, void* value)
+{
+    // Only ATTR_KEY_USE_DEVICE_MEMORY is recognised; any other key is rejected.
+    if (key != ATTR_KEY_USE_DEVICE_MEMORY)
+    {
+        return 15; // Invalid key
+    }
+    // attr is treated as this backend's struct handle. The attribute slot stores the
+    // address of a bool; sizeof(bool) bytes are copied into it from value.
+    struct handle* owner = (struct handle*) attr;
+    bool* stored_flag = (bool*) owner->attributes[ATTR_KEY_USE_DEVICE_MEMORY];
+    memcpy(stored_flag, value, sizeof(bool));
+    return 0;
+}
+
+TAPP_error TAPP_attr_get(TAPP_attr attr, TAPP_key key, void** value)
+{
+    // Reads an attribute of the handle; only ATTR_KEY_USE_DEVICE_MEMORY is recognised.
+    if (key != ATTR_KEY_USE_DEVICE_MEMORY)
+    {
+        return 15; // Invalid key
+    }
+    struct handle* owner = (struct handle*) attr;
+    const bool* stored_flag = (const bool*) owner->attributes[ATTR_KEY_USE_DEVICE_MEMORY];
+    // Copies the flag's sizeof(bool) bytes to the address passed as value (not to *value).
+    // NOTE(review): confirm callers pass the address of a bool cast to void**.
+    memcpy(value, stored_flag, sizeof(bool));
+    return 0;
+}
+
+TAPP_error TAPP_attr_clear(TAPP_attr attr, TAPP_key key)
+{
+    // Resets an attribute to its cleared value; only ATTR_KEY_USE_DEVICE_MEMORY exists.
+    if (key != ATTR_KEY_USE_DEVICE_MEMORY)
+    {
+        return 15; // Invalid key
+    }
+
+    struct handle* owner = (struct handle*) attr;
+    bool* stored_flag = (bool*) owner->attributes[ATTR_KEY_USE_DEVICE_MEMORY];
+
+    // The cleared value is false. NOTE(review): TAPP_create_handle starts the flag at
+    // true, so clearing does not restore the creation-time value — confirm intended.
+    const bool cleared_value = false;
+    memcpy(stored_flag, &cleared_value, sizeof(bool));
+    return 0;
+}
\ No newline at end of file
diff --git a/cutensor_bindings/src/datatype.cpp b/cutensor_bindings/src/datatype.cpp
new file mode 100644
index 0000000..2a63229
--- /dev/null
+++ b/cutensor_bindings/src/datatype.cpp
@@ -0,0 +1,92 @@
+#include "../include/datatype.h"
+
+cutensorDataType_t translate_datatype(TAPP_datatype type)
+{
+    cutensorDataType_t cutype; // cuTENSOR tag corresponding to the TAPP element type
+    switch (type)
+    {
+    case TAPP_F64:
+        cutype = CUTENSOR_R_64F;
+        break;
+    case TAPP_C32:
+        cutype = CUTENSOR_C_32F;
+        break;
+    case TAPP_C64:
+        cutype = CUTENSOR_C_64F;
+        break;
+    case TAPP_F16:
+        cutype = CUTENSOR_R_16F;
+        break;
+    case TAPP_BF16:
+        cutype = CUTENSOR_R_16BF;
+        break;
+    case TAPP_F32:
+    default: // TODO: Default should probably be an error
+        cutype = CUTENSOR_R_32F;
+        break;
+    }
+    return cutype;
+}
+
+cutensorComputeDescriptor_t translate_prectype(TAPP_prectype prec, TAPP_datatype datatype)
+{
+    // Explicit precision requests map directly onto a cuTENSOR compute descriptor;
+    // datatype is only consulted when the default precision is requested.
+    if (prec == TAPP_F32F32_ACCUM_F32)
+    {
+        return CUTENSOR_COMPUTE_DESC_32F;
+    }
+    if (prec == TAPP_F64F64_ACCUM_F64)
+    {
+        return CUTENSOR_COMPUTE_DESC_64F;
+    }
+    if (prec == TAPP_F16F16_ACCUM_F16)
+    {
+        return CUTENSOR_COMPUTE_DESC_16F;
+    }
+    if (prec != TAPP_DEFAULT_PREC)
+    {
+        // TODO: Default should probably be an error
+        return CUTENSOR_COMPUTE_DESC_32F;
+    }
+
+    // Default precision: follow the element type of the data.
+    switch (datatype)
+    {
+    case TAPP_F64:
+    case TAPP_C64:
+        return CUTENSOR_COMPUTE_DESC_64F;
+    case TAPP_F32:
+    case TAPP_C32:
+    default: // TODO: Default should probably be an error
+        return CUTENSOR_COMPUTE_DESC_32F;
+    }
+}
+
+// Returns the size in bytes of one element of the given TAPP datatype.
+// Unknown datatypes currently report sizeof(float); see the TODO below.
+size_t sizeof_datatype(TAPP_datatype type)
+{
+    switch (type)
+    {
+    case TAPP_F32:
+        return sizeof(float);
+    case TAPP_F64:
+        return sizeof(double);
+    case TAPP_C32:
+        return sizeof(std::complex<float>);
+    case TAPP_C64:
+        return sizeof(std::complex<double>);
+    case TAPP_F16:
+    case TAPP_BF16:
+        // translate_datatype maps these to CUTENSOR_R_16F / CUTENSOR_R_16BF, which are
+        // 16-bit formats. A literal is used because no host type for them is in scope
+        // here. Before this fix both fell through to the 4-byte default, so any byte
+        // count derived from this function was twice the real size for half tensors.
+        return 2;
+    default:
+        // TODO: Default should probably be an error
+        return sizeof(float);
+    }
+}
+// NOTE(review): translate_prectype has no DEFAULT_PREC case for F16/BF16; it picks 32F.
\ No newline at end of file
diff --git a/cutensor_bindings/src/error.cpp b/cutensor_bindings/src/error.cpp
new file mode 100644
index 0000000..8c239aa
--- /dev/null
+++ b/cutensor_bindings/src/error.cpp
@@ -0,0 +1,133 @@
+#include "../include/error.h"
+
+// pack multiple types of error codes into one int: TAPP | cuTENSOR | CUDA, from the lowest bit up
+constexpr int TAPP_BITS   = 5; // width of the TAPP field (bits 0-4)
+constexpr int CUTENSOR_BITS = 9; // width of the cuTENSOR field (bits 5-13)
+constexpr int CUTENSOR_OFFS = TAPP_BITS;    // 5
+constexpr int CUDA_OFFS   = CUTENSOR_OFFS + CUTENSOR_BITS; // 14; the CUDA field is every bit from here up
+constexpr uint64_t TAPP_FIELD_MASK   = (1ULL << TAPP_BITS) - 1; // 0x1F
+constexpr uint64_t CUTENSOR_FIELD_MASK = ((1ULL << CUTENSOR_BITS) - 1) << CUTENSOR_OFFS; // 0x3FE0
+constexpr uint64_t TAPP_CLEAR_MASK   = ~TAPP_FIELD_MASK; // AND with this to zero the TAPP field only
+constexpr uint64_t CUTENSOR_CLEAR_MASK = ~CUTENSOR_FIELD_MASK; // AND with this to zero the cuTENSOR field only
+
+// Returns true exactly when the packed error code is 0, i.e. no error field is set.
+bool TAPP_check_success(TAPP_error error) {
+    return error == 0;
+}
+
+// Builds a text description of a packed error code (TAPP, cuTENSOR and CUDA fields) and copies it into message.
+size_t TAPP_explain_error(TAPP_error error,
+                          size_t maxlen,
+                          char* message) {
+    // Returns the full text length when maxlen is 0, otherwise the number of characters written (NUL excluded).
+    std::string str = "";
+
+    if (error == 0) {
+        str += "Success.";
+    }
+    uint64_t code = static_cast<uint64_t>(error);
+
+    //1. Extract TAPP (Bottom 5 bits)
+    uint64_t tappVal = code & TAPP_FIELD_MASK;
+    if (tappVal != 0) {
+        str += " [TAPP Error]: ";
+        switch (tappVal)
+        {
+        case 1:
+            str += "The extents for the indices shared between tensor A and B does not match.";
+            break;
+        case 2:
+            str += "The extents for the indices shared between tensor A and D does not match.";
+            break;
+        case 3:
+            str += "The extents for the indices shared between tensor B and D does not match.";
+            break;
+        case 4:
+            str += "Tensor D has indices not shared with tensor A or B.";
+            break;
+        case 5:
+            str += "The tensors C and D have different amount of dimensions.";
+            break;
+        case 6:
+            str += "The indices of tensor C and D does not line up.";
+            break;
+        case 7:
+            str += "The extents for the indices shared between tensor C and D does not match.";
+            break;
+        case 8:
+            str += "Aliasing found within tensor D.";
+            break;
+        case 9:
+            str += "An idx in tensor A has two different extents.";
+            break;
+        case 10:
+            str += "An idx in tensor B has two different extents.";
+            break;
+        case 11:
+            str += "An idx in tensor D has two different extents.";
+            break;
+        case 12:
+            str += "C should not be NULL while beta is not zero.";
+            break;
+        case 13:
+            str += "Nmode can not be negative.";
+            break;
+        case 14:
+            str += "Extents can not be negative.";
+            break;
+        case 15:
+            str += "Invalid attribute key.";
+            break;
+        default:
+            str += "Unknown TAPP error code.";
+            break;
+        }
+    }
+
+    //2. Extract cuTENSOR (Middle 9 bits)
+    uint64_t cutensorVal = (code & CUTENSOR_FIELD_MASK) >> CUTENSOR_OFFS;
+    if (cutensorVal != 0) {
+        cutensorStatus_t ts = static_cast<cutensorStatus_t>(cutensorVal);
+        str += " [cuTENSOR Status]: ";
+        str += cutensorGetErrorString(ts);
+    }
+
+    //3. Extract CUDA (Top 18 bits)
+    int cudaVal = (code >> CUDA_OFFS); // everything above the cuTENSOR field
+    if (cudaVal != 0) {
+        cudaError_t cs = static_cast<cudaError_t>(cudaVal);
+        str += " [CUDA Error]: ";
+        str += cudaGetErrorString(cs);
+    }
+
+    const char* error_message = str.c_str();
+    size_t message_len = strlen(error_message);
+    if (maxlen == 0) { // size query: message is not touched
+        return message_len;
+    }
+    size_t writelen = maxlen - 1 < message_len ? maxlen - 1 : message_len; // truncate, leaving one byte for the terminator
+    strncpy(message, error_message, writelen);
+    message[writelen] = '\0';
+    return writelen;
+}
+
+
+int pack_error(int current_value, int tapp_err) {
+    // Clear the TAPP field of current_value, then OR in tapp_err (not masked to the field width).
+    const uint64_t kept_bits = static_cast<uint64_t>(current_value) & TAPP_CLEAR_MASK;
+    return static_cast<int>(kept_bits | static_cast<uint64_t>(tapp_err));
+}
+
+int pack_error(int current_value, cutensorStatus_t e) {
+    // Clear the cuTENSOR field of current_value, then OR in e shifted to CUTENSOR_OFFS.
+    const uint64_t kept_bits = static_cast<uint64_t>(current_value) & CUTENSOR_CLEAR_MASK;
+    return static_cast<int>(kept_bits | (static_cast<uint64_t>(e) << CUTENSOR_OFFS));
+}
+
+int pack_error(int current_value, cudaError_t e) {
+    // Store e in the CUDA field (bits CUDA_OFFS and up) while keeping the TAPP and cuTENSOR fields.
+    // The previous mask was inverted: it dropped the two low fields and kept the stale CUDA bits.
+    uint64_t val = static_cast<uint64_t>(current_value);
+    uint64_t LOW_FIELDS_MASK = TAPP_FIELD_MASK | CUTENSOR_FIELD_MASK;
+    return static_cast<int>((val & LOW_FIELDS_MASK) | (static_cast<uint64_t>(e) << CUDA_OFFS));
+}
diff --git a/cutensor_bindings/src/executor.cpp b/cutensor_bindings/src/executor.cpp
new file mode 100644
index 0000000..19c1f41
--- /dev/null
+++ b/cutensor_bindings/src/executor.cpp
@@ -0,0 +1,21 @@
+#include "../include/executor.h"
+
+TAPP_error TAPP_create_executor(TAPP_executor* exec)
+{
+    // The stream object lives on the heap so its address can be carried in the opaque TAPP_executor.
+    cudaStream_t* stream = (cudaStream_t*)malloc(sizeof(cudaStream_t));
+    cudaError_t cerr = cudaStreamCreate(stream);
+    if (cerr != cudaSuccess) { free(stream); return pack_error(0, cerr); } // do not leak the wrapper; *exec stays untouched
+    *exec = (TAPP_executor)stream;
+    return pack_error(0, cerr);
+}
+
+TAPP_error TAPP_destroy_executor(TAPP_executor exec)
+{
+    // exec carries the address of the malloc'd cudaStream_t stored by TAPP_create_executor.
+    cudaStream_t* stream_ptr = (cudaStream_t*)exec;
+    const cudaError_t status = cudaStreamDestroy(*stream_ptr);
+    // NOTE(review): when destruction fails the wrapper allocation is not freed — confirm that is intended.
+    if (status == cudaSuccess) free(stream_ptr);
+    return pack_error(0, status);
+}
diff --git a/cutensor_bindings/src/handle.cpp b/cutensor_bindings/src/handle.cpp
new file mode 100644
index 0000000..c1ea80b
--- /dev/null
+++ b/cutensor_bindings/src/handle.cpp
@@ -0,0 +1,34 @@
+#include "../include/handle.h"
+
+TAPP_error TAPP_create_handle(TAPP_handle* handle)
+{
+    cutensorHandle_t* cutensor_handle = new cutensorHandle_t;
+    const cutensorStatus_t status = cutensorCreate(cutensor_handle);
+    if (status != CUTENSOR_STATUS_SUCCESS)
+    {
+        delete cutensor_handle;
+        return pack_error(0, status);
+    }
+    // Wrap the library handle together with a one-slot attribute table; slot 0 points at a heap bool set to true.
+    struct handle* created = new struct handle;
+    created->libhandle = cutensor_handle;
+    created->attributes = new intptr_t[1];
+    created->attributes[0] = (intptr_t) new bool(true);
+    *handle = (TAPP_handle) created;
+    return 0;
+}
+
+TAPP_error TAPP_destroy_handle(TAPP_handle handle)
+{
+    struct handle* doomed = (struct handle*) handle;
+    const cutensorStatus_t status = cutensorDestroy(*doomed->libhandle);
+    if (status == CUTENSOR_STATUS_SUCCESS)
+    { // release what TAPP_create_handle allocated; on failure nothing is freed
+        delete (bool*)doomed->attributes[0];
+        delete[] doomed->attributes;
+        delete doomed->libhandle;
+        delete doomed;
+        return 0;
+    }
+    return pack_error(0, status);
+}
\ No newline at end of file
diff --git a/cutensor_bindings/src/product.cpp b/cutensor_bindings/src/product.cpp
new file mode 100644
index 0000000..c441e91
--- /dev/null
+++ b/cutensor_bindings/src/product.cpp
@@ -0,0 +1,385 @@
+#include "../include/product.h"
+
+int64_t compute_index(const int64_t* coordinates, int nmode, const int64_t* strides);
+void increment_coordinates(int64_t* coordinates, int nmode, const int64_t* extents);
+cutensorOperator_t translate_operator(TAPP_element_op op);
+
+TAPP_error TAPP_create_tensor_product(TAPP_tensor_product* plan,
+                                      TAPP_handle handle,
+                                      TAPP_element_op op_A,
+                                      TAPP_tensor_info A,
+                                      const int64_t* idx_A,
+                                      TAPP_element_op op_B,
+                                      TAPP_tensor_info B,
+                                      const int64_t* idx_B,
+                                      TAPP_element_op op_C,
+                                      TAPP_tensor_info C,
+                                      const int64_t* idx_C,
+                                      TAPP_element_op op_D,
+                                      TAPP_tensor_info D,
+                                      const int64_t* idx_D,
+                                      TAPP_prectype prec)
+{
+    // Builds the cuTENSOR plans behind a TAPP tensor product and stores the result in *plan.
+    // A contraction plan combines A, B and C into D; a second, permutation plan applies op_D.
+    // The layout of D is also split into contiguous sections so that TAPP_execute_product
+    // can copy the result back to host memory section by section.
+    // Returns pack_error(0, status) for the first failing cuTENSOR call, or for the last call.
+    // NOTE(review): the early error returns do not release plan_struct or the cuTENSOR
+    // objects created before the failure.
+    struct product_plan* plan_struct = new struct product_plan;
+    plan_struct->handle = handle;
+    struct handle* handle_struct = (struct handle*) plan_struct->handle;
+    struct tensor_info* info_D = (struct tensor_info*) D;
+    const int nmode_D = TAPP_get_nmodes(D);
+    // The 64-bit TAPP index labels are converted to the int32_t arrays passed to cuTENSOR.
+    std::vector<int32_t> cuidx_A(idx_A, idx_A + TAPP_get_nmodes(A));
+    std::vector<int32_t> cuidx_B(idx_B, idx_B + TAPP_get_nmodes(B));
+    std::vector<int32_t> cuidx_C(idx_C, idx_C + TAPP_get_nmodes(C));
+    std::vector<int32_t> cuidx_D(idx_D, idx_D + nmode_D);
+
+    cutensorStatus_t err;
+    cutensorOperationDescriptor_t contraction_desc;
+    err = cutensorCreateContraction(*handle_struct->libhandle,
+                &contraction_desc,
+                *((struct tensor_info*)A)->desc, cuidx_A.data(), translate_operator(op_A),
+                *((struct tensor_info*)B)->desc, cuidx_B.data(), translate_operator(op_B),
+                *((struct tensor_info*)C)->desc, cuidx_C.data(), translate_operator(op_C),
+                *info_D->desc, cuidx_D.data(),
+                translate_prectype(prec, info_D->type));
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+
+    // Sanity check: the scalar type reported for the contraction must equal D's data type.
+    cutensorDataType_t scalarType;
+    err = cutensorOperationDescriptorGetAttribute(*handle_struct->libhandle, contraction_desc,
+                CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE, (void*)&scalarType, sizeof(scalarType));
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+    assert(scalarType == translate_datatype(info_D->type));
+
+    // Input and output of the permutation are both D with the same modes, so it only applies op_D.
+    cutensorOperationDescriptor_t permutation_desc;
+    err = cutensorCreatePermutation(*handle_struct->libhandle,
+                &permutation_desc,
+                *info_D->desc, cuidx_D.data(), translate_operator(op_D),
+                *info_D->desc, cuidx_D.data(),
+                translate_prectype(prec, info_D->type));
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+
+    err = cutensorOperationDescriptorGetAttribute(*handle_struct->libhandle, permutation_desc,
+                CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE, (void*)&scalarType, sizeof(scalarType));
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+    assert(scalarType == translate_datatype(info_D->type));
+
+    cutensorPlanPreference_t plan_pref;
+    err = cutensorCreatePlanPreference(*handle_struct->libhandle, &plan_pref,
+                CUTENSOR_ALGO_DEFAULT, CUTENSOR_JIT_MODE_NONE);
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+
+    // The status of the estimate is checked so a failed query cannot silently leave the size at 0.
+    uint64_t workspace_size_estimate = 0;
+    err = cutensorEstimateWorkspaceSize(*handle_struct->libhandle, contraction_desc,
+                plan_pref, CUTENSOR_WORKSPACE_DEFAULT, &workspace_size_estimate);
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+
+    plan_struct->contraction_plan = new cutensorPlan_t;
+    err = cutensorCreatePlan(*handle_struct->libhandle,
+                plan_struct->contraction_plan,
+                contraction_desc,
+                plan_pref,
+                workspace_size_estimate);
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+
+    // The permutation plan reuses the workspace limit estimated for the contraction.
+    plan_struct->permutation_plan = new cutensorPlan_t;
+    err = cutensorCreatePlan(*handle_struct->libhandle,
+                plan_struct->permutation_plan,
+                permutation_desc,
+                plan_pref,
+                workspace_size_estimate);
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+
+    plan_struct->data_offset_A = ((struct tensor_info*)A)->data_offset;
+    plan_struct->copy_size_A = ((struct tensor_info*)A)->copy_size;
+    plan_struct->data_offset_B = ((struct tensor_info*)B)->data_offset;
+    plan_struct->copy_size_B = ((struct tensor_info*)B)->copy_size;
+    plan_struct->data_offset_C = ((struct tensor_info*)C)->data_offset;
+    plan_struct->copy_size_C = ((struct tensor_info*)C)->copy_size;
+    plan_struct->data_offset_D = info_D->data_offset;
+    plan_struct->copy_size_D = info_D->copy_size;
+    plan_struct->sections_D = 1;
+    plan_struct->section_size_D = 1;
+    plan_struct->sections_nmode_D = 0;
+    plan_struct->section_strides_D = new int64_t[nmode_D];
+    plan_struct->section_extents_D = new int64_t[nmode_D];
+    plan_struct->type_D = info_D->type;
+    plan_struct->op_D = op_D;
+    // Visit the modes of D from smallest to largest |stride|. A mode whose |stride| equals
+    // the size accumulated so far extends the contiguous section; any other mode with an
+    // extent other than 1 becomes a section dimension that the copy-back loop iterates over.
+    std::vector<int64_t> sorted_strides_D(info_D->strides, info_D->strides + nmode_D);
+    auto compare = [](int64_t a, int64_t b) { return std::abs(a) < std::abs(b); };
+    std::sort(sorted_strides_D.begin(), sorted_strides_D.end(), compare);
+    for (int i = 0; i < nmode_D; i++)
+    {
+        for (int j = 0; j < nmode_D; j++) // find the mode j that owns the i-th smallest stride
+        {
+            if (info_D->strides[j] == sorted_strides_D[i])
+            {
+                if (std::abs(sorted_strides_D[i]) == plan_struct->section_size_D)
+                {
+                    // Index the extents with j (the mode), not i (the position in the sorted strides).
+                    plan_struct->section_size_D *= std::abs(info_D->extents[j]);
+                }
+                else if (info_D->extents[j] != 1) // an extent-1 mode never uses its stride, so it needs no section
+                {
+                    plan_struct->sections_D *= info_D->extents[j];
+                    plan_struct->section_extents_D[plan_struct->sections_nmode_D] = info_D->extents[j];
+                    plan_struct->section_strides_D[plan_struct->sections_nmode_D] = info_D->strides[j];
+                    plan_struct->sections_nmode_D++;
+                }
+                break;
+            }
+        }
+    }
+    plan_struct->section_size_D *= sizeof_datatype(info_D->type); // elements -> bytes for the copy-back
+    *plan = (TAPP_tensor_product) plan_struct;
+    err = cutensorDestroyOperationDescriptor(contraction_desc);
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+    err = cutensorDestroyOperationDescriptor(permutation_desc);
+    if (err != CUTENSOR_STATUS_SUCCESS) return pack_error(0, err);
+    err = cutensorDestroyPlanPreference(plan_pref);
+    return pack_error(0, err);
+}
+
+TAPP_error TAPP_destroy_tensor_product(TAPP_tensor_product plan)
+{
+    // Destroys both cuTENSOR plans and frees the plan; stops at the first cuTENSOR failure.
+    struct product_plan* doomed = (struct product_plan*) plan;
+    cutensorStatus_t status = cutensorDestroyPlan(*doomed->contraction_plan);
+    if (status != CUTENSOR_STATUS_SUCCESS) return pack_error(0, status);
+    delete doomed->contraction_plan;
+    status = cutensorDestroyPlan(*doomed->permutation_plan);
+    if (status != CUTENSOR_STATUS_SUCCESS) return pack_error(0, status);
+    delete doomed->permutation_plan;
+    delete[] doomed->section_extents_D;
+    delete[] doomed->section_strides_D;
+    delete doomed;
+    return pack_error(0, status);
+}
+ 
+TAPP_error TAPP_execute_product(TAPP_tensor_product plan,
+                                TAPP_executor exec,
+                                TAPP_status* status,
+                                const void* alpha,
+                                const void* A,
+                                const void* B,
+                                const void* beta,
+                                const void* C,
+                                      void* D)
+{
+    // Runs the planned product on the CUDA stream behind `exec` and writes the result to D.
+    // If the handle's use-device-memory attribute is false, A, B, C and D are host pointers:
+    // the inputs are staged in temporary device buffers and D is copied back in sections.
+    // `status` is not used by this implementation.
+    // Returns pack_error(0, code) with the first failing CUDA or cuTENSOR code, else success.
+    struct product_plan* plan_struct = (struct product_plan*) plan;
+    struct handle* handle_struct = (struct handle*) plan_struct->handle;
+    const cudaStream_t stream = *(cudaStream_t*)exec;
+    const bool use_device_memory = *(bool*)((handle_struct->attributes)[ATTR_KEY_USE_DEVICE_MEMORY]);
+    const bool do_permutation = (plan_struct->op_D != TAPP_IDENTITY);
+    cudaError_t cerr;
+    cutensorStatus_t err;
+
+    // Every temporary device allocation is recorded in `owned` so that each exit path,
+    // including the error returns, frees it instead of leaking it.
+    void* owned[6] = {nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
+    int owned_count = 0;
+    auto device_alloc = [&](void** ptr, size_t bytes) -> cudaError_t
+    {
+        const cudaError_t alloc_err = cudaMallocAsync(ptr, bytes, stream);
+        if (alloc_err == cudaSuccess && *ptr != nullptr) owned[owned_count++] = *ptr;
+        return alloc_err;
+    };
+    auto release_owned = [&]() -> cudaError_t
+    {
+        cudaError_t first_err = cudaSuccess;
+        for (int i = 0; i < owned_count; i++)
+        {
+            const cudaError_t free_err = cudaFreeAsync(owned[i], stream);
+            if (first_err == cudaSuccess) first_err = free_err;
+        }
+        owned_count = 0;
+        return first_err;
+    };
+
+    // data_offset_X is the byte offset (zero or negative) from element 0 of a tensor to its
+    // lowest address. A staged buffer starts at that lowest address, so element 0 lies at
+    // the buffer start minus the offset; that is the pointer cuTENSOR is given.
+    void* E_d = nullptr; // scratch output of the contraction while op_D is still pending
+    if (do_permutation)
+    {
+        cerr = device_alloc(&E_d, plan_struct->copy_size_D);
+        if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+        E_d = (void*)((intptr_t)E_d - plan_struct->data_offset_D);
+    }
+
+    void *A_d = nullptr, *B_d = nullptr, *C_d = nullptr, *D_d = nullptr;
+    if (use_device_memory)
+    {
+        A_d = (void*)A;
+        B_d = (void*)B;
+        C_d = (void*)C;
+        D_d = (void*)D;
+    }
+    else
+    {
+        cerr = device_alloc(&A_d, plan_struct->copy_size_A);
+        if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+        cerr = device_alloc(&B_d, plan_struct->copy_size_B);
+        if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+        cerr = device_alloc(&C_d, plan_struct->copy_size_C);
+        if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+        cerr = device_alloc(&D_d, plan_struct->copy_size_D);
+        if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+        // Stage the inputs: each copy starts at the lowest host address of the tensor.
+        cerr = cudaMemcpyAsync(A_d, (void*)((intptr_t)A + plan_struct->data_offset_A), plan_struct->copy_size_A, cudaMemcpyHostToDevice, stream);
+        if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+        cerr = cudaMemcpyAsync(B_d, (void*)((intptr_t)B + plan_struct->data_offset_B), plan_struct->copy_size_B, cudaMemcpyHostToDevice, stream);
+        if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+        cerr = cudaMemcpyAsync(C_d, (void*)((intptr_t)C + plan_struct->data_offset_C), plan_struct->copy_size_C, cudaMemcpyHostToDevice, stream);
+        if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+        A_d = (void*)((intptr_t)A_d - plan_struct->data_offset_A);
+        B_d = (void*)((intptr_t)B_d - plan_struct->data_offset_B);
+        C_d = (void*)((intptr_t)C_d - plan_struct->data_offset_C);
+        D_d = (void*)((intptr_t)D_d - plan_struct->data_offset_D);
+        // The tensor descriptors were created with a 128-byte alignment requirement.
+        assert(uintptr_t(A_d) % 128 == 0);
+        assert(uintptr_t(B_d) % 128 == 0);
+        assert(uintptr_t(C_d) % 128 == 0);
+        assert(uintptr_t(D_d) % 128 == 0);
+    }
+
+    cutensorPlan_t* contraction_plan = plan_struct->contraction_plan;
+    uint64_t contraction_actual_workspace_size = 0;
+    err = cutensorPlanGetAttribute(*handle_struct->libhandle,
+                *contraction_plan,
+                CUTENSOR_PLAN_REQUIRED_WORKSPACE,
+                &contraction_actual_workspace_size,
+                sizeof(contraction_actual_workspace_size));
+    if (err != CUTENSOR_STATUS_SUCCESS) { release_owned(); return pack_error(0, err); }
+
+    // 128 MiB recommended minimum size https://docs.nvidia.com/cuda/cutensor/latest/api/cutensor.html#cutensorcontract
+    contraction_actual_workspace_size = std::max(contraction_actual_workspace_size, uint64_t(128 * 1024 * 1024));
+    void* contraction_work = nullptr;
+    cerr = device_alloc(&contraction_work, contraction_actual_workspace_size);
+    if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+    assert(uintptr_t(contraction_work) % 128 == 0);
+
+    // With a pending op_D the contraction writes to the scratch buffer E_d instead of D_d.
+    void* contraction_output = do_permutation ? E_d : D_d;
+    err = cutensorContract(*handle_struct->libhandle,
+                *contraction_plan,
+                alpha, A_d, B_d,
+                beta,  C_d, contraction_output,
+                contraction_work, contraction_actual_workspace_size, stream);
+    if (err != CUTENSOR_STATUS_SUCCESS) { release_owned(); return pack_error(0, err); }
+
+    if (do_permutation)
+    {
+        // Unit scaling factor in the data type of D. It lives on the stack, so the error
+        // return below cannot leak it.
+        const float one_f32 = 1.0f;
+        const double one_f64 = 1.0;
+        const std::complex<float> one_c32 = 1.0f;
+        const std::complex<double> one_c64 = 1.0;
+        const void* perm_scalar_ptr = nullptr; // stays null for any other data type
+        if (plan_struct->type_D == TAPP_F32)
+        {
+            perm_scalar_ptr = &one_f32;
+        }
+        else if (plan_struct->type_D == TAPP_F64)
+        {
+            perm_scalar_ptr = &one_f64;
+        }
+        else if (plan_struct->type_D == TAPP_C32)
+        {
+            perm_scalar_ptr = &one_c32;
+        }
+        else if (plan_struct->type_D == TAPP_C64)
+        {
+            perm_scalar_ptr = &one_c64;
+        }
+
+        err = cutensorPermute(*handle_struct->libhandle,
+                    *plan_struct->permutation_plan,
+                    perm_scalar_ptr,
+                    E_d,
+                    D_d,
+                    stream);
+        if (err != CUTENSOR_STATUS_SUCCESS) { release_owned(); return pack_error(0, err); }
+    }
+
+    if (!use_device_memory)
+    {
+        // Copy D back to the host one contiguous section at a time; the coordinates walk
+        // the section dimensions recorded in the plan.
+        const auto element_size = sizeof_datatype(plan_struct->type_D);
+        std::vector<int64_t> section_coordinates_D(plan_struct->sections_nmode_D);
+        for (size_t i = 0; i < plan_struct->sections_D; i++)
+        {
+            int64_t index = compute_index(section_coordinates_D.data(), plan_struct->sections_nmode_D, plan_struct->section_strides_D);
+            cerr = cudaMemcpyAsync((void*)((intptr_t)D + index * element_size),
+                (void*)((intptr_t)D_d + index * element_size),
+                plan_struct->section_size_D, cudaMemcpyDeviceToHost, stream);
+            if (cerr != cudaSuccess) { release_owned(); return pack_error(0, cerr); }
+            increment_coordinates(section_coordinates_D.data(), plan_struct->sections_nmode_D, plan_struct->section_extents_D);
+        }
+    }
+
+    // Normal exit: queue the frees on the stream, behind the work submitted above.
+    cerr = release_owned();
+    if (cerr != cudaSuccess) return pack_error(0, cerr);
+    return pack_error(0, err);
+}
+
+int64_t compute_index(const int64_t* coordinates, int nmode, const int64_t* strides)
+{
+    // Linear element offset of a coordinate tuple: the sum of coordinate * stride per mode.
+    int64_t offset = 0;
+    for (int mode = nmode - 1; mode >= 0; --mode)
+    {
+        offset += strides[mode] * coordinates[mode];
+    }
+    return offset;
+}
+
+void increment_coordinates(int64_t* coordinates, int nmode, const int64_t* extents)
+{
+    // Odometer-style step: bump mode 0 and carry into the next mode whenever one wraps to 0.
+    for (int mode = 0; mode < nmode; ++mode)
+    {
+        coordinates[mode] = (coordinates[mode] + 1) % extents[mode];
+        if (coordinates[mode] != 0)
+        {
+            // No wrap-around in this mode, so there is nothing to carry.
+            return;
+        }
+    }
+    // Falling out of the loop means every mode wrapped (or nmode <= 0): nothing more to do.
+}
+
+cutensorOperator_t translate_operator(TAPP_element_op op)
+{
+    // Maps a TAPP element-wise operation onto the matching cuTENSOR operator.
+    if (op == TAPP_CONJUGATE)
+    {
+        return CUTENSOR_OP_CONJ;
+    }
+    if (op == TAPP_IDENTITY)
+    {
+        return CUTENSOR_OP_IDENTITY;
+    }
+    // TODO: an unrecognised operation should probably be an error rather than
+    // silently falling back to the identity operator.
+    return CUTENSOR_OP_IDENTITY;
+}
diff --git a/cutensor_bindings/src/tensor.cpp b/cutensor_bindings/src/tensor.cpp
new file mode 100644
index 0000000..a316380
--- /dev/null
+++ b/cutensor_bindings/src/tensor.cpp
@@ -0,0 +1,106 @@
+#include "../include/tensor.h"
+
+TAPP_error TAPP_create_tensor_info(TAPP_tensor_info* info,
+                                   TAPP_handle handle,
+                                   TAPP_datatype type,
+                                   int nmode,
+                                   const int64_t* extents,
+                                   const int64_t* strides)
+{
+    // Creates the cuTENSOR descriptor for a tensor and records the layout data used to stage
+    // it: copy_size is the byte span between the lowest- and highest-addressed elements and
+    // data_offset is the byte offset (zero or negative) from element 0 to the lowest one.
+    // Returns 0 on success or the packed cuTENSOR status if descriptor creation fails.
+    struct tensor_info* tensor_info = new struct tensor_info;
+    tensor_info->desc = new cutensorTensorDescriptor_t;
+    struct handle* handle_struct = (struct handle*) handle;
+    const uint32_t kAlignment = 128;
+    cutensorStatus_t err = cutensorCreateTensorDescriptor(*handle_struct->libhandle,
+                tensor_info->desc,
+                nmode,
+                extents,
+                strides,
+                translate_datatype(type), kAlignment);
+    if (err != CUTENSOR_STATUS_SUCCESS)
+    {
+        delete tensor_info->desc;
+        delete tensor_info;
+        return pack_error(0, err);
+    }
+    size_t elements = 1;
+    tensor_info->copy_size = 1;
+    tensor_info->data_offset = 0;
+    tensor_info->extents = new int64_t[nmode];
+    tensor_info->strides = new int64_t[nmode];
+    for (int i = 0; i < nmode; ++i)
+    {
+        elements *= extents[i];
+        tensor_info->extents[i] = extents[i];
+        tensor_info->strides[i] = strides[i];
+        // Signed distance, in elements, between the first and last index of this mode.
+        const int64_t reach = (extents[i] - 1) * strides[i];
+        // The span grows by |reach| whatever the direction of the stride ...
+        tensor_info->copy_size += (reach < 0) ? -reach : reach;
+        // ... and a negative stride also moves the lowest address below element 0.
+        if (strides[i] < 0) tensor_info->data_offset += reach;
+    }
+    tensor_info->copy_size *= sizeof_datatype(type);
+    tensor_info->data_offset *= sizeof_datatype(type);
+    tensor_info->type = type;
+    tensor_info->elements = elements;
+    tensor_info->nmode = nmode;
+    *info = (TAPP_tensor_info) tensor_info;
+    return 0;
+}
+
+TAPP_error TAPP_destroy_tensor_info(TAPP_tensor_info info)
+{
+    // Destroys the cuTENSOR descriptor, then frees everything the tensor info owns.
+    // If the descriptor cannot be destroyed, nothing is freed and the status is returned.
+    struct tensor_info* doomed = (struct tensor_info*) info;
+    const cutensorStatus_t status = cutensorDestroyTensorDescriptor(*doomed->desc);
+    if (status != CUTENSOR_STATUS_SUCCESS) return pack_error(0, status);
+
+    delete[] doomed->strides;
+    delete[] doomed->extents;
+    delete doomed->desc;
+    delete doomed;
+    return 0;
+}
+
+int TAPP_get_nmodes(TAPP_tensor_info info)
+{
+    return reinterpret_cast<const struct tensor_info*>(info)->nmode; // mode count recorded at creation
+}
+
+TAPP_error TAPP_set_nmodes(TAPP_tensor_info info,
+                           int nmodes)
+{
+    return -1; // Not implemented: always returns -1. cuTENSOR does not support changing the number of modes after creation, so the descriptor would have to be recreated, which needs the handle.
+}
+
+void TAPP_get_extents(TAPP_tensor_info info,
+                      int64_t* extents)
+{
+    // Copies the stored extents, one per mode, into the caller-provided array.
+    const struct tensor_info* stored = (const struct tensor_info*) info;
+    memcpy(extents, stored->extents, stored->nmode * sizeof(int64_t));
+}
+
+TAPP_error TAPP_set_extents(TAPP_tensor_info info,
+                            const int64_t* extents)
+{
+    return -1; // Not implemented: always returns -1. As with the number of modes, changing the extents would require recreating the cuTENSOR descriptor, which needs the handle.
+}
+
+void TAPP_get_strides(TAPP_tensor_info info,
+                      int64_t* strides)
+{
+    // Copies the stored strides, one per mode, into the caller-provided array.
+    const struct tensor_info* stored = (const struct tensor_info*) info;
+    memcpy(strides, stored->strides, stored->nmode * sizeof(int64_t));
+}
+
+TAPP_error TAPP_set_strides(TAPP_tensor_info info,
+                            const int64_t* strides)
+{
+    return -1; // Not implemented: always returns -1. As with the number of modes, changing the strides would require recreating the cuTENSOR descriptor, which needs the handle.
+}
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..009b438
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,129 @@
+# ----------------------------------------------------------------------------
+# driver: example executable built from driver/driver.c and the shared test helpers
+
+add_executable(tapp-reference-driver)
+
+target_sources(
+  tapp-reference-driver
+  PRIVATE
+    driver/driver.c
+    ${PROJECT_SOURCE_DIR}/test/helpers.c
+    ${PROJECT_SOURCE_DIR}/test/helpers.h
+  )
+
+target_include_directories(
+  tapp-reference-driver
+  PRIVATE
+    ${PROJECT_SOURCE_DIR}/test  # directory that holds helpers.h
+  )
+
+target_link_libraries(
+  tapp-reference-driver
+  PRIVATE
+    tapp::reference
+  )
+
+add_test(  # run the driver as a test named after the target
+  NAME tapp-reference-driver
+  COMMAND $<TARGET_FILE:tapp-reference-driver>
+  )
+
+# ----------------------------------------------------------------------------
+# exercise: contraction (only built and tested when TAPP_BUILD_EXERCISE is set)
+
+if(TAPP_BUILD_EXERCISE)
+  add_executable(tapp-reference-exercise_contraction)
+
+  target_sources(
+    tapp-reference-exercise_contraction
+    PRIVATE
+      exercise_contraction/exercise_contraction.c
+      ${PROJECT_SOURCE_DIR}/test/helpers.c
+      ${PROJECT_SOURCE_DIR}/test/helpers.h
+    )
+
+  target_include_directories(
+    tapp-reference-exercise_contraction
+    PRIVATE
+      ${PROJECT_SOURCE_DIR}/test  # directory that holds helpers.h
+    )
+
+  target_link_libraries(
+    tapp-reference-exercise_contraction
+    PRIVATE
+      tapp::reference
+    )
+
+  add_test(  # run the exercise as a test named after the target
+    NAME tapp-reference-exercise_contraction
+    COMMAND $<TARGET_FILE:tapp-reference-exercise_contraction>
+    )
+endif()
+
+# ----------------------------------------------------------------------------
+# exercise: contraction answers (always built, unlike the exercise itself)
+
+add_executable(tapp-reference-exercise_contraction_answers)
+
+target_sources(
+  tapp-reference-exercise_contraction_answers
+  PRIVATE
+    exercise_contraction/answers/exercise_contraction_answers.c
+    ${PROJECT_SOURCE_DIR}/test/helpers.c
+    ${PROJECT_SOURCE_DIR}/test/helpers.h
+  )
+
+target_include_directories(
+  tapp-reference-exercise_contraction_answers
+  PRIVATE
+    ${PROJECT_SOURCE_DIR}/test  # directory that holds helpers.h
+  )
+
+target_link_libraries(
+  tapp-reference-exercise_contraction_answers
+  PRIVATE
+    tapp::reference
+  )
+
+add_test(  # run the answers program as a test named after the target
+  NAME tapp-reference-exercise_contraction_answers
+  COMMAND $<TARGET_FILE:tapp-reference-exercise_contraction_answers>
+  )
+
+# ----------------------------------------------------------------------------
+# exercise: tucker (shared library; no test is registered for it here)
+
+add_library(tapp-reference-exercise_tucker SHARED)
+
+target_sources(
+  tapp-reference-exercise_tucker
+  PUBLIC  # the header is listed as a PUBLIC source, the implementation as PRIVATE
+    exercise_tucker/tapp_tucker/exercise_tucker.h
+  PRIVATE
+    exercise_tucker/tapp_tucker/exercise_tucker.c
+  )
+
+target_link_libraries(
+  tapp-reference-exercise_tucker
+  PRIVATE
+    tapp::reference
+  )
+
+# ----------------------------------------------------------------------------
+# exercise: tucker answers (shared library; no test is registered for it here)
+
+add_library(tapp-reference-exercise_tucker_answers SHARED)
+
+target_sources(
+  tapp-reference-exercise_tucker_answers
+  PUBLIC  # the header is listed as a PUBLIC source, the implementation as PRIVATE
+    exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.h
+  PRIVATE
+    exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c
+  )
+
+target_link_libraries(
+  tapp-reference-exercise_tucker_answers
+  PRIVATE
+    tapp::reference
+  )
diff --git a/examples/README.md b/examples/README.md
index ae41198..6608ada 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,7 +9,7 @@ for cmake: (Unix commands)
     Run CMake from directory: "cmake .."
     Run make from directory: "make -j"
     All files are created in the build directory
-    For use of TBLIS (not needed for the exercise) add: "-DTAPP_REFERENCE_ENABLE_TBLIS=1" after "cmake .."
+    For use of TBLIS (not needed for the exercise) add: "-DTAPP_REFERENCE_USE_TBLIS=1" after "cmake .."
 With TBLIS a file called test++ will be compiled
 
 2. Exercise contraction (try writing a tensor contraction with tapp)
diff --git a/examples/driver/driver.c b/examples/driver/driver.c
index 035ff33..c64d8ef 100644
--- a/examples/driver/driver.c
+++ b/examples/driver/driver.c
@@ -12,12 +12,19 @@
 
 int main(int argc, char const *argv[])
 {
+    TAPP_handle handle; // Declare handle
+    TAPP_create_handle(&handle); // Create handle
+
     /*
      * The tensor product looks in a simplified way as follows: D <- a*A*B+b*C.
      * Where the lowercase letters are constants and uppercase are tensors.
      * The operation requires four tensors that all needs to be initialized.
      */
 
+    /*
+     * Decide how the calculation should be executed, which indices to contract, elemental operations and precision.
+     */
+
     // Initialize the structures of the tensors
 
     // Tensor A
@@ -30,34 +37,28 @@ int main(int argc, char const *argv[])
 
     TAPP_tensor_info info_A; // Declare the variable that holds the tensor structure
 
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A); // Assign the structure to the variable, including datatype
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A); // Assign the structure to the variable, including datatype
 
     // Tensor B
     int nmode_B = 4;
     int64_t extents_B[4] = {3, 2, 2, 3};
     int64_t strides_B[4] = {1, 3, 6, 12};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     // Tensor C
     int nmode_C = 3;
     int64_t extents_C[3] = {4, 2, 2};
     int64_t strides_C[3] = {1, 4, 8};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     // Output tensor D
     int nmode_D = 3;
     int64_t extents_D[3] = {4, 2, 2};
     int64_t strides_D[3] = {1, 4, 8};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    /*
-     * Decide who the calculation should be executed, which indices to contract, elemental operations and precision.
-     */
-
-    TAPP_handle handle; // Declare handle (not yet in use)
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
     // Decide elemental operations (conjugate available for complex datatypes)
     TAPP_element_op op_A = TAPP_IDENTITY; // Decide elemental operation for tensor A
@@ -181,6 +182,7 @@ int main(int argc, char const *argv[])
     TAPP_destroy_tensor_info(info_C);
     TAPP_destroy_tensor_info(info_D);
     TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
 
     return 0;
 }
\ No newline at end of file
diff --git a/examples/exercise_contraction/answers/exercise_contraction_answers.c b/examples/exercise_contraction/answers/exercise_contraction_answers.c
index 5063b1c..a1258bf 100644
--- a/examples/exercise_contraction/answers/exercise_contraction_answers.c
+++ b/examples/exercise_contraction/answers/exercise_contraction_answers.c
@@ -17,6 +17,10 @@
 
 int main(int argc, char const *argv[])
 {
+    // Declare handle
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+    
     /*
      * Create the tensor structures for tensor A, B, C and D.
      * Tensor A with 3 indices, with the extents 4, 3, 2, and the strides 1, 4, 12.
@@ -44,28 +48,28 @@ int main(int argc, char const *argv[])
      * Uncomment code.
      * Fill in: the tensor info object, datatype(float32), structure for tensor A: number of indices, extents, strides.
      */
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     // Tensor B
     int nmode_B = 3;
     int64_t extents_B[3] = {3, 2, 4};
     int64_t strides_B[3] = {1, 3, 6};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     // Tensor C
     int nmode_C = 2;
     int64_t extents_C[2] = {3, 3};
     int64_t strides_C[2] = {1, 3};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     // Tensor D
     int nmode_D = 2;
     int64_t extents_D[2] = {3, 3};
     int64_t strides_D[2] = {1, 3};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
 
     /*
@@ -78,9 +82,6 @@ int main(int argc, char const *argv[])
      *  The second index for A and the first index for B are free indices, in that order. 
      */
 
-    // Declare handle (no assignment)
-    TAPP_handle handle;
-
     // Initialize the precision
     TAPP_prectype prec = TAPP_DEFAULT_PREC; 
 
@@ -225,6 +226,7 @@ int main(int argc, char const *argv[])
     TAPP_destroy_tensor_info(info_C);
     TAPP_destroy_tensor_info(info_D);
     TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
 
     /*
      * Expected output:
diff --git a/examples/exercise_contraction/exercise_contraction.c b/examples/exercise_contraction/exercise_contraction.c
index 2ed5d6c..d913107 100644
--- a/examples/exercise_contraction/exercise_contraction.c
+++ b/examples/exercise_contraction/exercise_contraction.c
@@ -16,6 +16,10 @@
 
 int main(int argc, char const *argv[])
 {
+    // Declare handle
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+
     /*
      * Create the tensor structures for tensor A, B, C and D.
      * Tensor A with 3 indices, with the extents 4, 3, 2, and the strides 1, 4, 12.
@@ -41,30 +45,30 @@ int main(int argc, char const *argv[])
     /* 
      * TODO 1: Fill in the arguments for creating the tensor info.
      * Uncomment code.
-     * Fill in: the tensor info object, datatype(float32), structure for tensor A: number of indices, extents, strides.
+     * Fill in: the tensor info object, handle, datatype(float32), structure for tensor A: number of indices, extents, strides.
      */
-    //TAPP_create_tensor_info(, , , , );
+    //TAPP_create_tensor_info(, , , , , );
 
     // Tensor B
     int nmode_B = 3;
     int64_t extents_B[3] = {3, 2, 4};
     int64_t strides_B[3] = {1, 3, 6};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     // Tensor C
     int nmode_C = 2;
     int64_t extents_C[2] = {3, 3};
     int64_t strides_C[2] = {1, 3};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     // Tensor D
     int nmode_D = 2;
     int64_t extents_D[2] = {3, 3};
     int64_t strides_D[2] = {1, 3};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
 
     /*
@@ -77,9 +81,6 @@ int main(int argc, char const *argv[])
      *  The second index for A and the first index for B are free indices, in that order. 
      */
 
-    // Declare handle (no assignment)
-    TAPP_handle handle;
-
     // Initialize the precision
     TAPP_prectype prec = TAPP_DEFAULT_PREC; 
 
@@ -223,6 +224,7 @@ int main(int argc, char const *argv[])
     TAPP_destroy_tensor_info(info_C);
     TAPP_destroy_tensor_info(info_D);
     TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
 
     /*
      * Expected output:
diff --git a/examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c b/examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c
index 99f18d2..2221ddd 100644
--- a/examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c
+++ b/examples/exercise_tucker/tapp_tucker/answers/exercise_tucker_answers.c
@@ -12,6 +12,9 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str
                                    int nmode_D, int64_t* extents_D, int64_t* strides_D, void* D,
                                    int64_t* idx_A, int64_t* idx_B, int64_t* idx_D)
 {
+    TAPP_handle handle; // Declare handle
+    TAPP_create_handle(&handle); // Create handle
+
     /*
      * The tensor product looks in a simplified way as follows: D <- a*A*B+b*C.
      * Where the lowercase letters are constants and uppercase are tensors.
@@ -29,26 +32,24 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str
      * Uncomment function call
      * Add: nmode_A, extents_A, and strides_A
      */
-    TAPP_create_tensor_info(&info_A, TAPP_F64, nmode_A, extents_A, strides_A); // Assign the structure to the variable, including datatype
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F64, nmode_A, extents_A, strides_A); // Assign the structure to the variable, including datatype
 
     // Tensor B
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F64, nmode_B, extents_B, strides_B);
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F64, nmode_B, extents_B, strides_B);
 
     // Tensor C
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F64, nmode_D, extents_D, strides_D);
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F64, nmode_D, extents_D, strides_D);
 
     // Output tensor D
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F64, nmode_D, extents_D, strides_D);
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F64, nmode_D, extents_D, strides_D);
 
     /*
-     * Decide who the calculation should be executed, which indices to contract, elemental operations and precision.
+     * Decide how the calculation should be executed, which indices to contract, elemental operations and precision.
      */
 
-    TAPP_handle handle; // Declare handle (not yet in use)
-
     // Decide elemental operations (conjugate available for complex datatypes)
     TAPP_element_op op_A = TAPP_IDENTITY; // Decide elemental operation for tensor A
     TAPP_element_op op_B = TAPP_IDENTITY; // Decide elemental operation for tensor B
@@ -108,7 +109,7 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str
         int message_len = TAPP_explain_error(error, 0, NULL); // Get size of error message
         char *message_buff = malloc((message_len + 1) * sizeof(char)); // Allocate buffer for message, including null terminator
         TAPP_explain_error(error, message_len + 1, message_buff); // Fetch error message
-        printf(message_buff); // Print message
+        printf("%s", message_buff); // Print message
         free(message_buff); // Free buffer
         printf("\n");
     }
@@ -122,6 +123,7 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str
     TAPP_destroy_tensor_info(info_B);
     TAPP_destroy_tensor_info(info_D);
     TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
 
     return D;
 }
\ No newline at end of file
diff --git a/examples/exercise_tucker/tapp_tucker/exercise_tucker.c b/examples/exercise_tucker/tapp_tucker/exercise_tucker.c
index 9c0c86e..a67ea5d 100644
--- a/examples/exercise_tucker/tapp_tucker/exercise_tucker.c
+++ b/examples/exercise_tucker/tapp_tucker/exercise_tucker.c
@@ -12,6 +12,9 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str
                                    int nmode_D, int64_t* extents_D, int64_t* strides_D, void* D,
                                    int64_t* idx_A, int64_t* idx_B, int64_t* idx_D)
 {
+    TAPP_handle handle; // Declare handle
+    TAPP_create_handle(&handle); // Create handle
+
     /*
      * The tensor product looks in a simplified way as follows: D <- a*A*B+b*C.
      * Where the lowercase letters are constants and uppercase are tensors.
@@ -29,26 +32,24 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str
      * Uncomment function call
      * Add: nmode_A, extents_A, and strides_A
      */
-    //TAPP_create_tensor_info(&info_A, TAPP_F64, , , ); // Assign the structure to the variable, including datatype
+    //TAPP_create_tensor_info(&info_A, handle, TAPP_F64, , , ); // Assign the structure to the variable, including datatype
 
     // Tensor B
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F64, nmode_B, extents_B, strides_B);
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F64, nmode_B, extents_B, strides_B);
 
     // Tensor C
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F64, nmode_D, extents_D, strides_D);
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F64, nmode_D, extents_D, strides_D);
 
     // Output tensor D
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F64, nmode_D, extents_D, strides_D);
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F64, nmode_D, extents_D, strides_D);
 
     /*
-     * Decide who the calculation should be executed, which indices to contract, elemental operations and precision.
+     * Decide how the calculation should be executed, which indices to contract, elemental operations and precision.
      */
 
-    TAPP_handle handle; // Declare handle (not yet in use)
-
     // Decide elemental operations (conjugate available for complex datatypes)
     TAPP_element_op op_A = TAPP_IDENTITY; // Decide elemental operation for tensor A
     TAPP_element_op op_B = TAPP_IDENTITY; // Decide elemental operation for tensor B
@@ -108,7 +109,7 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str
         int message_len = TAPP_explain_error(error, 0, NULL); // Get size of error message
         char *message_buff = malloc((message_len + 1) * sizeof(char)); // Allocate buffer for message, including null terminator
         TAPP_explain_error(error, message_len + 1, message_buff); // Fetch error message
-        printf(message_buff); // Print message
+        printf("%s", message_buff); // Print message
         free(message_buff); // Free buffer
         printf("\n");
     }
@@ -122,6 +123,7 @@ void* tucker_to_tensor_contraction(int nmode_A, int64_t* extents_A, int64_t* str
     TAPP_destroy_tensor_info(info_B);
     TAPP_destroy_tensor_info(info_D);
     TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
 
     return D;
 }
\ No newline at end of file
diff --git a/reference_implementation/CMakeLists.txt b/reference_implementation/CMakeLists.txt
index 311e44b..a9c13a9 100644
--- a/reference_implementation/CMakeLists.txt
+++ b/reference_implementation/CMakeLists.txt
@@ -31,7 +31,7 @@ if (APPLE AND CMAKE_C_COMPILER_ID MATCHES "^(Clang|AppleClang)$")
   target_link_options(tapp-reference PRIVATE "-undefined;dynamic_lookup")
 endif()
 
-target_link_libraries(tapp-reference PUBLIC tapp-api)
+target_link_libraries(tapp-reference PUBLIC tapp::api)
 
 option(TAPP_BUILD_EXERCISE "Build contraction exercise with TODOs in it." OFF)
 
@@ -46,7 +46,7 @@ if(TAPP_REFERENCE_ENABLE_BF16)
   target_compile_definitions(tapp-reference PRIVATE TAPP_REFERENCE_ENABLE_BF16=1)
 endif()
 
-if(TAPP_REFERENCE_ENABLE_TBLIS)
+if(TAPP_REFERENCE_USE_TBLIS)
 
   set(TBLIS_INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/tblis)
 
@@ -63,7 +63,7 @@ if(TAPP_REFERENCE_ENABLE_TBLIS)
 
   FetchContent_MakeAvailable(tblis)
 
-  target_compile_definitions(tapp-reference PRIVATE TAPP_REFERENCE_ENABLE_TBLIS=1)
+  target_compile_definitions(tapp-reference PRIVATE TAPP_REFERENCE_USE_TBLIS=1)
 
   target_sources(
   tapp-reference
diff --git a/reference_implementation/src/executor.c b/reference_implementation/src/executor.c
index f352ed2..818602a 100644
--- a/reference_implementation/src/executor.c
+++ b/reference_implementation/src/executor.c
@@ -9,7 +9,7 @@
 TAPP_error TAPP_create_executor(TAPP_executor* exec) {
     *exec = (TAPP_executor)malloc(sizeof(int));
     int ex = 1; // the bruteforce reference executor
-#ifdef TAPP_REFERENCE_ENABLE_TBLIS
+#ifdef TAPP_REFERENCE_USE_TBLIS
     // ex = 2; // TBLIS used as executor, use 12 for debug mode
 #endif
     *((int*)(*exec)) = ex;
diff --git a/reference_implementation/src/product.c b/reference_implementation/src/product.c
index 1624839..276ac91 100644
--- a/reference_implementation/src/product.c
+++ b/reference_implementation/src/product.c
@@ -8,7 +8,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#ifdef TAPP_REFERENCE_ENABLE_TBLIS
+#ifdef TAPP_REFERENCE_USE_TBLIS
 #include "tblis_bind.h"
 #endif
 
@@ -251,7 +251,7 @@ TAPP_error TAPP_execute_product(TAPP_tensor_product plan,
     if((*exec_int_ptr) == 2 || (*exec_int_ptr) == 12 ) { // 1 = bruteforce, 2 = tblis, 12 = tblis + bruteforce check
       // if((*exec_int_ptr) == 2) printf("tapp used2 \n");
 
-#ifdef TAPP_REFERENCE_ENABLE_TBLIS
+#ifdef TAPP_REFERENCE_USE_TBLIS
       bind_tblis_execute_product(nmode_A, extents_A, strides_A, A, op_A, idx_A,
                        nmode_B, extents_B, strides_B, B, op_B, idx_B,
                        nmode_C, extents_C, strides_C, C, op_C, idx_D,
@@ -423,7 +423,7 @@ TAPP_error TAPP_execute_product(TAPP_tensor_product plan,
 
     bool comp_ = true;
     if((*exec_int_ptr) == 12 ) { // 1 = bruteforce, 2 = tblis, 12 = tblis + bruteforce check
-#ifdef TAPP_REFERENCE_ENABLE_TBLIS
+#ifdef TAPP_REFERENCE_USE_TBLIS
       comp_ = compare_tensors_(D, E_, (int64_t)size_D, type_D);
 #endif
       if(!comp_){
diff --git a/reference_implementation/src/status.c b/reference_implementation/src/status.c
new file mode 100644
index 0000000..cc1cf79
--- /dev/null
+++ b/reference_implementation/src/status.c
@@ -0,0 +1,10 @@
+/*
+ * Ed Valeev
+ */
+#include "ref_impl.h"
+#include <stdlib.h>
+// Destroys a TAPP_status: this implementation ignores `status`, releases nothing and always returns 0.
+TAPP_error TAPP_destroy_status(TAPP_status status) {
+    return 0;
+}
+
diff --git a/reference_implementation/src/tensor.c b/reference_implementation/src/tensor.c
index 56e8234..c55c208 100644
--- a/reference_implementation/src/tensor.c
+++ b/reference_implementation/src/tensor.c
@@ -9,6 +9,7 @@
 #include <string.h>
 
 TAPP_error TAPP_create_tensor_info(TAPP_tensor_info* info,
+                                   TAPP_handle handle,
                                    TAPP_datatype type,
                                    int nmode,
                                    const int64_t* extents,
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..4408300
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,155 @@
+# ----------------------------------------------------------------------------
+# TBLIS test
+
+if(TAPP_REFERENCE_USE_TBLIS)
+  add_executable(tapp-reference-test)
+  # Test sources; the header is listed next to test.cpp.
+  target_sources(
+    tapp-reference-test
+    PRIVATE
+      test.cpp
+      test.h
+    )
+  # The test links the reference implementation and the tblis-static target.
+  target_link_libraries(
+    tapp-reference-test
+    PRIVATE
+      tapp::reference
+      tblis-static
+    )
+  # Name/value pairs: set_property(... PROPERTY) would fold all six tokens into CXX_STANDARD as one list.
+  set_target_properties(
+    tapp-reference-test
+    PROPERTIES
+      CXX_STANDARD 20
+      CXX_STANDARD_REQUIRED YES
+      CXX_EXTENSIONS NO
+  )
+  # Register the built executable as a CTest test of the same name.
+  add_test(
+    NAME tapp-reference-test
+    COMMAND $<TARGET_FILE:tapp-reference-test>
+    )
+endif()
+
+# ----------------------------------------------------------------------------
+# demo
+
+add_executable(tapp-reference-demo)
+# Demo sources plus the shared helpers (helpers.c/helpers.h).
+target_sources(
+  tapp-reference-demo
+  PRIVATE
+    demo.c
+    helpers.c
+    helpers.h
+  )
+# The demo links directly against the reference implementation target.
+target_link_libraries(
+  tapp-reference-demo
+  PRIVATE
+    tapp::reference
+  )
+# Register the built executable as a CTest test of the same name.
+add_test(
+  NAME tapp-reference-demo
+  COMMAND $<TARGET_FILE:tapp-reference-demo>
+  )
+
+# ----------------------------------------------------------------------------
+# cutensor specific code
+
+if (TAPP_CUTENSOR)
+  # ----------------------------------------------------------------------------
+  # cutensor demo
+
+  add_executable(tapp-cutensor-demo)
+  # C++ demo source plus the shared C helpers.
+  target_sources(
+    tapp-cutensor-demo
+    PRIVATE
+      cutensor_demo.cpp
+      helpers.c
+      helpers.h
+  )
+  # Links the tapp::cutensor target and the CUDA runtime (CUDA::cudart).
+  target_link_libraries(
+    tapp-cutensor-demo
+    PRIVATE
+      tapp::cutensor
+      CUDA::cudart
+  )
+  # Adds this directory to the include path of the demo.
+  target_include_directories(
+    tapp-cutensor-demo
+    PRIVATE
+      ${CMAKE_CURRENT_SOURCE_DIR}
+  )
+  # Register the built executable as a CTest test of the same name.
+  add_test(
+    NAME tapp-cutensor-demo
+    COMMAND $<TARGET_FILE:tapp-cutensor-demo>
+  )
+
+  # ----------------------------------------------------------------------------
+  # demo using dynamic library
+  # NOTE(review): this target sits inside if (TAPP_CUTENSOR), so it is only built when that option is on - confirm intended.
+  add_executable(tapp-reference-demo-dynamic)
+  # Same sources as tapp-reference-demo, compiled with TAPP_DYNAMIC_LAUNCH defined.
+  target_compile_definitions(
+    tapp-reference-demo-dynamic
+    PRIVATE
+      TAPP_DYNAMIC_LAUNCH
+  )
+
+  target_sources(
+    tapp-reference-demo-dynamic
+    PRIVATE
+      demo.c
+      helpers.c
+      helpers.h
+  )
+  # Links only the API target and CMAKE_DL_LIBS; presumably the implementation is loaded at run time - TODO confirm.
+  target_link_libraries(
+    tapp-reference-demo-dynamic
+    PRIVATE
+      tapp::api
+      ${CMAKE_DL_LIBS}
+  )
+
+  add_test(
+    NAME tapp-reference-demo-dynamic
+    COMMAND $<TARGET_FILE:tapp-reference-demo-dynamic>
+  )
+
+  # ----------------------------------------------------------------------------
+  # test using dynamic library
+  # NOTE(review): no C++ standard properties are set here and there is no TAPP_REFERENCE_USE_TBLIS guard, unlike tapp-reference-test - confirm.
+  add_executable(tapp-reference-test-dynamic)
+  # Same sources as tapp-reference-test, compiled with TAPP_DYNAMIC_LAUNCH defined.
+  target_compile_definitions(
+    tapp-reference-test-dynamic
+    PRIVATE
+      TAPP_DYNAMIC_LAUNCH
+  )
+
+  target_sources(
+    tapp-reference-test-dynamic
+    PRIVATE
+      test.cpp
+      test.h
+  )
+
+  target_link_libraries(
+    tapp-reference-test-dynamic
+    PRIVATE
+      tapp::api
+      ${CMAKE_DL_LIBS}
+  )
+
+  add_test(
+    NAME tapp-reference-test-dynamic
+    COMMAND $<TARGET_FILE:tapp-reference-test-dynamic>
+  )
+
+endif()
diff --git a/test/cutensor_demo.cpp b/test/cutensor_demo.cpp
new file mode 100644
index 0000000..87d3ab8
--- /dev/null
+++ b/test/cutensor_demo.cpp
@@ -0,0 +1,1518 @@
+/*
+ * Niklas Hörnblad
+ * Paolo Bientinesi
+ * Umeå University - December 2025
+ */
+
+#include <tapp.h>
+
+#include <cuda_runtime.h>
+
+#include <cstdlib>
+#include <cstdio>
+#include <complex>
+#include <cassert>
+
+extern "C" {
+    #include "helpers.h"
+}
+
+void contraction();
+void hadamard();
+void complex_num();
+void conjugate();
+void zero_dim();
+void one_ext_contracted();
+void one_ext_transfered();
+void chained_diff_op();
+void chained_same_op();
+void negative_str();
+void subtensors();
+void print_tensor_c_cpp(int nmode, const int64_t *extents, const int64_t *strides, const std::complex<float> *data);
+// Runs each demo case in turn, printing a heading before it; the negative_str() case is left commented out.
+int main(int argc, char const *argv[])
+{
+    printf("Contraction: \n");
+    contraction();
+    printf("Hadamard: \n");
+    hadamard();
+    printf("Complex: \n");
+    complex_num();
+    printf("Conjugate: \n"); // Incorrect result with CuTensor, CuTensor applies opC in another way
+    conjugate();
+    printf("Zero dim: \n");
+    zero_dim();
+    printf("One ext contracted: \n");
+    one_ext_contracted();
+    printf("One ext transfered: \n");
+    one_ext_transfered();
+    printf("Chained diff op: \n");
+    chained_diff_op();
+    printf("Chained same op: \n");
+    chained_same_op();
+    /*printf("Negative str: \n"); // Doesn't work with CuTensor, does not allow negative strides
+    negative_str();*/
+    printf("Subtensors: \n");
+    subtensors();
+    return 0;
+}
+// Demo: runs one F32 TAPP product on device buffers, then prints the status, its explanation and the result D.
+void contraction()
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+    // Tensor infos: A is 4x3x3, B is 3x2x2x3, C and D are 4x2x2 (all F32, unit stride in the first mode).
+    int nmode_A = 3;
+    int64_t extents_A[3] = {4, 3, 3};
+    int64_t strides_A[3] = {1, 4, 12};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
+
+    int nmode_B = 4;
+    int64_t extents_B[4] = {3, 2, 2, 3};
+    int64_t strides_B[4] = {1, 3, 6, 12};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 3;
+    int64_t extents_C[3] = {4, 2, 2};
+    int64_t strides_C[3] = {1, 4, 8};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 3;
+    int64_t extents_D[3] = {4, 2, 2};
+    int64_t strides_D[3] = {1, 4, 8};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
+    // Plan: labels 'b' and 'c' occur in A and B but not in C/D; 'a', 'd', 'e' label the modes of C and D.
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[3] = {'a', 'b', 'c'};
+    int64_t idx_B[4] = {'c', 'd', 'e', 'b'};
+    int64_t idx_C[3] = {'a', 'd', 'e'};
+    int64_t idx_D[3] = {'a', 'd', 'e'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    // int exec_id = 1;
+    // exec = (intptr_t)&exec_id;
+    TAPP_status status;
+    // Host-side scalars and tensor data (alpha = 1, beta = 0).
+    float alpha = 1;
+
+    float A[36] = {
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1};
+
+    float B[36] = {
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6,
+
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6,
+
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6};
+
+    float beta = 0;
+
+    float C[16] = {
+        2, 4, 6, 8,
+        2, 4, 6, 8,
+
+        2, 4, 6, 8,
+        2, 4, 6, 8};
+
+    float D[16] = {
+        1, 2, 3, 4,
+        5, 6, 7, 8,
+
+        1, 2, 3, 4,
+        5, 6, 7, 8};
+    // Start from nullptr so the asserts and guarded cudaFree calls never read an indeterminate pointer if cudaMalloc fails.
+    void *A_d = nullptr, *B_d = nullptr, *C_d = nullptr, *D_d = nullptr; // Device pointers
+    cudaMalloc((void**)&A_d, 36 * sizeof(float));
+    cudaMalloc((void**)&B_d, 36 * sizeof(float));
+    cudaMalloc((void**)&C_d, 16 * sizeof(float));
+    cudaMalloc((void**)&D_d, 16 * sizeof(float));
+
+    cudaMemcpy(A_d, (void*)A, 36 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B, 36 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice);
+    // 128-byte alignment check (compiled out under NDEBUG); presumably a cuTENSOR requirement - TODO confirm.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+
+    TAPP_error error = TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+    printf(TAPP_check_success(error) ? "Success\n" : "Fail\n");
+    int message_len = TAPP_explain_error(error, 0, NULL);
+    char *message_buff = (char*)malloc((message_len + 1) * sizeof(char));
+    TAPP_explain_error(error, message_len + 1, message_buff);
+    printf("%s", message_buff);
+    free(message_buff);
+    // Copy the result back to the host and print it.
+    cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    print_tensor_s(nmode_D, extents_D, strides_D, D);
+    // Release device memory first, then the TAPP objects, with the handle last.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+// Demo: runs one F32 TAPP product in which A, B, C and D all carry the same index labels, then prints D.
+void hadamard()
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+    // All four tensor infos are 4x4 F32 with strides {1, 4}.
+    int nmode_A = 2;
+    int64_t extents_A[2] = {4, 4};
+    int64_t strides_A[2] = {1, 4};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
+
+    int nmode_B = 2;
+    int64_t extents_B[2] = {4, 4};
+    int64_t strides_B[2] = {1, 4};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 2;
+    int64_t extents_C[2] = {4, 4};
+    int64_t strides_C[2] = {1, 4};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 2;
+    int64_t extents_D[2] = {4, 4};
+    int64_t strides_D[2] = {1, 4};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
+    // Plan: every tensor uses the labels 'a', 'b', so no label is missing from D.
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[2] = {'a', 'b'};
+    int64_t idx_B[2] = {'a', 'b'};
+    int64_t idx_C[2] = {'a', 'b'};
+    int64_t idx_D[2] = {'a', 'b'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+    // Host-side scalars and tensor data (alpha = 3, beta = 2).
+    float alpha = 3;
+
+    float A[16] = {
+        1, 2, 3, 4,
+        1, 2, 3, 4,
+        1, 2, 3, 4,
+        1, 2, 3, 4};
+
+    float B[16] = {
+        1, 1, 1, 1,
+        2, 2, 2, 2,
+        3, 3, 3, 3,
+        4, 4, 4, 4};
+
+    float beta = 2;
+
+    float C[16] = {
+        1, 2, 1, 2,
+        1, 2, 1, 2,
+        1, 2, 1, 2,
+        1, 2, 1, 2};
+
+    float D[16] = {
+        1,
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+        8,
+        9,
+        10,
+        11,
+        12,
+        13,
+        14,
+        15,
+        16,
+    };
+    // Start from nullptr so the asserts and guarded cudaFree calls never read an indeterminate pointer if cudaMalloc fails.
+    void *A_d = nullptr, *B_d = nullptr, *C_d = nullptr, *D_d = nullptr; // Device pointers
+    cudaMalloc((void**)&A_d, 16 * sizeof(float));
+    cudaMalloc((void**)&B_d, 16 * sizeof(float));
+    cudaMalloc((void**)&C_d, 16 * sizeof(float));
+    cudaMalloc((void**)&D_d, 16 * sizeof(float));
+
+    cudaMemcpy(A_d, (void*)A, 16 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B, 16 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice);
+    // 128-byte alignment check (compiled out under NDEBUG); presumably a cuTENSOR requirement - TODO confirm.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+    // The TAPP_error returned by the execute call is not inspected in this demo.
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+    // Copy the result back to the host and print it.
+    cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    print_tensor_s(nmode_D, extents_D, strides_D, D);
+    // Release device memory first, then the TAPP objects, with the handle last.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+// Demo: runs one TAPP product on 3x3 C32 (std::complex<float>) tensors held in device memory, then prints D.
+void complex_num()
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+    // All four tensor infos are 3x3 C32 with strides {1, 3}.
+    int nmode_A = 2;
+    int64_t extents_A[2] = {3, 3};
+    int64_t strides_A[2] = {1, 3};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_C32, nmode_A, extents_A, strides_A);
+
+    int nmode_B = 2;
+    int64_t extents_B[2] = {3, 3};
+    int64_t strides_B[2] = {1, 3};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_C32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 2;
+    int64_t extents_C[2] = {3, 3};
+    int64_t strides_C[2] = {1, 3};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_C32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 2;
+    int64_t extents_D[2] = {3, 3};
+    int64_t strides_D[2] = {1, 3};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_C32, nmode_D, extents_D, strides_D);
+    // Plan: label 'b' occurs in A and B but not in C/D; 'a' and 'c' label the modes of C and D. No conjugation.
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[2] = {'a', 'b'};
+    int64_t idx_B[2] = {'b', 'c'};
+    int64_t idx_C[2] = {'a', 'c'};
+    int64_t idx_D[2] = {'a', 'c'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+    // Host-side scalars and tensor data (alpha = 1, beta = i).
+    std::complex<float> alpha = 1;
+
+    std::complex<float> A[9] = {
+        {1, 1}, {3, 2}, {5, 3},
+        {1, 1}, {3, 2}, {5, 3},
+        {1, 1}, {3, 2}, {5, 3}};
+
+    std::complex<float> B[9] = {
+        {1, 1}, {1, 1}, {1, 1},
+        {2, 2}, {2, 2}, {2, 2},
+        {3, 3}, {3, 3}, {3, 3}};
+
+    std::complex<float> beta = {0, 1};
+
+    std::complex<float> C[9] = {
+        {1, 2}, {2, 1}, {3, 1},
+        {1, 2}, {2, 1}, {3, 1},
+        {1, 2}, {2, 1}, {3, 1}};
+
+    std::complex<float> D[9] = {
+        {1, 1}, {2, 2}, {3, 3},
+        {4, 4}, {5, 5}, {6, 6},
+        {7, 7}, {8, 8}, {9, 2}};
+    // Start from nullptr so the asserts and guarded cudaFree calls never read an indeterminate pointer if cudaMalloc fails.
+    void *A_d = nullptr, *B_d = nullptr, *C_d = nullptr, *D_d = nullptr; // Device pointers
+    cudaMalloc((void**)&A_d, 9 * sizeof(std::complex<float>));
+    cudaMalloc((void**)&B_d, 9 * sizeof(std::complex<float>));
+    cudaMalloc((void**)&C_d, 9 * sizeof(std::complex<float>));
+    cudaMalloc((void**)&D_d, 9 * sizeof(std::complex<float>));
+
+    cudaMemcpy(A_d, (void*)A, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice);
+    // 128-byte alignment check (compiled out under NDEBUG); presumably a cuTENSOR requirement - TODO confirm.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+    // The TAPP_error returned by the execute call is not inspected in this demo.
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+    // Copy the result back to the host and print it.
+    cudaMemcpy((void*)D, (void*)D_d, 9 * sizeof(std::complex<float>), cudaMemcpyDeviceToHost);
+
+    print_tensor_c_cpp(nmode_D, extents_D, strides_D, D);
+    // Release device memory first, then the TAPP objects, with the handle last.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+// Demo: same 3x3 C32 setup as complex_num(), but with TAPP_CONJUGATE requested for B and C; prints D.
+void conjugate()
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+    // All four tensor infos are 3x3 C32 with strides {1, 3}.
+    int nmode_A = 2;
+    int64_t extents_A[2] = {3, 3};
+    int64_t strides_A[2] = {1, 3};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_C32, nmode_A, extents_A, strides_A);
+
+    int nmode_B = 2;
+    int64_t extents_B[2] = {3, 3};
+    int64_t strides_B[2] = {1, 3};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_C32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 2;
+    int64_t extents_C[2] = {3, 3};
+    int64_t strides_C[2] = {1, 3};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_C32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 2;
+    int64_t extents_D[2] = {3, 3};
+    int64_t strides_D[2] = {1, 3};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_C32, nmode_D, extents_D, strides_D);
+    // Plan: label 'b' occurs in A and B but not in C/D; op_B and op_C are TAPP_CONJUGATE, op_A and op_D identity.
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_CONJUGATE;
+    TAPP_element_op op_C = TAPP_CONJUGATE;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[2] = {'a', 'b'};
+    int64_t idx_B[2] = {'b', 'c'};
+    int64_t idx_C[2] = {'a', 'c'};
+    int64_t idx_D[2] = {'a', 'c'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+    // Host-side scalars and tensor data (alpha = 1, beta = i).
+    std::complex<float> alpha = 1;
+
+    std::complex<float> A[9] = {
+        {1, 1}, {3, 2}, {5, 3},
+        {1, 1}, {3, 2}, {5, 3},
+        {1, 1}, {3, 2}, {5, 3}};
+
+    std::complex<float> B[9] = {
+        {1, 1}, {1, 1}, {1, 1},
+        {2, 2}, {2, 2}, {2, 2},
+        {3, 3}, {3, 3}, {3, 3}};
+
+    std::complex<float> beta = {0, 1};
+
+    std::complex<float> C[9] = {
+        {1, 2}, {2, 1}, {3, 1},
+        {1, 2}, {2, 1}, {3, 1},
+        {1, 2}, {2, 1}, {3, 1}};
+
+    std::complex<float> D[9] = {
+        {1, 1}, {2, 2}, {3, 3},
+        {4, 4}, {5, 5}, {6, 6},
+        {7, 7}, {8, 8}, {9, 2}};
+    // Start from nullptr so the asserts and guarded cudaFree calls never read an indeterminate pointer if cudaMalloc fails.
+    void *A_d = nullptr, *B_d = nullptr, *C_d = nullptr, *D_d = nullptr; // Device pointers
+    cudaMalloc((void**)&A_d, 9 * sizeof(std::complex<float>));
+    cudaMalloc((void**)&B_d, 9 * sizeof(std::complex<float>));
+    cudaMalloc((void**)&C_d, 9 * sizeof(std::complex<float>));
+    cudaMalloc((void**)&D_d, 9 * sizeof(std::complex<float>));
+
+    cudaMemcpy(A_d, (void*)A, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 9 * sizeof(std::complex<float>), cudaMemcpyHostToDevice);
+    // 128-byte alignment check (compiled out under NDEBUG); presumably a cuTENSOR requirement - TODO confirm.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+    // The TAPP_error returned by the execute call is not inspected in this demo.
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+    // Copy the result back to the host and print it.
+    cudaMemcpy((void*)D, (void*)D_d, 9 * sizeof(std::complex<float>), cudaMemcpyDeviceToHost);
+
+    print_tensor_c_cpp(nmode_D, extents_D, strides_D, D);
+    // Release device memory first, then the TAPP objects, with the handle last.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+
+void zero_dim() // Demo: product of a zero-mode tensor A (one element) with a 3x3 tensor B, run on device buffers.
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+    // A has zero modes, so its extent, stride and index arrays are all empty.
+    int nmode_A = 0;
+    int64_t extents_A[0] = {};
+    int64_t strides_A[0] = {};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
+
+    int nmode_B = 2;
+    int64_t extents_B[2] = {3, 3};
+    int64_t strides_B[2] = {1, 3};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 2;
+    int64_t extents_C[2] = {3, 3};
+    int64_t strides_C[2] = {1, 3};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 2;
+    int64_t extents_D[2] = {3, 3};
+    int64_t strides_D[2] = {1, 3};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
+
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[0] = {};
+    int64_t idx_B[2] = {'a', 'b'};
+    int64_t idx_C[2] = {'a', 'b'};
+    int64_t idx_D[2] = {'a', 'b'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+
+    float alpha = 1;
+
+    float A[1] = {
+        5};
+
+    float B[9] = {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9};
+
+    float beta = 0;
+
+    float C[9] = {
+        1, 1, 1,
+        1, 1, 1,
+        1, 1, 1};
+
+    float D[9] = {
+        2, 2, 2,
+        2, 2, 2,
+        2, 2, 2};
+    // Upload A, B and C to device buffers; D_d is allocated but not initialised from host D.
+    void *A_d, *B_d, *C_d, *D_d; // Device pointers
+    cudaMalloc((void**)&A_d, 1 * sizeof(float));
+    cudaMalloc((void**)&B_d, 9 * sizeof(float));
+    cudaMalloc((void**)&C_d, 9 * sizeof(float));
+    cudaMalloc((void**)&D_d, 9 * sizeof(float));
+
+    cudaMemcpy(A_d, (void*)A, 1 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B, 9 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 9 * sizeof(float), cudaMemcpyHostToDevice);
+    // Check that every device pointer is 128-byte aligned.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+    // Copy the result back into host D before printing it.
+    cudaMemcpy((void*)D, (void*)D_d, 9 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    print_tensor_s(nmode_D, extents_D, strides_D, D);
+    // Release the device buffers first, then the TAPP objects.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+
+void one_ext_contracted() // Demo: product in which an extent-1 index ('b') is shared by A and B but absent from C and D.
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+    // Mode 1 of A has extent 1; it is labelled 'b' in idx_A below.
+    int nmode_A = 4;
+    int64_t extents_A[4] = {4, 1, 3, 3};
+    int64_t strides_A[4] = {1, 4, 4, 12};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
+    // Mode 2 of B has extent 1; it is labelled 'b' in idx_B below.
+    int nmode_B = 5;
+    int64_t extents_B[5] = {3, 2, 1, 2, 3};
+    int64_t strides_B[5] = {1, 3, 6, 6, 12};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 3;
+    int64_t extents_C[3] = {4, 2, 2};
+    int64_t strides_C[3] = {1, 4, 8};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 3;
+    int64_t extents_D[3] = {4, 2, 2};
+    int64_t strides_D[3] = {1, 4, 8};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
+
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[4] = {'a', 'b', 'c', 'd'};
+    int64_t idx_B[5] = {'d', 'e', 'b', 'f', 'c'};
+    int64_t idx_C[3] = {'a', 'e', 'f'};
+    int64_t idx_D[3] = {'a', 'e', 'f'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+
+    float alpha = 1;
+
+    float A[36] = {
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1};
+
+    float B[36] = {
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6,
+
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6,
+
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6};
+
+    float beta = 0;
+
+    float C[16] = {
+        2, 4, 6, 8,
+        2, 4, 6, 8,
+
+        2, 4, 6, 8,
+        2, 4, 6, 8};
+
+    float D[16] = {
+        1, 2, 3, 4,
+        5, 6, 7, 8,
+
+        1, 2, 3, 4,
+        5, 6, 7, 8};
+    // Upload A, B and C to device buffers; D_d is allocated but not initialised from host D.
+    void *A_d, *B_d, *C_d, *D_d; // Device pointers
+    cudaMalloc((void**)&A_d, 36 * sizeof(float));
+    cudaMalloc((void**)&B_d, 36 * sizeof(float));
+    cudaMalloc((void**)&C_d, 16 * sizeof(float));
+    cudaMalloc((void**)&D_d, 16 * sizeof(float));
+
+    cudaMemcpy(A_d, (void*)A, 36 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B, 36 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice);
+    // Check that every device pointer is 128-byte aligned.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+    // Copy the result back into host D before printing it.
+    cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    print_tensor_s(nmode_D, extents_D, strides_D, D);
+    // Release the device buffers first, then the TAPP objects.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+
+void one_ext_transfered() // Demo: product in which an extent-1 index ('b') of A is absent from B and reappears in C and D.
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+    // Mode 1 of A has extent 1; it is labelled 'b' in idx_A below.
+    int nmode_A = 4;
+    int64_t extents_A[4] = {4, 1, 3, 3};
+    int64_t strides_A[4] = {1, 4, 4, 12};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
+
+    int nmode_B = 4;
+    int64_t extents_B[4] = {3, 2, 2, 3};
+    int64_t strides_B[4] = {1, 3, 6, 12};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
+    // C and D also carry an extent-1 mode at position 1, labelled 'b' below.
+    int nmode_C = 4;
+    int64_t extents_C[4] = {4, 1, 2, 2};
+    int64_t strides_C[4] = {1, 4, 4, 8};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 4;
+    int64_t extents_D[4] = {4, 1, 2, 2};
+    int64_t strides_D[4] = {1, 4, 4, 8};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
+
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[4] = {'a', 'b', 'c', 'd'};
+    int64_t idx_B[4] = {'d', 'e', 'f', 'c'};
+    int64_t idx_C[4] = {'a', 'b', 'e', 'f'};
+    int64_t idx_D[4] = {'a', 'b', 'e', 'f'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+
+    float alpha = 1;
+
+    float A[36] = {
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1};
+
+    float B[36] = {
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6,
+
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6,
+
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6};
+
+    float beta = 0;
+
+    float C[16] = {
+        2, 4, 6, 8,
+        2, 4, 6, 8,
+
+        2, 4, 6, 8,
+        2, 4, 6, 8};
+
+    float D[16] = {
+        1, 2, 3, 4,
+        5, 6, 7, 8,
+
+        1, 2, 3, 4,
+        5, 6, 7, 8};
+    // Upload A, B and C to device buffers; D_d is allocated but not initialised from host D.
+    void *A_d, *B_d, *C_d, *D_d; // Device pointers
+    cudaMalloc((void**)&A_d, 36 * sizeof(float));
+    cudaMalloc((void**)&B_d, 36 * sizeof(float));
+    cudaMalloc((void**)&C_d, 16 * sizeof(float));
+    cudaMalloc((void**)&D_d, 16 * sizeof(float));
+
+    cudaMemcpy(A_d, (void*)A, 36 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B, 36 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice);
+    // Check that every device pointer is 128-byte aligned.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+    // Copy the result back into host D before printing it.
+    cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    print_tensor_s(nmode_D, extents_D, strides_D, D);
+    // Release the device buffers first, then the TAPP objects.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+
+void chained_diff_op() // Demo: runs one plan, then feeds its device result D_d into a second, different plan that writes E_d.
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+
+    int nmode_A = 3;
+    int64_t extents_A[3] = {4, 3, 3};
+    int64_t strides_A[3] = {1, 4, 12};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
+
+    int nmode_B = 4;
+    int64_t extents_B[4] = {3, 2, 2, 3};
+    int64_t strides_B[4] = {1, 3, 6, 12};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 3;
+    int64_t extents_C[3] = {4, 2, 2};
+    int64_t strides_C[3] = {1, 4, 8};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 3;
+    int64_t extents_D[3] = {4, 2, 2};
+    int64_t strides_D[3] = {1, 4, 8};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
+    // First plan: indices 'b' and 'c' are shared by A and B and absent from C and D.
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[3] = {'a', 'b', 'c'};
+    int64_t idx_B[4] = {'c', 'd', 'e', 'b'};
+    int64_t idx_C[3] = {'a', 'd', 'e'};
+    int64_t idx_D[3] = {'a', 'd', 'e'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+
+    float alpha = 2;
+
+    float A[36] = {
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1,
+        1, 2, 1.01, -1};
+
+    float B[36] = {
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6,
+
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6,
+
+        1, 1, 1,
+        2, 2, 2,
+
+        3, 3, 3,
+        6, 6, 6};
+
+    float beta = 0;
+
+    float C[16] = {
+        2, 4, 6, 8,
+        2, 4, 6, 8,
+
+        2, 4, 6, 8,
+        2, 4, 6, 8};
+
+    float D[16] = {
+        1, 2, 3, 4,
+        5, 6, 7, 8,
+
+        1, 2, 3, 4,
+        5, 6, 7, 8};
+    // Upload A, B and C to device buffers; D_d is allocated but not initialised from host D.
+    void *A_d, *B_d, *C_d, *D_d; // Device pointers
+    cudaMalloc((void**)&A_d, 36 * sizeof(float));
+    cudaMalloc((void**)&B_d, 36 * sizeof(float));
+    cudaMalloc((void**)&C_d, 16 * sizeof(float));
+    cudaMalloc((void**)&D_d, 16 * sizeof(float));
+
+    cudaMemcpy(A_d, (void*)A, 36 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B, 36 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice);
+    // Check that every device pointer is 128-byte aligned.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+    // Copy the first result to the host for printing; D_d stays on the device for the second operation.
+    cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    printf("\tOperation 1:\n");
+    print_tensor_s(nmode_D, extents_D, strides_D, D);
+
+    alpha = 0.5;
+
+    int nmode_E = 3;
+    int64_t extents_E[3] = {4, 2, 2};
+    int64_t strides_E[3] = {1, 4, 8};
+    TAPP_tensor_info info_E;
+    TAPP_create_tensor_info(&info_E, handle, TAPP_F32, nmode_E, extents_E, strides_E);
+    // Second plan: info_D/idx_D and info_C/idx_C fill the A and B positions, info_C the C position, info_E the output.
+    TAPP_tensor_product plan2;
+    TAPP_element_op op_E = TAPP_IDENTITY;
+    int64_t idx_E[3] = {'a', 'd', 'e'};
+    TAPP_create_tensor_product(&plan2, handle, op_D, info_D, idx_D, op_C, info_C, idx_C, op_C, info_C, idx_C, op_E, info_E, idx_E, prec);
+
+    float E[16] = {
+        1, 2, 3, 4,
+        5, 6, 7, 8,
+
+        1, 2, 3, 4,
+        5, 6, 7, 8};
+
+    void* E_d; // Device pointer
+    cudaMalloc((void**)&E_d, 16 * sizeof(float));
+
+    cudaMemcpy(E_d, (void*)E, 16 * sizeof(float), cudaMemcpyHostToDevice);
+
+    assert(uintptr_t(E_d) % 128 == 0);
+    // Inputs are the device buffers D_d and C_d (C_d is passed twice); the output goes to E_d.
+    TAPP_execute_product(plan2, exec, &status, (void *)&alpha, (void *)D_d, (void *)C_d, (void *)&beta, (void *)C_d, (void *)E_d);
+
+    cudaMemcpy((void*)E, (void*)E_d, 16 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    printf("\tOperation 2:\n");
+    print_tensor_s(nmode_E, extents_E, strides_E, E);
+    // Release the device buffers first, then both plans and the remaining TAPP objects.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    if (E_d) cudaFree(E_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_product(plan2);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_tensor_info(info_E);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+
+// Demo: executes the same plan twice on device buffers, feeding the first
+// result (D_d) back in as the second operand of the second execution.
+// All four tensors are 4x4 F32 and share the index labels {'a', 'b'}.
+void chained_same_op()
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+
+    int nmode_A = 2;
+    int64_t extents_A[2] = {4, 4};
+    int64_t strides_A[2] = {1, 4};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
+
+    int nmode_B = 2;
+    int64_t extents_B[2] = {4, 4};
+    int64_t strides_B[2] = {1, 4};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 2;
+    int64_t extents_C[2] = {4, 4};
+    int64_t strides_C[2] = {1, 4};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 2;
+    int64_t extents_D[2] = {4, 4};
+    int64_t strides_D[2] = {1, 4};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
+
+    // Identity element ops on every operand and the default precision.
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[2] = {'a', 'b'};
+    int64_t idx_B[2] = {'a', 'b'};
+    int64_t idx_C[2] = {'a', 'b'};
+    int64_t idx_D[2] = {'a', 'b'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+
+    float alpha = 3;
+
+    float A[16] = {
+        1, 2, 3, 4,
+        1, 2, 3, 4,
+        1, 2, 3, 4,
+        1, 2, 3, 4};
+
+    float B[16] = {
+        1, 1, 1, 1,
+        2, 2, 2, 2,
+        3, 3, 3, 3,
+        4, 4, 4, 4};
+
+    float beta = 2;
+
+    float C[16] = {
+        1, 2, 1, 2,
+        1, 2, 1, 2,
+        1, 2, 1, 2,
+        1, 2, 1, 2};
+
+    float D[16] = {
+        1, 2, 3, 4,
+        5, 6, 7, 8,
+        9, 10, 11, 12,
+        13, 14, 15, 16};
+
+    // Device copies of A, B and C; D_d is allocated but not initialised from host D.
+    void *A_d, *B_d, *C_d, *D_d; // Device pointers
+    cudaMalloc((void**)&A_d, 16 * sizeof(float));
+    cudaMalloc((void**)&B_d, 16 * sizeof(float));
+    cudaMalloc((void**)&C_d, 16 * sizeof(float));
+    cudaMalloc((void**)&D_d, 16 * sizeof(float));
+
+    cudaMemcpy(A_d, (void*)A, 16 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B, 16 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 16 * sizeof(float), cudaMemcpyHostToDevice);
+
+    // Check that every device pointer is 128-byte aligned.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+
+    // First execution: inputs A_d, B_d and C_d, output D_d.
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+
+    // Bring D back to the host for printing; D_d stays on the device.
+    cudaMemcpy((void*)D, (void*)D_d, 16 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    printf("\tOperation 1:\n");
+    print_tensor_s(nmode_D, extents_D, strides_D, D);
+
+    // Second execution: same plan, new scalars, D_d takes the place of B_d
+    // and the output goes to a separate buffer E_d.
+    alpha = 1;
+    beta = 2;
+    float E[16] = {
+        1, 2, 3, 4,
+        5, 6, 7, 8,
+        9, 10, 11, 12,
+        13, 14, 15, 16};
+
+    void* E_d; // Device pointer
+    cudaMalloc((void**)&E_d, 16 * sizeof(float));
+
+    cudaMemcpy(E_d, (void*)E, 16 * sizeof(float), cudaMemcpyHostToDevice);
+
+    assert(uintptr_t(E_d) % 128 == 0);
+
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)D_d, (void *)&beta, (void *)C_d, (void *)E_d);
+
+    // Bring E back to the host for printing.
+    cudaMemcpy((void*)E, (void*)E_d, 16 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    printf("\tOperation 2:\n");
+    print_tensor_s(nmode_D, extents_D, strides_D, E);
+
+    // Release every device buffer, including E_d from the second execution.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    if (E_d) cudaFree(E_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+
+/*void negative_str() //cutensor does not support negative strides
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+
+    int nmode_A = 3;
+    int64_t extents_A[3] = {4, 3, 3};
+    int64_t strides_A[3] = {-1, -4, -12};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
+
+    int nmode_B = 4;
+    int64_t extents_B[4] = {3, 2, 2, 3};
+    int64_t strides_B[4] = {-1, -3, -6, -12};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 3;
+    int64_t extents_C[3] = {4, 2, 2};
+    int64_t strides_C[3] = {1, 4, 8};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
+
+    int nmode_D = 3;
+    int64_t extents_D[3] = {4, 2, 2};
+    int64_t strides_D[3] = {1, 4, 8};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
+
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[3] = {'a', 'b', 'c'};
+    int64_t idx_B[4] = {'c', 'd', 'e', 'b'};
+    int64_t idx_C[3] = {'a', 'd', 'e'};
+    int64_t idx_D[3] = {'a', 'd', 'e'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+
+    float alpha = 1;
+
+    float A[36] = {
+        -1, 1.01, 2, 1,
+        -1, 1.01, 2, 1,
+        -1, 1.01, 2, 1,
+
+        -1, 1.01, 2, 1,
+        -1, 1.01, 2, 1,
+        -1, 1.01, 2, 1,
+
+        -1, 1.01, 2, 1,
+        -1, 1.01, 2, 1,
+        -1, 1.01, 2, 1};
+
+    float B[36] = {
+        6, 6, 6,
+        3, 3, 3,
+
+        2, 2, 2,
+        1, 1, 1,
+
+        6, 6, 6,
+        3, 3, 3,
+
+        2, 2, 2,
+        1, 1, 1,
+
+        6, 6, 6,
+        3, 3, 3,
+
+        2, 2, 2,
+        1, 1, 1};
+
+    float beta = 0;
+
+    float C[16] = {
+        2, 4, 6, 8,
+        2, 4, 6, 8,
+
+        2, 4, 6, 8,
+        2, 4, 6, 8};
+
+    float D[16] = {
+        1, 2, 3, 4,
+        5, 6, 7, 8,
+
+        1, 2, 3, 4,
+        5, 6, 7, 8};
+
+    float *A_ptr = &A[35];
+    float *B_ptr = &B[35];
+
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D);
+
+    print_tensor_s(nmode_D, extents_D, strides_D, D);
+
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}*/
+
+void subtensors() // Demo: product whose A and B operands are strided views starting at an offset inside larger host arrays.
+{
+    TAPP_handle handle;
+    TAPP_create_handle(&handle);
+    // The strides of A (1, 12, 24) skip elements of the 48-element host array declared below.
+    int nmode_A = 3;
+    int64_t extents_A[3] = {3, 2, 2};
+    int64_t strides_A[3] = {1, 12, 24};
+    TAPP_tensor_info info_A;
+    TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
+    // The smallest stride of B is 3, so only every third element of the uploaded data is addressed.
+    int nmode_B = 3;
+    int64_t extents_B[3] = {2, 2, 3};
+    int64_t strides_B[3] = {3, 6, 12};
+    TAPP_tensor_info info_B;
+    TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
+
+    int nmode_C = 2;
+    int64_t extents_C[2] = {3, 3};
+    int64_t strides_C[2] = {1, 3};
+    TAPP_tensor_info info_C;
+    TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
+    // NOTE(review): D is described with strides {1, 3} here but printed below with strides {1, 4} - confirm this is intended.
+    int nmode_D = 2;
+    int64_t extents_D[2] = {3, 3};
+    int64_t strides_D[2] = {1, 3};
+    TAPP_tensor_info info_D;
+    TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
+
+    TAPP_tensor_product plan;
+    TAPP_element_op op_A = TAPP_IDENTITY;
+    TAPP_element_op op_B = TAPP_IDENTITY;
+    TAPP_element_op op_C = TAPP_IDENTITY;
+    TAPP_element_op op_D = TAPP_IDENTITY;
+    int64_t idx_A[3] = {'a', 'b', 'c'};
+    int64_t idx_B[3] = {'b', 'c', 'd'};
+    int64_t idx_C[2] = {'a', 'd'};
+    int64_t idx_D[2] = {'a', 'd'};
+    TAPP_prectype prec = TAPP_DEFAULT_PREC;
+    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+
+    TAPP_executor exec;
+    TAPP_create_executor(&exec);
+    TAPP_status status;
+
+    float alpha = 1;
+
+    float A[48] = {
+        0,
+        0,
+        0,
+        0,
+        0,
+        2,
+        1.01,
+        -1,
+        0,
+        0,
+        0,
+        0,
+
+        0,
+        0,
+        0,
+        0,
+        0,
+        2,
+        1.01,
+        -1,
+        0,
+        0,
+        0,
+        0,
+
+        0,
+        0,
+        0,
+        0,
+        0,
+        2,
+        1.01,
+        -1,
+        0,
+        0,
+        0,
+        0,
+
+        0,
+        0,
+        0,
+        0,
+        0,
+        2,
+        1.01,
+        -1,
+        0,
+        0,
+        0,
+        0,
+    };
+
+    float B[36] = {
+        0, 1, 0,
+        0, 2, 0,
+
+        0, 3, 0,
+        0, 4, 0,
+
+        0, 2, 0,
+        0, 4, 0,
+
+        0, 6, 0,
+        0, 8, 0,
+
+        0, 3, 0,
+        0, 6, 0,
+
+        0, 9, 0,
+        0, 12, 0};
+
+    float beta = 0.5;
+
+    float C[9] = {
+        2, 4, 6,
+        2, 4, 6,
+        2, 4, 6};
+
+    float D[12] = {
+        1, 2, 3, 4,
+        5, 6, 7, 8,
+        9, 10, 11, 12};
+    // The views start at element 5 of A and element 1 of B.
+    float *A_ptr = &A[5];
+
+    float *B_ptr = &B[1];
+    // Only the tail of each host array from the view start onward is uploaded: 48 - 5 = 43 and 36 - 1 = 35 floats.
+    void *A_d, *B_d, *C_d, *D_d; // Device pointers
+    cudaMalloc((void**)&A_d, 43 * sizeof(float));
+    cudaMalloc((void**)&B_d, 35 * sizeof(float));
+    cudaMalloc((void**)&C_d, 9 * sizeof(float));
+    cudaMalloc((void**)&D_d, 12 * sizeof(float));
+
+    cudaMemcpy(A_d, (void*)A_ptr, 43 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(B_d, (void*)B_ptr, 35 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(C_d, (void*)C, 9 * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(D_d, (void*)D, 12 * sizeof(float), cudaMemcpyHostToDevice);
+    // Check that every device pointer is 128-byte aligned.
+    assert(uintptr_t(A_d) % 128 == 0);
+    assert(uintptr_t(B_d) % 128 == 0);
+    assert(uintptr_t(C_d) % 128 == 0);
+    assert(uintptr_t(D_d) % 128 == 0);
+
+    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_d, (void *)B_d, (void *)&beta, (void *)C_d, (void *)D_d);
+    // The whole 12-element D buffer is printed as a 4x3 tensor rather than with info_D's 3x3 layout.
+    int64_t super_extents_D[2] = {4, 3};
+    int64_t super_strides_D[2] = {1, 4};
+
+    cudaMemcpy((void*)D, (void*)D_d, 12 * sizeof(float), cudaMemcpyDeviceToHost);
+    print_tensor_s(nmode_D, super_extents_D, super_strides_D, D);
+    // Release the device buffers first, then the TAPP objects.
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (D_d) cudaFree(D_d);
+    TAPP_destroy_tensor_product(plan);
+    TAPP_destroy_tensor_info(info_A);
+    TAPP_destroy_tensor_info(info_B);
+    TAPP_destroy_tensor_info(info_C);
+    TAPP_destroy_tensor_info(info_D);
+    TAPP_destroy_executor(exec);
+    TAPP_destroy_handle(handle);
+}
+
+void print_tensor_c_cpp(int nmode, const int64_t *extents, const int64_t *strides, const std::complex<float> *data)
+{
+    // Prints every element of a strided complex<float> tensor to stdout as "re+imi".
+    // Elements are visited with a multi-index whose first mode advances fastest.
+    int64_t *pos = (int64_t *)malloc(nmode * sizeof(int64_t));
+    int64_t total = 1;
+    for (size_t m = 0; m < nmode; ++m)
+    {
+        pos[m] = 0;
+        total *= extents[m];
+    }
+    printf("\t");
+    for (size_t count = 0; count < total; ++count)
+    {
+        // Offset of the current element: sum of position times stride over all modes.
+        int64_t offset = 0;
+        for (size_t m = 0; m < nmode; ++m)
+            offset += pos[m] * strides[m];
+        const std::complex<float> &elem = data[offset];
+        printf("%.3f+%.3fi", elem.real(), elem.imag());
+        if (nmode > 0)
+        {
+            // Step the multi-index; a wrap in one mode carries into the next mode.
+            int mode = 0;
+            bool carry = true;
+            while (carry)
+            {
+                if (mode == 0)
+                    printf(" "); // first mode: separate elements with a space
+                else
+                {
+                    printf("\n"); // a lower mode wrapped: start a new output line
+                    if (count < total - 1)
+                        printf("\t"); // indent unless this was the last element
+                }
+                pos[mode] = (pos[mode] + 1) % extents[mode];
+                carry = (pos[mode] == 0) && (mode + 1 < nmode);
+                ++mode;
+            }
+        }
+    }
+    free(pos);
+}
\ No newline at end of file
diff --git a/test/demo.c b/test/demo.c
index 3f26335..6cd6a42 100644
--- a/test/demo.c
+++ b/test/demo.c
@@ -10,6 +10,74 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <complex.h>
+#ifdef TAPP_DYNAMIC_LAUNCH
+#include <dlfcn.h>  // POSIX dynamic loading, TODO: fix for windows
+#include <unistd.h>
+#endif
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+const char* path = "./cutensor_bindings/libtapp-cutensor.so";
+#endif
+
+void* dlhandle; // Assigned from dlopen(path, RTLD_LAZY) in load_implementation() when TAPP_DYNAMIC_LAUNCH is defined.
+TAPP_error (*fn_TAPP_attr_set)(TAPP_attr attr, TAPP_key key, void* value); // Each fn_TAPP_* pointer is filled by dlsym() with the symbol of the same name minus the "fn_" prefix.
+TAPP_error (*fn_TAPP_attr_get)(TAPP_attr attr, TAPP_key key, void** value);
+TAPP_error (*fn_TAPP_attr_clear)(TAPP_attr attr, TAPP_key key);
+bool (*fn_TAPP_check_success)(TAPP_error error);
+size_t (*fn_TAPP_explain_error)(TAPP_error error, size_t maxlen, char* message);
+TAPP_error (*fn_TAPP_create_executor)(TAPP_executor* exec);
+TAPP_error (*fn_TAPP_destroy_executor)(TAPP_executor exec);
+TAPP_error (*fn_TAPP_create_handle)(TAPP_handle* handle);
+TAPP_error (*fn_TAPP_destroy_handle)(TAPP_handle handle);
+TAPP_error (*fn_TAPP_create_tensor_product)(TAPP_tensor_product* plan,
+                                            TAPP_handle handle,
+                                            TAPP_element_op op_A,
+                                            TAPP_tensor_info A,
+                                            const int64_t* idx_A,
+                                            TAPP_element_op op_B,
+                                            TAPP_tensor_info B,
+                                            const int64_t* idx_B,
+                                            TAPP_element_op op_C,
+                                            TAPP_tensor_info C,
+                                            const int64_t* idx_C,
+                                            TAPP_element_op op_D,
+                                            TAPP_tensor_info D,
+                                            const int64_t* idx_D,
+                                            TAPP_prectype prec);
+TAPP_error (*fn_TAPP_destroy_tensor_product)(TAPP_tensor_product plan);
+TAPP_error (*fn_TAPP_execute_product)(TAPP_tensor_product plan,
+                                    TAPP_executor exec,
+                                    TAPP_status* status,
+                                    const void* alpha,
+                                    const void* A,
+                                    const void* B,
+                                    const void* beta,
+                                    const void* C,
+                                            void* D);
+TAPP_error (*fn_TAPP_execute_batched_product)(TAPP_tensor_product plan,
+                                            TAPP_executor exec,
+                                            TAPP_status* status,
+                                            int num_batches,
+                                            const void* alpha,
+                                            const void** A,
+                                            const void** B,
+                                            const void* beta,
+                                            const void** C,
+                                                    void** D);
+TAPP_error (*fn_TAPP_destroy_status)(TAPP_status status);
+TAPP_error (*fn_TAPP_create_tensor_info)(TAPP_tensor_info* info,
+                                        TAPP_handle handle,
+                                        TAPP_datatype type,
+                                        int nmode,
+                                        const int64_t* extents,
+                                        const int64_t* strides);
+TAPP_error (*fn_TAPP_destroy_tensor_info)(TAPP_tensor_info info);
+int (*fn_TAPP_get_nmodes)(TAPP_tensor_info info);
+TAPP_error (*fn_TAPP_set_nmodes)(TAPP_tensor_info info, int nmodes);
+void (*fn_TAPP_get_extents)(TAPP_tensor_info info, int64_t* extents);
+TAPP_error (*fn_TAPP_set_extents)(TAPP_tensor_info info, const int64_t* extents);
+void (*fn_TAPP_get_strides)(TAPP_tensor_info info, int64_t* strides);
+TAPP_error (*fn_TAPP_set_strides)(TAPP_tensor_info info, const int64_t* strides);
 
 void contraction();
 void hadamard();
@@ -23,15 +91,107 @@ void chained_same_op();
 void negative_str();
 void subtensors();
 
+/*
+ * Binds the fn_TAPP_* function pointers that the demos below call through.
+ *
+ * With TAPP_DYNAMIC_LAUNCH defined, the implementation is opened from `path`
+ * with dlopen() and every entry point is looked up by name with dlsym().
+ * Otherwise the pointers are assigned the TAPP_* functions this program is
+ * linked against.
+ *
+ * If the library cannot be opened, or dlerror() reports a failure after the
+ * lookups, the program exits with EXIT_FAILURE. main() runs the demos right
+ * after this call and they invoke the pointers unconditionally, so returning
+ * with them unset (or aimed at a library that was just closed) would crash.
+ */
+void load_implementation() {
+#ifdef TAPP_DYNAMIC_LAUNCH
+    dlhandle = dlopen(path, RTLD_LAZY);
+    if (!dlhandle) {
+        fprintf(stderr, "dlopen failed: %s\n", dlerror());
+        exit(EXIT_FAILURE);
+    }
+    dlerror(); // Discard any pending error so the check below only sees the lookups
+    *(void**)(&fn_TAPP_attr_set) = dlsym(dlhandle, "TAPP_attr_set");
+    *(void**)(&fn_TAPP_attr_get) = dlsym(dlhandle, "TAPP_attr_get");
+    *(void**)(&fn_TAPP_attr_clear) = dlsym(dlhandle, "TAPP_attr_clear");
+    *(void**)(&fn_TAPP_check_success) = dlsym(dlhandle, "TAPP_check_success");
+    *(void**)(&fn_TAPP_explain_error) = dlsym(dlhandle, "TAPP_explain_error");
+    *(void**)(&fn_TAPP_create_executor) = dlsym(dlhandle, "TAPP_create_executor");
+    *(void**)(&fn_TAPP_destroy_executor) = dlsym(dlhandle, "TAPP_destroy_executor");
+    *(void**)(&fn_TAPP_create_handle) = dlsym(dlhandle, "TAPP_create_handle");
+    *(void**)(&fn_TAPP_destroy_handle) = dlsym(dlhandle, "TAPP_destroy_handle");
+    *(void**)(&fn_TAPP_create_tensor_product) = dlsym(dlhandle, "TAPP_create_tensor_product");
+    *(void**)(&fn_TAPP_destroy_tensor_product) = dlsym(dlhandle, "TAPP_destroy_tensor_product");
+    *(void**)(&fn_TAPP_execute_product) = dlsym(dlhandle, "TAPP_execute_product");
+    *(void**)(&fn_TAPP_execute_batched_product) = dlsym(dlhandle, "TAPP_execute_batched_product");
+    *(void**)(&fn_TAPP_destroy_status) = dlsym(dlhandle, "TAPP_destroy_status");
+    *(void**)(&fn_TAPP_create_tensor_info) = dlsym(dlhandle, "TAPP_create_tensor_info");
+    *(void**)(&fn_TAPP_destroy_tensor_info) = dlsym(dlhandle, "TAPP_destroy_tensor_info");
+    *(void**)(&fn_TAPP_get_nmodes) = dlsym(dlhandle, "TAPP_get_nmodes");
+    *(void**)(&fn_TAPP_set_nmodes) = dlsym(dlhandle, "TAPP_set_nmodes");
+    *(void**)(&fn_TAPP_get_extents) = dlsym(dlhandle, "TAPP_get_extents");
+    *(void**)(&fn_TAPP_set_extents) = dlsym(dlhandle, "TAPP_set_extents");
+    *(void**)(&fn_TAPP_get_strides) = dlsym(dlhandle, "TAPP_get_strides");
+    *(void**)(&fn_TAPP_set_strides) = dlsym(dlhandle, "TAPP_set_strides");
+    // NOTE(review): dlerror() is only queried once, after all lookups. If the
+    // platform resets its error state on each dl* call, only a failure of the
+    // last lookup is seen here -- confirm, or check every symbol individually.
+    const char* error = dlerror();
+    if (error != NULL) {
+        fprintf(stderr, "dlsym failed: %s\n", error);
+        dlclose(dlhandle);
+        dlhandle = NULL; // Stops unload_implementation() from closing it a second time
+        exit(EXIT_FAILURE);
+    }
+#else
+    // Static binding. The commented-out assignments are entry points that the
+    // reference implementation does not provide; their pointers stay unset.
+    //fn_TAPP_attr_set = TAPP_attr_set; Not implemented in the reference implementation
+    //fn_TAPP_attr_get = TAPP_attr_get; Not implemented in the reference implementation
+    //fn_TAPP_attr_clear = TAPP_attr_clear; Not implemented in the reference implementation
+    fn_TAPP_check_success = TAPP_check_success;
+    fn_TAPP_explain_error = TAPP_explain_error;
+    fn_TAPP_create_executor = TAPP_create_executor;
+    fn_TAPP_destroy_executor = TAPP_destroy_executor;
+    fn_TAPP_create_handle = TAPP_create_handle;
+    fn_TAPP_destroy_handle = TAPP_destroy_handle;
+    fn_TAPP_create_tensor_product = TAPP_create_tensor_product;
+    fn_TAPP_destroy_tensor_product = TAPP_destroy_tensor_product;
+    fn_TAPP_execute_product = TAPP_execute_product;
+    //fn_TAPP_execute_batched_product = TAPP_execute_batched_product; Not implemented in the reference implementation
+    //fn_TAPP_destroy_status = TAPP_destroy_status; Not implemented in the reference implementation
+    fn_TAPP_create_tensor_info = TAPP_create_tensor_info;
+    fn_TAPP_destroy_tensor_info = TAPP_destroy_tensor_info;
+    fn_TAPP_get_nmodes = TAPP_get_nmodes;
+    fn_TAPP_set_nmodes = TAPP_set_nmodes;
+    fn_TAPP_get_extents = TAPP_get_extents;
+    fn_TAPP_set_extents = TAPP_set_extents;
+    fn_TAPP_get_strides = TAPP_get_strides;
+    fn_TAPP_set_strides = TAPP_set_strides;
+#endif
+}
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+// Closes the library handle held in dlhandle, if there is one.
+void unload_implementation() {
+    if (dlhandle == NULL) return;
+    dlclose(dlhandle);
+    dlhandle = NULL; // A repeated call then falls through the guard above
+}
+#endif
+
 int main(int argc, char const *argv[])
 {
+    load_implementation();
+    
     printf("Contraction: \n");
     contraction();
     printf("Hadamard: \n");
     hadamard();
     printf("Complex: \n");
     complex_num();
-    printf("Conjugate: \n");
+    printf("Conjugate: \n"); // Incorrect result with CuTensor, CuTensor applies opC in another way
     conjugate();
     printf("Zero dim: \n");
     zero_dim();
@@ -43,40 +184,51 @@ int main(int argc, char const *argv[])
     chained_diff_op();
     printf("Chained same op: \n");
     chained_same_op();
-    printf("Negative str: \n");
+    printf("Negative str: \n"); // Doesn't work with CuTensor, does not allow negative strides
     negative_str();
     printf("Subtensors: \n");
     subtensors();
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    unload_implementation();
+#endif
+
     return 0;
 }
 
 void contraction()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 3;
     int64_t extents_A[3] = {4, 3, 3};
     int64_t strides_A[3] = {1, 4, 12};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 4;
     int64_t extents_B[4] = {3, 2, 2, 3};
     int64_t strides_B[4] = {1, 3, 6, 12};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 3;
     int64_t extents_C[3] = {4, 2, 2};
     int64_t strides_C[3] = {1, 4, 8};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 3;
     int64_t extents_D[3] = {4, 2, 2};
     int64_t strides_D[3] = {1, 4, 8};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_handle handle;
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -87,10 +239,10 @@ void contraction()
     int64_t idx_C[3] = {'a', 'd', 'e'};
     int64_t idx_D[3] = {'a', 'd', 'e'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     // int exec_id = 1;
     // exec = (intptr_t)&exec_id;
     TAPP_status status;
@@ -145,51 +297,59 @@ void contraction()
         1, 2, 3, 4,
         5, 6, 7, 8};
 
-    TAPP_error error = TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
-    printf(TAPP_check_success(error) ? "Success\n" : "Fail\n");
-    int message_len = TAPP_explain_error(error, 0, NULL);
+    TAPP_error error = fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
+    printf(fn_TAPP_check_success(error) ? "Success\n" : "Fail\n");
+    int message_len = fn_TAPP_explain_error(error, 0, NULL);
     char *message_buff = malloc((message_len + 1) * sizeof(char));
-    TAPP_explain_error(error, message_len + 1, message_buff);
-    printf(message_buff);
+    fn_TAPP_explain_error(error, message_len + 1, message_buff);
+    printf("%s", message_buff);
     free(message_buff);
 
     print_tensor_s(nmode_D, extents_D, strides_D, D);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void hadamard()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 2;
     int64_t extents_A[2] = {4, 4};
     int64_t strides_A[2] = {1, 4};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 2;
     int64_t extents_B[2] = {4, 4};
     int64_t strides_B[2] = {1, 4};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 2;
     int64_t extents_C[2] = {4, 4};
     int64_t strides_C[2] = {1, 4};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 2;
     int64_t extents_D[2] = {4, 4};
     int64_t strides_D[2] = {1, 4};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -200,10 +360,10 @@ void hadamard()
     int64_t idx_C[2] = {'a', 'b'};
     int64_t idx_D[2] = {'a', 'b'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float alpha = 3;
@@ -247,45 +407,53 @@ void hadamard()
         16,
     };
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
 
     print_tensor_s(nmode_D, extents_D, strides_D, D);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void complex_num()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 2;
     int64_t extents_A[2] = {3, 3};
     int64_t strides_A[2] = {1, 3};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_C32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_C32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 2;
     int64_t extents_B[2] = {3, 3};
     int64_t strides_B[2] = {1, 3};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_C32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_C32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 2;
     int64_t extents_C[2] = {3, 3};
     int64_t strides_C[2] = {1, 3};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_C32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_C32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 2;
     int64_t extents_D[2] = {3, 3};
     int64_t strides_D[2] = {1, 3};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_C32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_C32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -296,10 +464,10 @@ void complex_num()
     int64_t idx_C[2] = {'a', 'c'};
     int64_t idx_D[2] = {'a', 'c'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float complex alpha = 1;
@@ -326,45 +494,53 @@ void complex_num()
         4 + 4 * I, 5 + 5 * I, 6 + 6 * I,
         7 + 7 * I, 8 + 8 * I, 9 + 2 * I};
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
 
     print_tensor_c(nmode_D, extents_D, strides_D, D);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void conjugate()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 2;
     int64_t extents_A[2] = {3, 3};
     int64_t strides_A[2] = {1, 3};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_C32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_C32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 2;
     int64_t extents_B[2] = {3, 3};
     int64_t strides_B[2] = {1, 3};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_C32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_C32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 2;
     int64_t extents_C[2] = {3, 3};
     int64_t strides_C[2] = {1, 3};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_C32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_C32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 2;
     int64_t extents_D[2] = {3, 3};
     int64_t strides_D[2] = {1, 3};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_C32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_C32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_CONJUGATE;
@@ -375,10 +551,10 @@ void conjugate()
     int64_t idx_C[2] = {'a', 'c'};
     int64_t idx_D[2] = {'a', 'c'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float complex alpha = 1;
@@ -405,45 +581,53 @@ void conjugate()
         4 + 4 * I, 5 + 5 * I, 6 + 6 * I,
         7 + 7 * I, 8 + 8 * I, 9 + 2 * I};
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
 
     print_tensor_c(nmode_D, extents_D, strides_D, D);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void zero_dim()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+    
     int nmode_A = 0;
     int64_t extents_A[0] = {};
     int64_t strides_A[0] = {};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 2;
     int64_t extents_B[2] = {3, 3};
     int64_t strides_B[2] = {1, 3};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 2;
     int64_t extents_C[2] = {3, 3};
     int64_t strides_C[2] = {1, 3};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 2;
     int64_t extents_D[2] = {3, 3};
     int64_t strides_D[2] = {1, 3};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -454,10 +638,10 @@ void zero_dim()
     int64_t idx_C[2] = {'a', 'b'};
     int64_t idx_D[2] = {'a', 'b'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float alpha = 1;
@@ -482,45 +666,53 @@ void zero_dim()
         2, 2, 2,
         2, 2, 2};
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
 
     print_tensor_s(nmode_D, extents_D, strides_D, D);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void one_ext_contracted()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 4;
     int64_t extents_A[4] = {4, 1, 3, 3};
     int64_t strides_A[4] = {1, 4, 4, 12};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 5;
     int64_t extents_B[5] = {3, 2, 1, 2, 3};
     int64_t strides_B[5] = {1, 3, 6, 6, 12};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 3;
     int64_t extents_C[3] = {4, 2, 2};
     int64_t strides_C[3] = {1, 4, 8};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 3;
     int64_t extents_D[3] = {4, 2, 2};
     int64_t strides_D[3] = {1, 4, 8};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -531,10 +723,10 @@ void one_ext_contracted()
     int64_t idx_C[3] = {'a', 'e', 'f'};
     int64_t idx_D[3] = {'a', 'e', 'f'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float alpha = 1;
@@ -587,45 +779,53 @@ void one_ext_contracted()
         1, 2, 3, 4,
         5, 6, 7, 8};
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
 
     print_tensor_s(nmode_D, extents_D, strides_D, D);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void one_ext_transfered()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 4;
     int64_t extents_A[4] = {4, 1, 3, 3};
     int64_t strides_A[4] = {1, 4, 4, 12};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 4;
     int64_t extents_B[4] = {3, 2, 2, 3};
     int64_t strides_B[4] = {1, 3, 6, 12};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 4;
     int64_t extents_C[4] = {4, 1, 2, 2};
     int64_t strides_C[4] = {1, 4, 4, 8};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 4;
     int64_t extents_D[4] = {4, 1, 2, 2};
     int64_t strides_D[4] = {1, 4, 4, 8};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -636,10 +836,10 @@ void one_ext_transfered()
     int64_t idx_C[4] = {'a', 'b', 'e', 'f'};
     int64_t idx_D[4] = {'a', 'b', 'e', 'f'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float alpha = 1;
@@ -692,45 +892,53 @@ void one_ext_transfered()
         1, 2, 3, 4,
         5, 6, 7, 8};
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
 
     print_tensor_s(nmode_D, extents_D, strides_D, D);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void chained_diff_op()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 3;
     int64_t extents_A[3] = {4, 3, 3};
     int64_t strides_A[3] = {1, 4, 12};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 4;
     int64_t extents_B[4] = {3, 2, 2, 3};
     int64_t strides_B[4] = {1, 3, 6, 12};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 3;
     int64_t extents_C[3] = {4, 2, 2};
     int64_t strides_C[3] = {1, 4, 8};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 3;
     int64_t extents_D[3] = {4, 2, 2};
     int64_t strides_D[3] = {1, 4, 8};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -741,10 +949,10 @@ void chained_diff_op()
     int64_t idx_C[3] = {'a', 'd', 'e'};
     int64_t idx_D[3] = {'a', 'd', 'e'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float alpha = 2;
@@ -797,7 +1005,7 @@ void chained_diff_op()
         1, 2, 3, 4,
         5, 6, 7, 8};
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
 
     printf("\tOperation 1:\n");
     print_tensor_s(nmode_D, extents_D, strides_D, D);
@@ -808,12 +1016,12 @@ void chained_diff_op()
     int64_t extents_E[3] = {4, 2, 2};
     int64_t strides_E[3] = {1, 4, 8};
     TAPP_tensor_info info_E;
-    TAPP_create_tensor_info(&info_E, TAPP_F32, nmode_E, extents_E, strides_E);
+    fn_TAPP_create_tensor_info(&info_E, handle, TAPP_F32, nmode_E, extents_E, strides_E);
 
     TAPP_tensor_product plan2;
     TAPP_element_op op_E = TAPP_IDENTITY;
     int64_t idx_E[3] = {'a', 'd', 'e'};
-    TAPP_create_tensor_product(&plan2, handle, op_D, info_D, idx_D, op_C, info_C, idx_C, op_C, info_C, idx_C, op_E, info_E, idx_E, prec);
+    fn_TAPP_create_tensor_product(&plan2, handle, op_D, info_D, idx_D, op_C, info_C, idx_C, op_C, info_C, idx_C, op_E, info_E, idx_E, prec);
 
     float E[16] = {
         1, 2, 3, 4,
@@ -821,48 +1029,56 @@ void chained_diff_op()
 
         1, 2, 3, 4,
         5, 6, 7, 8};
-    TAPP_execute_product(plan2, exec, &status, (void *)&alpha, (void *)D, (void *)C, (void *)&beta, (void *)C, (void *)E);
+    fn_TAPP_execute_product(plan2, exec, &status, (void *)&alpha, (void *)D, (void *)C, (void *)&beta, (void *)C, (void *)E);
 
     printf("\tOperation 2:\n");
     print_tensor_s(nmode_E, extents_E, strides_E, E);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_product(plan2);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_tensor_info(info_E);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_product(plan2);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_tensor_info(info_E);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void chained_same_op()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 2;
     int64_t extents_A[2] = {4, 4};
     int64_t strides_A[2] = {1, 4};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 2;
     int64_t extents_B[2] = {4, 4};
     int64_t strides_B[2] = {1, 4};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 2;
     int64_t extents_C[2] = {4, 4};
     int64_t strides_C[2] = {1, 4};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 2;
     int64_t extents_D[2] = {4, 4};
     int64_t strides_D[2] = {1, 4};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -873,10 +1089,10 @@ void chained_same_op()
     int64_t idx_C[2] = {'a', 'b'};
     int64_t idx_D[2] = {'a', 'b'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float alpha = 3;
@@ -907,7 +1123,7 @@ void chained_same_op()
         9, 10, 11, 12,
         13, 14, 15, 16};
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
 
     printf("\tOperation 1:\n");
     print_tensor_s(nmode_D, extents_D, strides_D, D);
@@ -932,46 +1148,54 @@ void chained_same_op()
         15,
         16,
     };
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)D, (void *)&beta, (void *)C, (void *)E);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)D, (void *)&beta, (void *)C, (void *)E);
 
     printf("\tOperation 2:\n");
     print_tensor_s(nmode_D, extents_D, strides_D, E);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void negative_str()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 3;
     int64_t extents_A[3] = {4, 3, 3};
     int64_t strides_A[3] = {-1, -4, -12};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 4;
     int64_t extents_B[4] = {3, 2, 2, 3};
     int64_t strides_B[4] = {-1, -3, -6, -12};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 3;
     int64_t extents_C[3] = {4, 2, 2};
     int64_t strides_C[3] = {1, 4, 8};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 3;
     int64_t extents_D[3] = {4, 2, 2};
     int64_t strides_D[3] = {1, 4, 8};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -982,10 +1206,10 @@ void negative_str()
     int64_t idx_C[3] = {'a', 'd', 'e'};
     int64_t idx_D[3] = {'a', 'd', 'e'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float alpha = 1;
@@ -1041,45 +1265,53 @@ void negative_str()
     float *A_ptr = &A[35];
     float *B_ptr = &B[35];
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D);
 
     print_tensor_s(nmode_D, extents_D, strides_D, D);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
 
 void subtensors()
 {
+    TAPP_handle handle;
+    fn_TAPP_create_handle(&handle);
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+    bool use_device_memory = false; // CuTensor specific attribute
+    fn_TAPP_attr_set(handle, 0, (void*)&use_device_memory); // CuTensor specific attribute
+#endif
+
     int nmode_A = 3;
     int64_t extents_A[3] = {3, 2, 2};
     int64_t strides_A[3] = {1, 12, 24};
     TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
+    fn_TAPP_create_tensor_info(&info_A, handle, TAPP_F32, nmode_A, extents_A, strides_A);
 
     int nmode_B = 3;
     int64_t extents_B[3] = {2, 2, 3};
     int64_t strides_B[3] = {3, 6, 12};
     TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
+    fn_TAPP_create_tensor_info(&info_B, handle, TAPP_F32, nmode_B, extents_B, strides_B);
 
     int nmode_C = 2;
     int64_t extents_C[2] = {3, 3};
     int64_t strides_C[2] = {1, 3};
     TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
+    fn_TAPP_create_tensor_info(&info_C, handle, TAPP_F32, nmode_C, extents_C, strides_C);
 
     int nmode_D = 2;
     int64_t extents_D[2] = {3, 3};
-    int64_t strides_D[2] = {1, 4};
+    int64_t strides_D[2] = {1, 3};
     TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    fn_TAPP_create_tensor_info(&info_D, handle, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_handle handle;
     TAPP_tensor_product plan;
     TAPP_element_op op_A = TAPP_IDENTITY;
     TAPP_element_op op_B = TAPP_IDENTITY;
@@ -1090,10 +1322,10 @@ void subtensors()
     int64_t idx_C[2] = {'a', 'd'};
     int64_t idx_D[2] = {'a', 'd'};
     TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
+    fn_TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
 
     TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    fn_TAPP_create_executor(&exec);
     TAPP_status status;
 
     float alpha = 1;
@@ -1187,16 +1419,17 @@ void subtensors()
 
     float *B_ptr = &B[1];
 
-    TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D);
+    fn_TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A_ptr, (void *)B_ptr, (void *)&beta, (void *)C, (void *)D);
 
     int64_t super_extents_D[2] = {4, 3};
     int64_t super_strides_D[2] = {1, 4};
     print_tensor_s(nmode_D, super_extents_D, super_strides_D, D);
 
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_tensor_product(plan);
+    fn_TAPP_destroy_tensor_info(info_A);
+    fn_TAPP_destroy_tensor_info(info_B);
+    fn_TAPP_destroy_tensor_info(info_C);
+    fn_TAPP_destroy_tensor_info(info_D);
+    fn_TAPP_destroy_executor(exec);
+    fn_TAPP_destroy_handle(handle);
 }
\ No newline at end of file
diff --git a/test/exercise.c b/test/exercise.c
deleted file mode 100644
index 31a5baa..0000000
--- a/test/exercise.c
+++ /dev/null
@@ -1,207 +0,0 @@
-#include <tapp.h>
-
-#include "helpers.h"
-#include <complex.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-int main(int argc, char const *argv[])
-{
-    /*
-     * Create the tensor structures for tensor A, B, C and D.
-     * Tensor A 3 dimensional tensor with the extents 4, 3, 2, and the strides 1, 4, 12.
-     * Tensor B 3 dimensional tensor with the extents 3, 2, 4, and the strides 1, 3, 6.
-     * Tensor C 2 dimensional tensor with the extents 3, 3, and the strides 1, 3.
-     * Tensor D 2 dimensional tensor with the extents 3, 3, and the strides 1, 3.
-     */
-
-    // Tensor A
-    // Assign the number of indices
-    /* Remove */ int nmode_A = 3;
-
-    // Assign the extents
-    /* Remove */ int64_t extents_A[3] = {4, 3, 2};
-
-    // Assign the strides
-    /* Remove */ int64_t strides_A[3] = {1, 4, 12};
-
-    // Declare the tensor structure variable
-    /* Remove */ TAPP_tensor_info info_A;
-
-    // Assign the structure to the variable
-    /* Remove */ TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-
-    // Tensor B
-    /* Remove */ int nmode_B = 3;
-    /* Remove */ int64_t extents_B[3] = {3, 2, 4};
-    /* Remove */ int64_t strides_B[3] = {1, 3, 6};
-    /* Remove */ TAPP_tensor_info info_B;
-    /* Remove */ TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-
-    // Tensor C
-    /* Remove */ int nmode_C = 2;
-    /* Remove */ int64_t extents_C[2] = {3, 3};
-    /* Remove */ int64_t strides_C[2] = {1, 3};
-    /* Remove */ TAPP_tensor_info info_C;
-    /* Remove */ TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-
-    // Tensor D
-    /* Remove */ int nmode_D = 2;
-    /* Remove */ int64_t extents_D[2] = {3, 3};
-    /* Remove */ int64_t strides_D[2] = {1, 3};
-    /* Remove */ TAPP_tensor_info info_D;
-    /* Remove */ TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-
-    /*
-     * Assign the options for the calculation.
-     * The precision used will be the default precision.
-     * The elemental operations should be the identity one (doesn't really matter since this exercise doesn't use complex numbers).
-     * The operation that should be executed is:
-     *  Contraction between the first index for tensor A and third index for tensor B.
-     *  Contraction between the third index for tensor A and second index for tensor B.
-     *  The second index for A and the first index for B are free indices, in that order. 
-     */
-
-    // Declare handle (no assignment)
-    /* Remove */ TAPP_handle handle;
-
-    // Initialize the precision
-    /* Remove */ TAPP_prectype prec = TAPP_DEFAULT_PREC; 
-
-    // Initialize the elemental operations for each of the tensors
-    /* Remove */ TAPP_element_op op_A = TAPP_IDENTITY;
-    /* Remove */ TAPP_element_op op_B = TAPP_IDENTITY;
-    /* Remove */ TAPP_element_op op_C = TAPP_IDENTITY;
-    /* Remove */ TAPP_element_op op_D = TAPP_IDENTITY;
-
-    // Create ths indicies arrays for each of the tensor
-    /* Remove */ int64_t idx_A[3] = {'a', 'b', 'c'};
-    /* Remove */ int64_t idx_B[3] = {'d', 'c', 'a'};
-    /* Remove */ int64_t idx_C[2] = {'b', 'd'};
-    /* Remove */ int64_t idx_D[2] = {'b', 'd'};
-
-    // Declare plan
-    /* Remove */ TAPP_tensor_product plan;
-
-    // Create plan/Assign the options to the plan
-    /* Remove */ TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
-
-    // Declare executor
-    /* Remove */ TAPP_executor exec;
-
-    // Create executor
-    TAPP_create_executor(&exec);
-
-    // Declare status object
-    /* Remove */ TAPP_status status;
-
-
-    /*
-     * Assign data for the execution
-     */
-    
-    // Initialize alpha
-    float alpha = 3;
-
-    // Initialize data for tensor A
-    float A[24] = {
-        1, 2, 1.01, -1,
-        1, 2, 1.01, -1,
-        1, 2, 1.01, -1,
-
-        1, 2, 1.01, -1,
-        1, 2, 1.01, -1,
-        1, 2, 1.01, -1};
-
-    // Initialize data for tensor B
-    float B[24] = {
-        1, 1, 1,
-        2, 2, 2,
-
-        3, 3, 3,
-        6, 6, 6,
-
-        1, 1, 1,
-        2, 2, 2,
-
-        3, 3, 3,
-        6, 6, 6};
-
-    // Initialize beta
-    float beta = 2;
-
-    // Initialize data for tensor C
-    float C[9] = {
-        4, 4, 8,
-        4, 8, 8,
-        8, 8, 8};
-
-    // Initialize data for tensor D
-    float D[9] = {
-        2, 3, 4,
-        5, 6, 7,
-        9, 1, 2};
-    
-
-    /*
-     * Run the execution
-     */
-
-    // Call the execution function
-    /* Remove */TAPP_error error = TAPP_execute_product(plan, exec, &status, (void *)&alpha, (void *)A, (void *)B, (void *)&beta, (void *)C, (void *)D);
-
-
-    /*
-     * Print results
-     */
-
-    // Check if the execution was successful
-    bool success = /* Remove */ TAPP_check_success(error);
-    
-    // Print if the execution was successful
-    printf(success ? "Success\n" : "Fail\n");
-
-    // Get the length of the error message
-    /* Remove */ int message_len = TAPP_explain_error(error, 0, NULL);
-
-    // Create a buffer to hold the message + 1 character for null terminator
-    /* Remove */ char* message_buff = malloc((message_len + 1) * sizeof(char));
-
-    // Fetch error message
-    /* Remove */ TAPP_explain_error(error, message_len + 1, message_buff);
-
-    // Print error message
-    printf("%s", message_buff);
-    printf("\n");
-
-    // Print the output
-    print_tensor_s(nmode_D, extents_D, strides_D, D);
-    
-
-    /*
-     * Free data
-     */
-
-    // Free buffer
-    free(message_buff);
-
-    // Destroy structures
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
-
-    /*
-     * Expected output:
-    Success
-    Success.
-        53.090 53.090 61.090 
-        53.090 61.090 61.090 
-        61.090 61.090 61.090 
-     */
-
-    return 0;
-}
diff --git a/test/helpers.h b/test/helpers.h
index 0e6cbc8..eb062e2 100644
--- a/test/helpers.h
+++ b/test/helpers.h
@@ -8,4 +8,4 @@
 #include <stdint.h>
 
 void print_tensor_s(int nmode, const int64_t *extents, const int64_t *strides, const float *data);
-void print_tensor_c(int nmode, const int64_t *extents, const int64_t *strides, const float complex *data);
+void print_tensor_c(int nmode, const int64_t *extents, const int64_t *strides, const float _Complex *data);
diff --git a/test/test.c b/test/test.c
deleted file mode 100644
index d8c0134..0000000
--- a/test/test.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Niklas Hörnblad
- * Paolo Bientinesi
- * Umeå University - June 2024
- */
-
-#include <tapp.h>
-
-#include <stdlib.h>
-#include <stdio.h>
-
-int main(int argc, char const *argv[])
-{
-    int nmode_A = 3;
-    int64_t extents_A[3] = {4, 3, 3};
-    int64_t strides_A[3] = {1, 4, 12};
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-
-    int nmode_B = 4;
-    int64_t extents_B[4] = {3, 2, 2, 3};
-    int64_t strides_B[4] = {1, 3, 6, 12};
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-
-    int nmode_C = 2;
-    int64_t extents_C[2] = {4, 2};
-    int64_t strides_C[2] = {1, 4};
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    
-    int nmode_D = 2;
-    int64_t extents_D[2] = {4, 2};
-    int64_t strides_D[2] = {1, 4};
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_handle handle;
-    TAPP_tensor_product plan;
-    TAPP_element_op op_A = TAPP_IDENTITY;
-    TAPP_element_op op_B = TAPP_IDENTITY;
-    TAPP_element_op op_C = TAPP_IDENTITY;
-    TAPP_element_op op_D = TAPP_IDENTITY;
-    int64_t idx_A[3] = {'a', 'b', 'c'};
-    int64_t idx_B[4] = {'c', 'd', 'e', 'b'};
-    int64_t idx_C[2] = {'a', 'd'};
-    int64_t idx_D[3] = {'a', 'd'};
-    TAPP_prectype prec = TAPP_DEFAULT_PREC;
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, prec);
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-    TAPP_status status;
-
-    float alpha = 1;
-
-    float A[36] = {
-        1,  2,  1.01, -1,
-        1,  2,  1.01, -1,
-        1,  2,  1.01, -1,
-
-        1,  2,  1.01, -1,
-        1,  2,  1.01, -1,
-        1,  2,  1.01, -1,
-
-        1,  2,  1.01, -1,
-        1,  2,  1.01, -1,
-        1,  2,  1.01, -1
-    };
-
-    float B[36] = {
-        1,  1,  1,
-        2,  2,  2,
-
-        3,  3,  3,
-        6,  6,  6,
-
-
-        1,  1,  1,
-        2,  2,  2,
-
-        3,  3,  3,
-        6,  6,  6,
-
-
-        1,  1,  1,
-        2,  2,  2,
-
-        3,  3,  3,
-        6,  6,  6
-    };
-
-    float beta = 0;
-
-    float C[16] = {
-        2,  4,  6,  8,
-        2,  4,  6,  8,
-
-        2,  4,  6,  8,
-        2,  4,  6,  8
-    };
-
-    float D[16] = {
-         1,  2,  3,  4,
-         5,  6,  7,  8,
-        
-         1,  2,  3,  4,
-         5,  6,  7,  8
-    };
-
-    TAPP_error error = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-    printf(TAPP_check_success(error) ? "Success\n" : "Fail\n");
-    int message_len = TAPP_explain_error(error, 0, NULL);
-    char* message_buff = malloc((message_len + 1) * sizeof(char));
-    TAPP_explain_error(error, message_len + 1, message_buff);
-    printf(message_buff);
-    free(message_buff);
-
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
-    TAPP_destroy_executor(exec);
-    return 0;
-}
diff --git a/test/test.cpp b/test/test.cpp
index e28b3d8..31d9e2f 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -6,13 +6,16 @@
 
 #include "test.h"
 
-// TODO replace by #include of <blis.h> when possible
-extern "C" {
-  extern void bli_init();
-  extern void bli_finalize();
+unsigned int current_rand_seed = 1;
+
+#ifdef TAPP_DYNAMIC_LAUNCH
+// TODO: include ATTR_KEY_USE_DEVICE_MEMORY from the cutensor_bindings attributes header instead of hard-coding the key below
+bool use_device_memory = false; // Global variable to control device memory usage in tests; its address is handed to TAPP_attr_set
+inline void set_use_device_memory(struct impl& impl, TAPP_handle handle) { // Applies the current use_device_memory value to `handle` via `impl`
+    impl.TAPP_attr_set(handle, 0, (void*)&use_device_memory); // NOTE(review): key 0 is presumably ATTR_KEY_USE_DEVICE_MEMORY (see TODO above) - confirm
 }
+#endif
 
-unsigned int current_rand_seed = 0;
 auto& rand_engine() {
     static std::mt19937 engine(current_rand_seed);
     return engine;
@@ -20,50 +23,217 @@ auto& rand_engine() {
 
 int main(int argc, char const *argv[])
 {
+    // Usage: <exe> [seed]; with TAPP_DYNAMIC_LAUNCH also <exe> <pathA> <pathB> [seed].
+    int seed_arg = 1; // argv index of the optional random seed
+#ifdef TAPP_DYNAMIC_LAUNCH
+    if (argc >= 3)
+    {
+        pathA = argv[1];
+        pathB = argv[2];
+        seed_arg = 3; // argv[1] and argv[2] are library paths here, so the seed (if given) follows them
+    }
+    struct impl implA;
+    if (load_implementation(&implA, pathA) == -1) return -1;
+    struct impl implB;
+    if (load_implementation(&implB, pathB) != 0) return -1;
+    std::cout << "NOTE: CuTensor does not support negative nor 0 strides" << std::endl;
+#endif
+
-    if (argc >= 2) current_rand_seed = std::atoi(argv[1]); // now ready to generate random numbers
-    bli_init();
+    if (argc > seed_arg) current_rand_seed = std::atoi(argv[seed_arg]); // now ready to generate random numbers
     std::cout << std::boolalpha;
     std::cout << "Starting seed for random numbers = " << current_rand_seed << std::endl;
-    std::cout << "Hadamard Product: " << test_hadamard_product() << std::endl;
-    std::cout << "Contraction: " << test_contraction() << std::endl;
-    std::cout << "Commutativity: " << test_commutativity() << std::endl;
-    std::cout << "Permutations: " << test_permutations() << std::endl;
-    std::cout << "Equal Extents: " << test_equal_extents() << std::endl;
-    std::cout << "Outer Product: " << test_outer_product() << std::endl;
-    std::cout << "Full Contraction: " << test_full_contraction() << std::endl;
+    std::cout << "Hadamard Product: " << test_hadamard_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                               implA, implB
+#endif
+                                                               ) << std::endl;
+    std::cout << "Contraction: " << test_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                     implA, implB
+#endif
+                                                     ) << std::endl;
+    std::cout << "Commutativity: " << test_commutativity(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                         implA, implB
+#endif
+                                                         ) << std::endl;
+    std::cout << "Permutations: " << test_permutations(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                       implA, implB
+#endif
+                                                       ) << std::endl;
+    std::cout << "Equal Extents: " << test_equal_extents(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                        implA, implB
+#endif
+                                                        ) << std::endl;
+    std::cout << "Outer Product: " << test_outer_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                         implA, implB
+#endif
+                                                         ) << std::endl;
+    std::cout << "Full Contraction: " << test_full_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                               implA, implB
+#endif
+                                                               ) << std::endl;
     //for(int i=0;i<0;i++)
-    std::cout << "Zero Dim Tensor Contraction: " << test_zero_dim_tensor_contraction() << std::endl;
-    std::cout << "One Dim Tensor Contraction: " << test_one_dim_tensor_contraction() << std::endl;
-    std::cout << "Subtensor Same Nmode: " << test_subtensor_unchanged_nmode() << std::endl;
-    std::cout << "Subtensor Lower Nmode: " << test_subtensor_lower_nmode() << std::endl;
-    std::cout << "Negative Strides: " << test_negative_strides() << std::endl;
-    std::cout << "Negative Strides Subtensor Same Nmode: " << test_negative_strides_subtensor_unchanged_nmode() << std::endl;
-    std::cout << "Negative Strides Subtensor Lower Nmode: " << test_negative_strides_subtensor_lower_nmode() << std::endl;
-    std::cout << "Mixed Strides: " << test_mixed_strides() << std::endl;
-    std::cout << "Mixed Strides Subtensor Same Nmode: " << test_mixed_strides_subtensor_unchanged_nmode() << std::endl;
-    std::cout << "Mixed Strides Subtensor Lower Nmode: " << test_mixed_strides_subtensor_lower_nmode() << std::endl;
-    std::cout << "Contraction Double Precision: " << test_contraction_double_precision() << std::endl;
-    std::cout << "Contraction Complex: " << test_contraction_complex() << std::endl;
+    std::cout << "Zero Dim Tensor Contraction: " << test_zero_dim_tensor_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                                     implA, implB
+#endif
+                                                                                     ) << std::endl;
+    std::cout << "One Dim Tensor Contraction: " << test_one_dim_tensor_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                                   implA, implB
+#endif
+                                                                                   ) << std::endl;
+    std::cout << "Subtensor Same Index: " << test_subtensor_same_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                     implA, implB
+#endif
+                                                                     ) << std::endl;
+    std::cout << "Subtensor Lower Index: " << test_subtensor_lower_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                       implA, implB
+#endif
+                                                                       ) << std::endl;
+    std::cout << "Negative Strides: " << test_negative_strides(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                               implA, implB
+#endif
+                                                               ) << std::endl; // Cutensor doesn't support negative strides
+    std::cout << "Negative Strides Subtensor Same Index: " << test_negative_strides_subtensor_same_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                                                       implA, implB
+#endif
+                                                                                                       ) << std::endl;
+    std::cout << "Negative Strides Subtensor Lower Index: " << test_negative_strides_subtensor_lower_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                                                         implA, implB
+#endif
+                                                                                                         ) << std::endl;
+    std::cout << "Mixed Strides: " << test_mixed_strides(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                         implA, implB
+#endif
+                                                         ) << std::endl; // Cutensor doesn't support negative strides
+    std::cout << "Mixed Strides Subtensor Same Index: " << test_mixed_strides_subtensor_same_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                         implA, implB
+#endif
+                                                         ) << std::endl;
+    std::cout << "Mixed Strides Subtensor Lower Index: " << test_mixed_strides_subtensor_lower_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                                                   implA, implB
+#endif
+                                                                                                   ) << std::endl;
+    std::cout << "Contraction Double Precision: " << test_contraction_double_precision(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                                       implA, implB
+#endif
+                                                                                       ) << std::endl;
+    std::cout << "Contraction Complex: " << test_contraction_complex(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                     implA, implB
+#endif
+                                                                     ) << std::endl;
     //for(int i=0;i<1;i++)
-    std::cout << "Contraction Complex Double Precision: " << test_contraction_complex_double_precision() << std::endl;
-    std::cout << "Zero stride: " << test_zero_stride() << std::endl;
-    std::cout << "Isolated Indices: " << test_isolated_idx() << std::endl;
-    std::cout << "Repeated Indices: " << test_repeated_idx() << std::endl;
-    std::cout << "Hadamard And Free: " << test_hadamard_and_free() << std::endl;
-    std::cout << "Hadamard And Contraction: " << test_hadamard_and_contraction() << std::endl;
-    std::cout << "Error: Non Matching Extents: " << test_error_non_matching_ext() << std::endl;
+    std::cout << "Contraction Complex Double Precision: " << test_contraction_complex_double_precision(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                                                       implA, implB
+#endif
+                                                                                                       ) << std::endl;
+    std::cout << "Zero stride: " << test_zero_stride(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                     implA, implB
+#endif
+                                                     ) << std::endl; // Cutensor doesn't support zero strides
+    std::cout << "Unique Index: " << test_unique_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                     implA, implB
+#endif
+                                                     ) << std::endl;
+    std::cout << "Repeated Index: " << test_repeated_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                         implA, implB
+#endif
+                                                         ) << std::endl;
+    std::cout << "Hadamard And Free: " << test_hadamard_and_free(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                 implA, implB
+#endif
+                                                                 ) << std::endl;
+    std::cout << "Hadamard And Contraction: " << test_hadamard_and_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                                                               implA, implB
+#endif
+                                                                               ) << std::endl;
+
+#ifndef TAPP_DYNAMIC_LAUNCH
+    std::cout << "Error: Non Matching Extents: " << test_error_non_matching_ext() << std::endl; // TODO: CuTensor bindings should comply with TAPP error handling
     std::cout << "Error: C Other Structure: " << test_error_C_other_structure() << std::endl;
     std::cout << "Error: Aliasing Within D: " << test_error_aliasing_within_D() << std::endl;
-    bli_finalize();
+#else // TAPP_DYNAMIC_LAUNCH: release the two dynamically loaded implementations
+    unload_implementation(&implA);
+    unload_implementation(&implB);
+#endif
+
     return 0;
 }
 
+#ifdef TAPP_DYNAMIC_LAUNCH
+// Opens the shared library at `path` and fills `impl` with the TAPP_* entry points looked up below; returns 0 on success, -1 on failure.
+int load_implementation(struct impl* impl, const char* path) {
+    impl->handle = dlopen(path, RTLD_LAZY);
+    if (!impl->handle) {
+        fprintf(stderr, "dlopen failed: %s\n", dlerror());
+        return -1;
+    }
+    dlerror(); // discard any pending error so the check after the lookups reflects only these dlsym calls
+    *(void**)(&impl->TAPP_attr_set) = dlsym(impl->handle, "TAPP_attr_set");
+    *(void**)(&impl->TAPP_attr_get) = dlsym(impl->handle, "TAPP_attr_get");
+    *(void**)(&impl->TAPP_attr_clear) = dlsym(impl->handle, "TAPP_attr_clear");
+    *(void**)(&impl->TAPP_check_success) = dlsym(impl->handle, "TAPP_check_success");
+    *(void**)(&impl->TAPP_explain_error) = dlsym(impl->handle, "TAPP_explain_error");
+    *(void**)(&impl->TAPP_create_executor) = dlsym(impl->handle, "TAPP_create_executor");
+    *(void**)(&impl->TAPP_destroy_executor) = dlsym(impl->handle, "TAPP_destroy_executor");
+    *(void**)(&impl->TAPP_create_handle) = dlsym(impl->handle, "TAPP_create_handle");
+    *(void**)(&impl->TAPP_destroy_handle) = dlsym(impl->handle, "TAPP_destroy_handle");
+    *(void**)(&impl->TAPP_create_tensor_product) = dlsym(impl->handle, "TAPP_create_tensor_product");
+    *(void**)(&impl->TAPP_destroy_tensor_product) = dlsym(impl->handle, "TAPP_destroy_tensor_product");
+    *(void**)(&impl->TAPP_execute_product) = dlsym(impl->handle, "TAPP_execute_product");
+    *(void**)(&impl->TAPP_execute_batched_product) = dlsym(impl->handle, "TAPP_execute_batched_product");
+    *(void**)(&impl->TAPP_destroy_status) = dlsym(impl->handle, "TAPP_destroy_status");
+    *(void**)(&impl->TAPP_create_tensor_info) = dlsym(impl->handle, "TAPP_create_tensor_info");
+    *(void**)(&impl->TAPP_destroy_tensor_info) = dlsym(impl->handle, "TAPP_destroy_tensor_info");
+    *(void**)(&impl->TAPP_get_nmodes) = dlsym(impl->handle, "TAPP_get_nmodes");
+    *(void**)(&impl->TAPP_set_nmodes) = dlsym(impl->handle, "TAPP_set_nmodes");
+    *(void**)(&impl->TAPP_get_extents) = dlsym(impl->handle, "TAPP_get_extents");
+    *(void**)(&impl->TAPP_set_extents) = dlsym(impl->handle, "TAPP_set_extents");
+    *(void**)(&impl->TAPP_get_strides) = dlsym(impl->handle, "TAPP_get_strides");
+    *(void**)(&impl->TAPP_set_strides) = dlsym(impl->handle, "TAPP_set_strides");
+    if (const char* error = dlerror()) {
+        fprintf(stderr, "dlsym failed: %s\n", error);
+        unload_implementation(impl); // closes the library AND nulls the handle, so a later unload cannot dlclose it twice
+        return -1;
+    }
+    return 0;
+}
+
+// Closes the library referenced by impl->handle, if any, and clears the handle.
+void unload_implementation(struct impl* impl) {
+    if (impl->handle == NULL) return; // nothing to close
+    dlclose(impl->handle);
+    impl->handle = NULL; // a second call on the same struct becomes a no-op
+}
+#else
 template<typename T>
-void run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A,
-                    int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B,
-                    int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C,
-                    int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D,
-                    T alpha, T beta)
+T* run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A,
+                  int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B,
+                  int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C,
+                  int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D,
+                  T alpha, T beta)
 {
     tblis::len_type* tblis_len_A = change_array_type<int64_t, tblis::len_type>(extents_A, nmode_A);
     tblis::stride_type* tblis_stride_A = change_array_type<int64_t, tblis::stride_type>(strides_A, nmode_A);
@@ -118,9 +288,9 @@ void run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, i
         }
     }
 
-    auto [tblis_A_reduced, tblis_idx_A_reduced, tblis_len_A_reduced, tblis_stride_A_reduced, tblis_data_A_reduced] = contract_unique_idx<T>(&tblis_A, tblis_idx_A, nmode_B, tblis_idx_B, nmode_D, tblis_idx_D);
+    auto [tblis_A_reduced, tblis_idx_A_reduced, tblis_len_A_reduced, tblis_stride_A_reduced, tblis_data_A_reduced] = reduce_isolated_indices<T>(&tblis_A, tblis_idx_A, nmode_B, tblis_idx_B, nmode_D, tblis_idx_D);
 
-    auto [tblis_B_reduced, tblis_idx_B_reduced, tblis_len_B_reduced, tblis_stride_B_reduced, tblis_data_B_reduced] = contract_unique_idx<T>(&tblis_B, tblis_idx_B, nmode_A, tblis_idx_A, nmode_D, tblis_idx_D);    
+    auto [tblis_B_reduced, tblis_idx_B_reduced, tblis_len_B_reduced, tblis_stride_B_reduced, tblis_data_B_reduced] = reduce_isolated_indices<T>(&tblis_B, tblis_idx_B, nmode_A, tblis_idx_A, nmode_D, tblis_idx_D);
 
     tblis_tensor_mult(tblis_single, NULL, tblis_A_reduced, tblis_idx_A_reduced, tblis_B_reduced, tblis_idx_B_reduced, &tblis_D, tblis_idx_D);
 
@@ -142,41 +312,49 @@ void run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, i
     delete[] tblis_len_D;
     delete[] tblis_stride_D;
 
-    delete[] tblis_idx_A_reduced;
-    delete[] tblis_len_A_reduced;
-    delete[] tblis_stride_A_reduced;
-    delete[] tblis_data_A_reduced;
-    delete tblis_A_reduced;
+    if (tblis_A_reduced != &tblis_A)
+    {
+        delete[] tblis_idx_A_reduced;
+        delete[] tblis_len_A_reduced;
+        delete[] tblis_stride_A_reduced;
+        delete[] tblis_data_A_reduced;
+        delete tblis_A_reduced;
+    }
 
-    delete[] tblis_idx_B_reduced;
-    delete[] tblis_len_B_reduced;
-    delete[] tblis_stride_B_reduced;
-    delete[] tblis_data_B_reduced;
-    delete tblis_B_reduced;
+    if (tblis_B_reduced != &tblis_B)
+    {
+        delete[] tblis_idx_B_reduced;
+        delete[] tblis_len_B_reduced;
+        delete[] tblis_stride_B_reduced;
+        delete[] tblis_data_B_reduced;
+        delete tblis_B_reduced;
+    }
+
+    return D;
 }
 
 template<typename T>
-std::tuple<tblis::tblis_tensor*, tblis::label_type*, tblis::len_type*, tblis::stride_type*, T*> contract_unique_idx(tblis::tblis_tensor* tensor, tblis::label_type* idx, int nmode_1, tblis::label_type* idx_1, int nmode_2, tblis::label_type* idx_2)
-{
-    int nmode_reduced = 0;
-    int64_t size_reduced = 1;
-    tblis::tblis_tensor* tblis_reduced = new tblis::tblis_tensor;
-    tblis::len_type* len_reduced = new tblis::len_type[tensor->ndim];
-    tblis::stride_type* stride_reduced = new tblis::stride_type[tensor->ndim];
-    tblis::label_type* idx_reduced = new tblis::label_type[tensor->ndim+1];
+std::tuple<tblis::tblis_tensor*, tblis::label_type*, tblis::len_type*, tblis::stride_type*, T*> reduce_isolated_indices(tblis::tblis_tensor* tensor, tblis::label_type* idx, int nmode_X, tblis::label_type* idx_X, int nmode_Y, tblis::label_type* idx_Y)
+{
+    int new_nmode = 0;
+    int64_t new_size = 1;
+    tblis::tblis_tensor* new_tensor = new tblis::tblis_tensor;
+    tblis::len_type* new_len = new tblis::len_type[tensor->ndim];
+    tblis::stride_type* new_stride = new tblis::stride_type[tensor->ndim];
+    tblis::label_type* new_idx = new tblis::label_type[tensor->ndim+1];
     for (size_t i = 0; i < tensor->ndim; i++)
     {
         bool found = false;
-        for (size_t j = 0; j < nmode_1; j++)
+        for (size_t j = 0; j < nmode_X; j++)
         {
-            if (idx[i] == idx_1[j]) 
+            if (idx[i] == idx_X[j]) 
             {
                 found = true;
             }
         }
-        for (size_t j = 0; j < nmode_2; j++)
+        for (size_t j = 0; j < nmode_Y; j++)
         {
-            if (idx[i] == idx_2[j]) 
+            if (idx[i] == idx_Y[j]) 
             {
                 found = true;
             }
@@ -184,45 +362,175 @@ std::tuple<tblis::tblis_tensor*, tblis::label_type*, tblis::len_type*, tblis::st
         
         if (found)
         {
-            len_reduced[nmode_reduced] = tensor->len[i];
-            stride_reduced[nmode_reduced] = nmode_reduced == 0 ? 1 : stride_reduced[nmode_reduced - 1] * len_reduced[nmode_reduced - 1];
-            idx_reduced[nmode_reduced] = idx[i];
-            size_reduced *= len_reduced[nmode_reduced];
-            nmode_reduced++;
+            new_len[new_nmode] = tensor->len[i];
+            new_stride[new_nmode] = new_nmode == 0 ? 1 : new_stride[new_nmode - 1] * new_len[new_nmode - 1];
+            new_idx[new_nmode] = idx[i];
+            new_size *= new_len[new_nmode];
+            new_nmode++;
+        }
+    }
+    new_idx[new_nmode] = '\0';
+
+    if (new_nmode == tensor->ndim)
+    {
+        delete new_tensor;
+        delete[] new_len;
+        delete[] new_stride;
+        delete[] new_idx;
+        return {tensor, idx, (tblis::len_type*)NULL, (tblis::stride_type*)NULL, (T*)NULL};
+    }
+    T* new_data = new T[new_size];
+    for (size_t i = 0; i < new_size; i++)
+    {
+        new_data[i] = 0;
+    }
+
+    if constexpr (std::is_same_v<T, float>)
+    {
+        tblis_init_tensor_s(new_tensor, new_nmode, new_len, new_data, new_stride);
+    }
+    else if constexpr (std::is_same_v<T, double>)
+    {
+        tblis_init_tensor_d(new_tensor, new_nmode, new_len, new_data, new_stride);
+    }
+    else if constexpr (is_complex_v<T>) 
+    {
+        using value_type = typename T::value_type;
+        if constexpr (std::is_same_v<value_type, float>)
+        {
+            tblis_init_tensor_c(new_tensor, new_nmode, new_len, new_data, new_stride);
+        }
+        else if constexpr (std::is_same_v<value_type, double>)
+        {
+            tblis_init_tensor_z(new_tensor, new_nmode, new_len, new_data, new_stride);
         }
     }
-    idx_reduced[nmode_reduced] = '\0';
+    tblis_tensor_add(tblis_single, NULL, tensor, idx, new_tensor, new_idx);
+    return {new_tensor, new_idx, new_len, new_stride, new_data};
+}
+#endif
+
+// Executes one tensor product into D and returns the resulting TAPP error code (0 on success).
+// Static build: use_tblis selects run_tblis_mult instead of the TAPP API. Dynamic build: the TAPP functions come from `impl`.
+template<typename T>
+TAPP_error run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                  struct impl impl, bool use_device_memory,
+#else
+                  bool use_tblis,
+#endif
+                  int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A,
+                  int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B,
+                  int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C,
+                  int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D,
+                  T alpha, T beta
+                  )
+{
+#ifndef TAPP_DYNAMIC_LAUNCH
+    if (use_tblis)
+    {
+        run_tblis_mult(nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                       nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                       nmode_C, extents_C, strides_C, C, op_C, idx_D,
+                       nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                       alpha, beta);
+        return 0; // stop here: falling through would let the TAPP path below overwrite the TBLIS result in D
+    }
+    auto fn_create_handle = TAPP_create_handle;
+    auto fn_destroy_handle = TAPP_destroy_handle;
+    auto fn_create_tensor_info = TAPP_create_tensor_info;
+    auto fn_destroy_tensor_info = TAPP_destroy_tensor_info;
+    auto fn_create_tensor_product = TAPP_create_tensor_product;
+    auto fn_destroy_tensor_product = TAPP_destroy_tensor_product;
+    auto fn_create_executor = TAPP_create_executor;
+    auto fn_destroy_executor = TAPP_destroy_executor;
+    auto fn_execute_product = TAPP_execute_product;
+#else
+    auto fn_create_handle = impl.TAPP_create_handle;
+    auto fn_destroy_handle = impl.TAPP_destroy_handle;
+    auto fn_create_tensor_info = impl.TAPP_create_tensor_info;
+    auto fn_destroy_tensor_info = impl.TAPP_destroy_tensor_info;
+    auto fn_create_tensor_product = impl.TAPP_create_tensor_product;
+    auto fn_destroy_tensor_product = impl.TAPP_destroy_tensor_product;
+    auto fn_create_executor = impl.TAPP_create_executor;
+    auto fn_destroy_executor = impl.TAPP_destroy_executor;
+    auto fn_execute_product = impl.TAPP_execute_product;
+#endif
+
+    TAPP_error error_status;
 
-    T* data_reduced = new T[size_reduced];
-    for (size_t i = 0; i < size_reduced; i++)
+    TAPP_handle handle;
+    error_status = fn_create_handle(&handle);
+    if (error_status != 0) goto at_return;
+#ifdef TAPP_DYNAMIC_LAUNCH
+    if (use_device_memory)
     {
-        data_reduced[i] = 0;
+        set_use_device_memory(impl, handle);
     }
+#endif
+    TAPP_datatype datatype;
 
     if constexpr (std::is_same_v<T, float>)
     {
-        tblis_init_tensor_s(tblis_reduced, nmode_reduced, len_reduced, data_reduced, stride_reduced);
+        datatype = TAPP_FLOAT;
     }
     else if constexpr (std::is_same_v<T, double>)
     {
-        tblis_init_tensor_d(tblis_reduced, nmode_reduced, len_reduced, data_reduced, stride_reduced);
+        datatype = TAPP_DOUBLE;
     }
     else if constexpr (is_complex_v<T>) 
     {
         using value_type = typename T::value_type;
         if constexpr (std::is_same_v<value_type, float>)
         {
-            tblis_init_tensor_c(tblis_reduced, nmode_reduced, len_reduced, data_reduced, stride_reduced);
+            datatype = TAPP_SCOMPLEX;
         }
         else if constexpr (std::is_same_v<value_type, double>)
         {
-            tblis_init_tensor_z(tblis_reduced, nmode_reduced, len_reduced, data_reduced, stride_reduced);
+            datatype = TAPP_DCOMPLEX;
         }
     }
-    tblis_tensor_add(tblis_single, NULL, tensor, idx, tblis_reduced, idx_reduced);
-    return {tblis_reduced, idx_reduced, len_reduced, stride_reduced, data_reduced};
+
+    TAPP_tensor_info info_A;
+    error_status = fn_create_tensor_info(&info_A, handle, datatype, nmode_A, extents_A, strides_A);
+    if (error_status != 0) goto at_free_handle;
+    TAPP_tensor_info info_B;
+    error_status = fn_create_tensor_info(&info_B, handle, datatype, nmode_B, extents_B, strides_B);
+    if (error_status != 0) goto at_free_info_A;
+    TAPP_tensor_info info_C;
+    error_status = fn_create_tensor_info(&info_C, handle, datatype, nmode_C, extents_C, strides_C);
+    if (error_status != 0) goto at_free_info_B;
+    TAPP_tensor_info info_D;
+    error_status = fn_create_tensor_info(&info_D, handle, datatype, nmode_D, extents_D, strides_D);
+    if (error_status != 0) goto at_free_info_C;
+
+    TAPP_tensor_product plan;
+    error_status = fn_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, TAPP_DEFAULT_PREC);
+    if (error_status != 0) goto at_free_info_D;
+    TAPP_status status;
+    TAPP_executor exec;
+    error_status = fn_create_executor(&exec);
+    if (error_status != 0) goto at_free_plan;
+
+    error_status = fn_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
+    fn_destroy_executor(exec);
+    at_free_plan: // cleanup ladder: each label releases what was created before the step that jumps to it
+    fn_destroy_tensor_product(plan);
+    at_free_info_D:
+    fn_destroy_tensor_info(info_D);
+    at_free_info_C:
+    fn_destroy_tensor_info(info_C);
+    at_free_info_B:
+    fn_destroy_tensor_info(info_B);
+    at_free_info_A:
+    fn_destroy_tensor_info(info_A);
+    at_free_handle:
+    fn_destroy_handle(handle);
+    at_return:
+    return error_status;
 }
 
+
 template<typename T, typename U>
 U* change_array_type(T* array, int size)
 {
@@ -282,9 +590,9 @@ std::tuple<int, int64_t*, int64_t*, T*, int64_t*,
                                                                                  bool hadamard_indices_enabled, bool hadamard_only,
                                                                                  bool repeated_indices_enabled, bool isolated_indices_enabled)
 {
-    int nmode_C, free_indices_A, free_indices_B, isolated_indices_A, isolated_indices_B, repeated_indices_A, repeated_indices_B;
+    int free_indices_A, free_indices_B, isolated_indices_A, isolated_indices_B, repeated_indices_A, repeated_indices_B;
 
-    std::tie(nmode_A, nmode_B, nmode_C, nmode_D,
+    std::tie(nmode_A, nmode_B, nmode_D,
              contracted_indices, hadamard_indices,
              free_indices_A, free_indices_B,
              isolated_indices_A, isolated_indices_B,
@@ -292,6 +600,7 @@ std::tuple<int, int64_t*, int64_t*, T*, int64_t*,
                                                                                     contracted_indices, hadamard_indices,
                                                                                     hadamard_only, hadamard_indices_enabled,
                                                                                     isolated_indices_enabled, repeated_indices_enabled);
+    int nmode_C = nmode_D;
 
     int64_t total_unique_indices = contracted_indices + hadamard_indices +
                                    free_indices_A + free_indices_B +
@@ -300,84 +609,80 @@ std::tuple<int, int64_t*, int64_t*, T*, int64_t*,
 
     int* unique_indices = generate_unique_indices(total_unique_indices);
 
-    auto [idx_A, idx_B, idx_C, idx_D] = assign_indices(unique_indices,
-                                                       contracted_indices, hadamard_indices,
-                                                       free_indices_A, free_indices_B,
-                                                       isolated_indices_A, isolated_indices_B,
-                                                       repeated_indices_A, repeated_indices_B);
+    auto [idx_A, idx_B, idx_D] = assign_indices(unique_indices,
+                                                contracted_indices, hadamard_indices,
+                                                free_indices_A, free_indices_B,
+                                                isolated_indices_A, isolated_indices_B,
+                                                repeated_indices_A, repeated_indices_B);
+    int64_t* idx_C = new int64_t[nmode_C];
+    std::copy(idx_D, idx_D + nmode_D, idx_C);
 
     std::unordered_map<int, int64_t> index_extent_map = generate_index_extent_map(min_extent, 4, equal_extents_only, total_unique_indices, unique_indices);
 
-    auto [extents_A, extents_B, extents_C, extents_D] = assign_extents(index_extent_map, nmode_A, idx_A, nmode_B, idx_B, nmode_D, idx_D);
+    auto [extents_A, extents_B, extents_D] = assign_extents(index_extent_map, nmode_A, idx_A, nmode_B, idx_B, nmode_D, idx_D);
+    int64_t* extents_C = new int64_t[nmode_C];
+    std::copy(extents_D, extents_D + nmode_D, extents_C);
 
     int outer_nmode_A = subtensor_on_nmode ? nmode_A + rand(1, 4) : nmode_A;
     int outer_nmode_B = subtensor_on_nmode ? nmode_B + rand(1, 4) : nmode_B;
-    int outer_nmode_C = subtensor_on_nmode ? nmode_C + rand(1, 4) : nmode_C;
     int outer_nmode_D = subtensor_on_nmode ? nmode_D + rand(1, 4) : nmode_D;
 
     int* stride_signs_A = choose_stride_signs(nmode_A, negative_strides_enabled, mixed_strides_enabled);
     int* stride_signs_B = choose_stride_signs(nmode_B, negative_strides_enabled, mixed_strides_enabled);
-    int* stride_signs_C = choose_stride_signs(nmode_C, negative_strides_enabled, mixed_strides_enabled);
     int* stride_signs_D = choose_stride_signs(nmode_D, negative_strides_enabled, mixed_strides_enabled);
 
     bool* subtensor_dims_A = choose_subtensor_dims(nmode_A, outer_nmode_A);
     bool* subtensor_dims_B = choose_subtensor_dims(nmode_B, outer_nmode_B);
-    bool* subtensor_dims_C = choose_subtensor_dims(nmode_C, outer_nmode_C);
     bool* subtensor_dims_D = choose_subtensor_dims(nmode_D, outer_nmode_D);
 
     int64_t* outer_extents_A = calculate_outer_extents(outer_nmode_A, extents_A, subtensor_dims_A, subtensor_on_extents);
     int64_t* outer_extents_B = calculate_outer_extents(outer_nmode_B, extents_B, subtensor_dims_B, subtensor_on_extents);
-    int64_t* outer_extents_C = calculate_outer_extents(outer_nmode_C, extents_C, subtensor_dims_C, subtensor_on_extents);
     int64_t* outer_extents_D = calculate_outer_extents(outer_nmode_D, extents_D, subtensor_dims_D, subtensor_on_extents);
 
     int64_t* offsets_A = calculate_offsets(nmode_A, outer_nmode_A, extents_A, outer_extents_A, subtensor_dims_A, subtensor_on_extents);
     int64_t* offsets_B = calculate_offsets(nmode_B, outer_nmode_B, extents_B, outer_extents_B, subtensor_dims_B, subtensor_on_extents);
-    int64_t* offsets_C = calculate_offsets(nmode_C, outer_nmode_C, extents_C, outer_extents_C, subtensor_dims_C, subtensor_on_extents);
     int64_t* offsets_D = calculate_offsets(nmode_D, outer_nmode_D, extents_D, outer_extents_D, subtensor_dims_D, subtensor_on_extents);
 
     int64_t* strides_A = calculate_strides(nmode_A, outer_nmode_A, outer_extents_A, stride_signs_A, subtensor_dims_A);
     int64_t* strides_B = calculate_strides(nmode_B, outer_nmode_B, outer_extents_B, stride_signs_B, subtensor_dims_B);
-    int64_t* strides_C = calculate_strides(nmode_C, outer_nmode_C, outer_extents_C, stride_signs_C, subtensor_dims_C);
     int64_t* strides_D = calculate_strides(nmode_D, outer_nmode_D, outer_extents_D, stride_signs_D, subtensor_dims_D);
+    int64_t* strides_C = new int64_t[nmode_C];
+    std::copy(strides_D, strides_D + nmode_D, strides_C);
     
     int64_t size_A = calculate_size(outer_nmode_A, outer_extents_A);
     int64_t size_B = calculate_size(outer_nmode_B, outer_extents_B);
-    int64_t size_C = calculate_size(outer_nmode_C, outer_extents_C);
     int64_t size_D = calculate_size(outer_nmode_D, outer_extents_D);
+    int64_t size_C = size_D;
 
-    T* data_A = create_tensor_data<T>(size_A);
-    T* data_B = create_tensor_data<T>(size_B);
-    T* data_C = create_tensor_data<T>(size_C);
-    T* data_D = create_tensor_data<T>(size_D);
+    T* data_A = create_tensor_data<T>(size_A, -10, 10);
+    T* data_B = create_tensor_data<T>(size_B, -10, 10);
+    T* data_C = create_tensor_data<T>(size_C, -10, 10);
+    T* data_D = create_tensor_data<T>(size_D, -10, 10);
 
     T* A = calculate_tensor_pointer<T>(data_A, nmode_A, extents_A, offsets_A, strides_A);
     T* B = calculate_tensor_pointer<T>(data_B, nmode_B, extents_B, offsets_B, strides_B);
-    T* C = calculate_tensor_pointer<T>(data_C, nmode_C, extents_C, offsets_C, strides_C);
+    T* C = calculate_tensor_pointer<T>(data_C, nmode_C, extents_C, offsets_D, strides_C);
     T* D = calculate_tensor_pointer<T>(data_D, nmode_D, extents_D, offsets_D, strides_D);
 
-    T alpha = rand<T>();
-    T beta = rand<T>();
+    T alpha = rand<T>(-10, 10);
+    T beta = rand<T>(-10, 10);
 
     delete[] unique_indices;
 
     delete[] subtensor_dims_A;
     delete[] subtensor_dims_B;
-    delete[] subtensor_dims_C;
     delete[] subtensor_dims_D;
 
     delete[] outer_extents_A;
     delete[] outer_extents_B;
-    delete[] outer_extents_C;
     delete[] outer_extents_D;
 
     delete[] stride_signs_A;
     delete[] stride_signs_B;
-    delete[] stride_signs_C;
     delete[] stride_signs_D;
 
     delete[] offsets_A;
     delete[] offsets_B;
-    delete[] offsets_C;
     delete[] offsets_D;
     
     return {nmode_A, extents_A, strides_A, A, idx_A,
@@ -391,7 +696,7 @@ std::tuple<int, int64_t*, int64_t*, T*, int64_t*,
 
 // nmode_A, nmode_B, nmode_C, nmode_D, contracted_modes, hadamard_modes, free_indices_A, free_indices_B, isolated_indices_A, isolated_indices_B, repeated_indices_A, repeated_indices_B
 // OBS: If something is enabled at least one of those instances will be generated
-std::tuple<int, int, int, int,
+std::tuple<int, int, int,
            int, int, int, int,
            int, int, int, int> generate_index_configuration(int nmode_A, int nmode_B, int nmode_D,
                                                             int contracted_indices, int hadamard_indices,
@@ -741,7 +1046,7 @@ std::tuple<int, int, int, int,
         }
     }
 
-    return {nmode_A, nmode_B, nmode_D, nmode_D, contracted_indices, hadamard_indices, free_indices_A, free_indices_B, isolated_indices_A, isolated_indices_B, repeated_indices_A, repeated_indices_B};
+    return {nmode_A, nmode_B, nmode_D, contracted_indices, hadamard_indices, free_indices_A, free_indices_B, isolated_indices_A, isolated_indices_B, repeated_indices_A, repeated_indices_B};
 }
 
 int* generate_unique_indices(int64_t total_unique_indices)
@@ -755,16 +1060,15 @@ int* generate_unique_indices(int64_t total_unique_indices)
     return unique_indices;
 }
 
-std::tuple<int64_t*, int64_t*, int64_t*, int64_t*> assign_indices(int* unique_indices,
-                                                                  int contracted_indices, int hadamard_indices,
-                                                                  int free_indices_A, int free_indices_B,
-                                                                  int isolated_indices_A, int isolated_indices_B,
-                                                                  int repeated_indices_A, int repeated_indices_B)
+std::tuple<int64_t*, int64_t*, int64_t*> assign_indices(int* unique_indices,
+                                                        int contracted_indices, int hadamard_indices,
+                                                        int free_indices_A, int free_indices_B,
+                                                        int isolated_indices_A, int isolated_indices_B,
+                                                        int repeated_indices_A, int repeated_indices_B)
 {
     // Create index arrays
     int64_t* idx_A = new int64_t[repeated_indices_A + isolated_indices_A + free_indices_A + hadamard_indices + contracted_indices];
     int64_t* idx_B = new int64_t[repeated_indices_B + isolated_indices_B + free_indices_B + hadamard_indices + contracted_indices];
-    int64_t* idx_C = new int64_t[free_indices_A + hadamard_indices + free_indices_B];
     int64_t* idx_D = new int64_t[free_indices_A + hadamard_indices + free_indices_B];
 
     /*
@@ -792,10 +1096,6 @@ std::tuple<int64_t*, int64_t*, int64_t*, int64_t*> assign_indices(int* unique_in
 
     std::shuffle(idx_D, idx_D + (free_indices_A + hadamard_indices + free_indices_B), rand_engine()); // Shuffle indices for D
 
-    std::copy(idx_D,
-              idx_D + free_indices_A + hadamard_indices + free_indices_B,
-              idx_C); // C has the same indices as D
-
     for (int i = 0; i < repeated_indices_A; i++) // Add repeated indices to A
     {
         idx_A[i + isolated_indices_A + free_indices_A + hadamard_indices + contracted_indices] = idx_A[rand(0, isolated_indices_A + free_indices_A + hadamard_indices + contracted_indices - 1)];
@@ -810,7 +1110,7 @@ std::tuple<int64_t*, int64_t*, int64_t*, int64_t*> assign_indices(int* unique_in
 
     std::shuffle(idx_B, idx_B + repeated_indices_B + isolated_indices_B + free_indices_B + hadamard_indices + contracted_indices, rand_engine()); // Shuffle final indices for B
     
-    return {idx_A, idx_B, idx_C, idx_D};
+    return {idx_A, idx_B, idx_D};
 }
 
 std::unordered_map<int, int64_t> generate_index_extent_map(int64_t min_extent, int64_t max_extent,
@@ -827,7 +1127,7 @@ std::unordered_map<int, int64_t> generate_index_extent_map(int64_t min_extent, i
     return index_to_extent;
 }
 
-std::tuple<int64_t*, int64_t*, int64_t*, int64_t*> assign_extents(std::unordered_map<int, int64_t> index_extent_map,
+std::tuple<int64_t*, int64_t*, int64_t*> assign_extents(std::unordered_map<int, int64_t> index_extent_map,
                                                                   int nmode_A, int64_t* idx_A,
                                                                   int nmode_B, int64_t* idx_B,
                                                                   int nmode_D, int64_t* idx_D)
@@ -835,7 +1135,6 @@ std::tuple<int64_t*, int64_t*, int64_t*, int64_t*> assign_extents(std::unordered
     // Create extent arrays
     int64_t* extents_A = new int64_t[nmode_A];
     int64_t* extents_B = new int64_t[nmode_B];
-    int64_t* extents_C = new int64_t[nmode_D];
     int64_t* extents_D = new int64_t[nmode_D];
 
     // Map extents to tensors based on their indices
@@ -852,9 +1151,7 @@ std::tuple<int64_t*, int64_t*, int64_t*, int64_t*> assign_extents(std::unordered
         extents_D[i] = index_extent_map[idx_D[i]]; // Assign extents to D
     }
 
-    std::copy(extents_D, extents_D + nmode_D, extents_C);
-
-    return {extents_A, extents_B, extents_C, extents_D};
+    return {extents_A, extents_B, extents_D};
 }
 
 int* choose_stride_signs(int nmode, bool negative_strides_enabled, bool mixed_strides_enabled)
@@ -1089,11 +1386,11 @@ T rand()
 {
     if constexpr (is_complex_v<T>) {
         using value_type = typename T::value_type;
-        return rand<T>(-std::numeric_limits<value_type>::max(), std::numeric_limits<value_type>::max());
+        return rand<T>(-std::numeric_limits<value_type>::min(), std::numeric_limits<value_type>::max());
     }
     else
     {
-        return rand<T>(-std::numeric_limits<T>::max(), std::numeric_limits<T>::max());
+        return rand<T>(-std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
     }
 }
 
@@ -1282,7 +1579,11 @@ void add_idx(int* nmode, int64_t** idx, int64_t** extents, int64_t** strides, in
     *strides = strides_tmp;
 }
 
-bool test_hadamard_product()
+bool test_hadamard_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                           struct impl implA, struct impl implB
+#endif
+                           )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1294,46 +1595,41 @@ bool test_hadamard_product()
 
     auto [E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    int op_A = 0;
-    int op_B = 0;
-    int op_C = 0;
-    int op_D = 0;
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, op_A, idx_A,
-                   nmode_B, extents_B, strides_B, B, op_B, idx_B,
-                   nmode_C, extents_C, strides_C, C, op_C, idx_D,
-                   nmode_D, extents_D, strides_D, E, op_D, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(D, E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] strides_A;
     delete[] extents_B;
@@ -1355,7 +1651,11 @@ bool test_hadamard_product()
     return result;
 }
 
-bool test_contraction()
+bool test_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                      struct impl implA, struct impl implB // Only declared when TAPP_DYNAMIC_LAUNCH is defined; implA is passed to the first run_product call in the body and implB to the second.
+#endif
+                      )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1367,41 +1667,41 @@ bool test_contraction()
 
     auto [E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY; // op_A..op_D: operation codes passed alongside each tensor to run_product; all four are TAPP_IDENTITY.
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product( // Run 1: flag false, D in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false, // TAPP_DYNAMIC_LAUNCH builds pass implA ahead of the flag.
+#else
+                false, // NOTE(review): the flag's meaning is defined by run_product (not visible here); presumably false = implementation under test, true = reference - confirm.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 2: same A, B, C operands and scalars as run 1, but flag true and E in the fourth tensor slot; the D and E data are compared below.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true, // TAPP_DYNAMIC_LAUNCH builds pass implB ahead of the flag.
+#else
+                true, // Other builds pass only the flag.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -1423,7 +1723,11 @@ bool test_contraction()
     return result;
 }
 
-bool test_commutativity()
+bool test_commutativity(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                        struct impl implA, struct impl implB // Only declared when TAPP_DYNAMIC_LAUNCH is defined; implA is passed to the first two run_product calls in the body and implB to the last two.
+#endif
+                        )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1439,52 +1743,69 @@ bool test_commutativity()
 
     auto [G, data_G] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_tensor_product planAB;
-    TAPP_create_tensor_product(&planAB, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_tensor_product planBA;
-    TAPP_create_tensor_product(&planBA, handle, 0, info_B, idx_B, 0, info_A, idx_A, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(planAB, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
-
-    TAPP_execute_product(planBA, exec, &status, (void*)&alpha, (void*)B, (void*)A, (void*)&beta, (void*)C, (void*)F);
-
-    run_tblis_mult(nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, G, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY; // op_A..op_D: operation codes passed alongside each tensor to run_product; all four are TAPP_IDENTITY.
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product( // Run 1: operands in A, B order, flag false, D in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false, // TAPP_DYNAMIC_LAUNCH builds pass implA ahead of the flag.
+#else
+                false, // NOTE(review): the flag's meaning is defined by run_product (not visible here); presumably false = implementation under test, true = reference - confirm.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 2: same flag as run 1 but with the first two operands swapped (B, then A); E in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 3: operands in A, B order again, flag true, F in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true, // TAPP_DYNAMIC_LAUNCH builds pass implB ahead of the flag.
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, F, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 4: flag true with operands swapped (B, then A); G in the fourth tensor slot. The check below compares D with E, F with G, and D with F.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, G, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D) && compare_tensors(data_F, data_G, size_D) && compare_tensors(data_D, data_F, size_D);
-    
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(planAB);
-    TAPP_destroy_tensor_product(planBA);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
+
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -1508,7 +1829,11 @@ bool test_commutativity()
     return result;
 }
 
-bool test_permutations()
+bool test_permutations(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                       struct impl implA, struct impl implB // Only declared when TAPP_DYNAMIC_LAUNCH is defined; implA is passed to the first run_product call in the loop body and implB to the second.
+#endif
+                       )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1519,49 +1844,50 @@ bool test_permutations()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, rand(2, 4));
           
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
     
     bool result = true;
 
     for (int i = 0; i < nmode_D; i++)
     {
-        TAPP_tensor_info info_C;
-        TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-        TAPP_tensor_info info_D;
-        TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-        TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-        TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-        run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                    nmode_B, extents_B, strides_B, B, 0, idx_B,
-                    nmode_C, extents_C, strides_C, C, 0, idx_D,
-                    nmode_D, extents_D, strides_D, E, 0, idx_D,
-                    alpha, beta);
+        int op_A = TAPP_IDENTITY; // op_A..op_D: operation codes passed alongside each tensor to run_product; declared afresh on every loop iteration, always TAPP_IDENTITY.
+        int op_B = TAPP_IDENTITY;
+        int op_C = TAPP_IDENTITY;
+        int op_D = TAPP_IDENTITY;
+
+        run_product( // Run 1 of this iteration: flag false, D in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                    implA, false, // TAPP_DYNAMIC_LAUNCH builds pass implA ahead of the flag.
+#else
+                    false, // NOTE(review): the flag's meaning is defined by run_product (not visible here); presumably false = implementation under test, true = reference - confirm.
+#endif
+                    nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                    nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                    nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                    nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                    alpha,
+                    beta
+        );
+
+        run_product( // Run 2 of this iteration: same A, B, C operands and scalars, but flag true and E in the fourth tensor slot; the D and E data are compared right after, before the C and D indices are rotated.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                    implB, true, // TAPP_DYNAMIC_LAUNCH builds pass implB ahead of the flag.
+#else
+                    true, // Other builds pass only the flag.
+#endif
+                    nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                    nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                    nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                    nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                    alpha,
+                    beta
+        );
         
         result = result && compare_tensors(data_D, data_E, size_D);
 
         rotate_indices(idx_C, nmode_C, extents_C, strides_C);
         rotate_indices(idx_D, nmode_D, extents_D, strides_D);
-        TAPP_destroy_tensor_info(info_C);
-        TAPP_destroy_tensor_info(info_D);
-        TAPP_destroy_tensor_product(plan);
     }
-    
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
+
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -1583,7 +1909,11 @@ bool test_permutations()
     return result;
 }
 
-bool test_equal_extents()
+bool test_equal_extents(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                        struct impl implA, struct impl implB // Only declared when TAPP_DYNAMIC_LAUNCH is defined; implA is passed to the first run_product call in the body and implB to the second.
+#endif
+                        )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1595,41 +1925,41 @@ bool test_equal_extents()
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY; // op_A..op_D: operation codes passed alongside each tensor to run_product; all four are TAPP_IDENTITY.
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product( // Run 1: flag false, D in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false, // TAPP_DYNAMIC_LAUNCH builds pass implA ahead of the flag.
+#else
+                false, // NOTE(review): the flag's meaning is defined by run_product (not visible here); presumably false = implementation under test, true = reference - confirm.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 2: same A, B, C operands and scalars as run 1, but flag true and E in the fourth tensor slot; the D and E data are compared below.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true, // TAPP_DYNAMIC_LAUNCH builds pass implB ahead of the flag.
+#else
+                true, // Other builds pass only the flag.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -1651,7 +1981,11 @@ bool test_equal_extents()
     return result;
 }
 
-bool test_outer_product()
+bool test_outer_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                        struct impl implA, struct impl implB // Only declared when TAPP_DYNAMIC_LAUNCH is defined; implA is passed to the first run_product call in the body and implB to the second.
+#endif
+                        )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1662,42 +1996,41 @@ bool test_outer_product()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, 0);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-    
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY; // op_A..op_D: operation codes passed alongside each tensor to run_product; all four are TAPP_IDENTITY.
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product( // Run 1: flag false, D in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false, // TAPP_DYNAMIC_LAUNCH builds pass implA ahead of the flag.
+#else
+                false, // NOTE(review): the flag's meaning is defined by run_product (not visible here); presumably false = implementation under test, true = reference - confirm.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 2: same A, B, C operands and scalars as run 1, but flag true and E in the fourth tensor slot; the D and E data are compared below.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true, // TAPP_DYNAMIC_LAUNCH builds pass implB ahead of the flag.
+#else
+                true, // Other builds pass only the flag.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -1719,7 +2052,11 @@ bool test_outer_product()
     return result;
 }
 
-bool test_full_contraction()
+bool test_full_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                           struct impl implA, struct impl implB // Only declared when TAPP_DYNAMIC_LAUNCH is defined; implA is passed to the first run_product call in the body and implB to the second.
+#endif
+                           )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1730,42 +2067,42 @@ bool test_full_contraction()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, 0);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY; // op_A..op_D: operation codes passed alongside each tensor to run_product; all four are TAPP_IDENTITY.
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product( // Run 1: flag false, D in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false, // TAPP_DYNAMIC_LAUNCH builds pass implA ahead of the flag.
+#else
+                false, // NOTE(review): the flag's meaning is defined by run_product (not visible here); presumably false = implementation under test, true = reference - confirm.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 2: same A, B, C operands and scalars as run 1, but flag true and E in the fourth tensor slot; the D and E data are compared below.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true, // TAPP_DYNAMIC_LAUNCH builds pass implB ahead of the flag.
+#else
+                true, // Other builds pass only the flag.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -1787,7 +2124,11 @@ bool test_full_contraction()
     return result;
 }
 
-bool test_zero_dim_tensor_contraction()
+bool test_zero_dim_tensor_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                      struct impl implA, struct impl implB // Only declared when TAPP_DYNAMIC_LAUNCH is defined; implA is passed to the first run_product call in the body and implB to the second.
+#endif
+                                      )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1798,42 +2139,42 @@ bool test_zero_dim_tensor_contraction()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(0);//2,2,0,2);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY; // op_A..op_D: operation codes passed alongside each tensor to run_product; all four are TAPP_IDENTITY.
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product( // Run 1: flag false, D in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false, // TAPP_DYNAMIC_LAUNCH builds pass implA ahead of the flag.
+#else
+                false, // NOTE(review): the flag's meaning is defined by run_product (not visible here); presumably false = implementation under test, true = reference - confirm.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 2: same A, B, C operands and scalars as run 1, but flag true and E in the fourth tensor slot; the D and E data are compared below.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true, // TAPP_DYNAMIC_LAUNCH builds pass implB ahead of the flag.
+#else
+                true, // Other builds pass only the flag.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -1855,7 +2196,11 @@ bool test_zero_dim_tensor_contraction()
     return result;
 }
 
-bool test_one_dim_tensor_contraction()
+bool test_one_dim_tensor_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                     struct impl implA, struct impl implB // Only declared when TAPP_DYNAMIC_LAUNCH is defined; implA is passed to the first run_product call in the body and implB to the second.
+#endif
+                                     )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1866,42 +2211,42 @@ bool test_one_dim_tensor_contraction()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(1);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
 
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY; // op_A..op_D: operation codes passed alongside each tensor to run_product; all four are TAPP_IDENTITY.
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product( // Run 1: flag false, D in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false, // TAPP_DYNAMIC_LAUNCH builds pass implA ahead of the flag.
+#else
+                false, // NOTE(review): the flag's meaning is defined by run_product (not visible here); presumably false = implementation under test, true = reference - confirm.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 2: same A, B, C operands and scalars as run 1, but flag true and E in the fourth tensor slot; the D and E data are compared below.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true, // TAPP_DYNAMIC_LAUNCH builds pass implB ahead of the flag.
+#else
+                true, // Other builds pass only the flag.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -1923,7 +2268,11 @@ bool test_one_dim_tensor_contraction()
     return result;
 }
 
-bool test_subtensor_unchanged_nmode()
+bool test_subtensor_same_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                             struct impl implA, struct impl implB // Only declared when TAPP_DYNAMIC_LAUNCH is defined; implA is passed to the first run_product call in the body and implB to the second.
+#endif
+                             )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -1934,42 +2283,42 @@ bool test_subtensor_unchanged_nmode()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, true);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY; // op_A..op_D: operation codes passed alongside each tensor to run_product; all four are TAPP_IDENTITY.
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product( // Run 1: flag false, D in the fourth tensor slot.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false, // TAPP_DYNAMIC_LAUNCH builds pass implA ahead of the flag.
+#else
+                false, // NOTE(review): the flag's meaning is defined by run_product (not visible here); presumably false = implementation under test, true = reference - confirm.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product( // Run 2: same A, B, C operands and scalars as run 1, but flag true and E in the fourth tensor slot; the D and E data are compared below.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true, // TAPP_DYNAMIC_LAUNCH builds pass implB ahead of the flag.
+#else
+                true, // Other builds pass only the flag.
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -1991,7 +2340,11 @@ bool test_subtensor_unchanged_nmode()
     return result;
 }
 
-bool test_subtensor_lower_nmode()
+bool test_subtensor_lower_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                              struct impl implA, struct impl implB
+#endif
+                              )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2002,42 +2355,42 @@ bool test_subtensor_lower_nmode()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, true, true);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
 
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2059,7 +2412,11 @@ bool test_subtensor_lower_nmode()
     return result;
 }
 
-bool test_negative_strides()
+bool test_negative_strides(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                           struct impl implA, struct impl implB
+#endif
+                           )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2070,41 +2427,42 @@ bool test_negative_strides()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, false, false, true);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
 
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);    
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2123,10 +2481,14 @@ bool test_negative_strides()
     delete[] data_D;
     delete[] data_E;
 
-    return true;
+    return result;
 }
 
-bool test_negative_strides_subtensor_unchanged_nmode()
+bool test_negative_strides_subtensor_same_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                              struct impl implA, struct impl implB
+#endif
+                                              )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2137,42 +2499,42 @@ bool test_negative_strides_subtensor_unchanged_nmode()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, true, false, true);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
 
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2194,7 +2556,11 @@ bool test_negative_strides_subtensor_unchanged_nmode()
     return result;
 }
 
-bool test_negative_strides_subtensor_lower_nmode()
+bool test_negative_strides_subtensor_lower_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                               struct impl implA, struct impl implB
+#endif
+                                               )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2205,42 +2571,42 @@ bool test_negative_strides_subtensor_lower_nmode()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, true, true, true);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
 
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2262,7 +2628,11 @@ bool test_negative_strides_subtensor_lower_nmode()
     return result;
 }
 
-bool test_mixed_strides()
+bool test_mixed_strides(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                        struct impl implA, struct impl implB
+#endif
+                        )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2270,44 +2640,45 @@ bool test_mixed_strides()
           nmode_D, extents_D, strides_D, D, idx_D,
           alpha, beta,
           data_A, data_B, data_C, data_D,
-          size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, false, false, false, true);
+          size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, false, false, true, true);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
 
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2326,10 +2697,14 @@ bool test_mixed_strides()
     delete[] data_D;
     delete[] data_E;
 
-    return true;
+    return result;
 }
 
-bool test_mixed_strides_subtensor_unchanged_nmode()
+bool test_mixed_strides_subtensor_same_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                           struct impl implA, struct impl implB
+#endif
+                                           )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2337,45 +2712,45 @@ bool test_mixed_strides_subtensor_unchanged_nmode()
           nmode_D, extents_D, strides_D, D, idx_D,
           alpha, beta,
           data_A, data_B, data_C, data_D,
-          size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, true, false, false, true);
+          size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, true, false, true, true);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
 
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2397,7 +2772,11 @@ bool test_mixed_strides_subtensor_unchanged_nmode()
     return result;
 }
 
-bool test_mixed_strides_subtensor_lower_nmode()
+bool test_mixed_strides_subtensor_lower_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                            struct impl implA, struct impl implB
+#endif
+                                            )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2405,45 +2784,45 @@ bool test_mixed_strides_subtensor_lower_nmode()
           nmode_D, extents_D, strides_D, D, idx_D,
           alpha, beta,
           data_A, data_B, data_C, data_D,
-          size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, true, true, false, true);
+          size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>(-1, -1, -1, -1, -1, 1, false, true, true, true, true);
     
     auto[E, data_E] = copy_tensor_data(size_D, data_D, D);
-    
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
 
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2465,7 +2844,11 @@ bool test_mixed_strides_subtensor_lower_nmode()
     return result;
 }
 
-bool test_contraction_double_precision()
+bool test_contraction_double_precision(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                       struct impl implA, struct impl implB
+#endif
+                                       )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2477,41 +2860,41 @@ bool test_contraction_double_precision()
 
     auto [E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F64, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F64, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F64, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F64, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2533,7 +2916,11 @@ bool test_contraction_double_precision()
     return result;
 }
 
-bool test_contraction_complex()
+bool test_contraction_complex(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                              struct impl implA, struct impl implB
+#endif
+                              )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2545,46 +2932,41 @@ bool test_contraction_complex()
 
     auto [E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_C32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_C32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_C32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_C32, nmode_D, extents_D, strides_D);
-
-    int op_A = rand(0, 1);
-    int op_B = rand(0, 1);
-    int op_C = rand(0, 1);
-    int op_D = rand(0, 1);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, op_A, idx_A,
-                   nmode_B, extents_B, strides_B, B, op_B, idx_B,
-                   nmode_C, extents_C, strides_C, C, op_C, idx_D,
-                   nmode_D, extents_D, strides_D, E, op_D, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2606,7 +2988,11 @@ bool test_contraction_complex()
     return result;
 }
 
-bool test_contraction_complex_double_precision()
+bool test_contraction_complex_double_precision(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                               struct impl implA, struct impl implB
+#endif
+                                               )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2618,47 +3004,41 @@ bool test_contraction_complex_double_precision()
 
     auto [E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_C64, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_C64, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_C64, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_C64, nmode_D, extents_D, strides_D);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
-    int op_A = rand(0, 1);
-    int op_B = rand(0, 1);
-    int op_C = rand(0, 1);
-    int op_D = rand(0, 1);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, op_A, info_A, idx_A, op_B, info_B, idx_B, op_C, info_C, idx_C, op_D, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    int terr = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, op_A, idx_A,
-                     nmode_B, extents_B, strides_B, B, op_B, idx_B,
-                     nmode_C, extents_C, strides_C, C, op_C, idx_D,
-                     nmode_D, extents_D, strides_D, E, op_D, idx_D,
-                     alpha, beta);
-    // std::complex<double> zma = 1.0+1.0e-12;
-    // data_D[0] = data_D[0]*zma;
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2680,7 +3060,11 @@ bool test_contraction_complex_double_precision()
     return result;
 }
 
-bool test_zero_stride()
+bool test_zero_stride(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                      struct impl implA, struct impl implB
+#endif
+                      )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2700,41 +3084,41 @@ bool test_zero_stride()
         strides_B[0] = 0;
     }
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2756,7 +3140,11 @@ bool test_zero_stride()
     return result;
 }
 
-bool test_isolated_idx()
+bool test_unique_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                     struct impl implA, struct impl implB
+#endif
+                     )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2768,41 +3156,41 @@ bool test_isolated_idx()
 
     auto [E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2824,7 +3212,11 @@ bool test_isolated_idx()
     return result;
 }
 
-bool test_repeated_idx()
+bool test_repeated_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                       struct impl implA, struct impl implB
+#endif
+                       )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2836,41 +3228,41 @@ bool test_repeated_idx()
 
     auto [E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2892,7 +3284,11 @@ bool test_repeated_idx()
     return result;
 }
 
-bool test_hadamard_and_free()
+bool test_hadamard_and_free(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                            struct impl implA, struct impl implB
+#endif
+                            )
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
           nmode_B, extents_B, strides_B, B, idx_B,
@@ -2904,41 +3300,41 @@ bool test_hadamard_and_free()
 
     auto [E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)data_A, (void*)data_B, (void*)&beta, (void*)data_C, (void*)data_D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, data_A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, data_B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, data_C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, data_E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -2960,7 +3356,11 @@ bool test_hadamard_and_free()
     return result;
 }
 
-bool test_hadamard_and_contraction()
+bool test_hadamard_and_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                   struct impl implA, struct impl implB
+#endif
+                                   )
 {
     int input_nmode = rand(0, 4);
     auto [nmode_A, extents_A, strides_A, A, idx_A,
@@ -2973,41 +3373,41 @@ bool test_hadamard_and_contraction()
 
     auto [E, data_E] = copy_tensor_data(size_D, data_D, D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)data_A, (void*)data_B, (void*)&beta, (void*)data_C, (void*)data_D);
-
-    run_tblis_mult(nmode_A, extents_A, strides_A, data_A, 0, idx_A,
-                   nmode_B, extents_B, strides_B, data_B, 0, idx_B,
-                   nmode_C, extents_C, strides_C, data_C, 0, idx_D,
-                   nmode_D, extents_D, strides_D, data_E, 0, idx_D,
-                   alpha, beta);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implA, false,
+#else
+                false,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                alpha,
+                beta
+    );
+
+    run_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                implB, true,
+#else
+                true,
+#endif
+                nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                nmode_D, extents_D, strides_D, E, op_D, idx_D,
+                alpha,
+                beta
+    );
 
     bool result = compare_tensors(data_D, data_E, size_D);
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -3029,6 +3429,7 @@ bool test_hadamard_and_contraction()
     return result;
 }
 
+#ifndef TAPP_DYNAMIC_LAUNCH
 bool test_error_too_many_idx_D()
 {
     auto [nmode_A, extents_A, strides_A, A, idx_A,
@@ -3040,21 +3441,21 @@ bool test_error_too_many_idx_D()
           size_A, size_B, size_C, size_D] = generate_pseudorandom_contraction<float>();
 
     int64_t max_idx = 0;
-    for (size_t i = 0; i < nmode_A; i++)
+    for (int i = 0; i < nmode_A; i++)
     {
         if (max_idx < idx_A[i])
         {
             max_idx = idx_A[i];
         }
     }
-    for (size_t i = 0; i < nmode_B; i++)
+    for (int i = 0; i < nmode_B; i++)
     {
         if (max_idx < idx_B[i])
         {
             max_idx = idx_B[i];
         }
     }
-    for (size_t i = 0; i < nmode_D; i++)
+    for (int i = 0; i < nmode_D; i++)
     {
         if (max_idx < idx_D[i])
         {
@@ -3064,33 +3465,20 @@ bool test_error_too_many_idx_D()
 
     add_incorrect_idx(max_idx, &nmode_D, &idx_D, &extents_D, &strides_D);
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
 
-    int error_status = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
+    TAPP_error error_status = run_product(false,
+                                          nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                                          nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                                          nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                                          nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                                          alpha,
+                                          beta
+                                          );
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -3108,7 +3496,7 @@ bool test_error_too_many_idx_D()
     delete[] data_C;
     delete[] data_D;
 
-    return error_status == 7;
+    return error_status == 7; // TODO: also require "error_status_B == 7" once cuTENSOR and the reference implementation return the same error codes
 }
 
 bool test_error_non_matching_ext()
@@ -3155,33 +3543,19 @@ bool test_error_non_matching_ext()
         break;
     }
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
 
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    int error_status = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
+    TAPP_error error_status = run_product(false,
+                                          nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                                          nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                                          nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                                          nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                                          alpha,
+                                          beta
+                                          );
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -3199,7 +3573,7 @@ bool test_error_non_matching_ext()
     delete[] data_C;
     delete[] data_D;
 
-    return error_status == 1 || error_status == 2 || error_status == 3;
+    return (error_status == 1 || error_status == 2 || error_status == 3); // TODO: also require "(error_status_B == 1 || error_status_B == 2 || error_status_B == 3)" once cuTENSOR and the reference implementation return the same error codes
 }
 
 bool test_error_C_other_structure()
@@ -3247,33 +3621,20 @@ bool test_error_C_other_structure()
         break;
     }
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
 
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
+    TAPP_error error_status = run_product(false,
+                                          nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                                          nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                                          nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                                          nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                                          alpha,
+                                          beta
+                                          );
 
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
-
-    int error_status = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
-
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -3291,7 +3652,7 @@ bool test_error_C_other_structure()
     delete[] data_C;
     delete[] data_D;
 
-    return error_status == 5 || error_status == 6 || error_status == 7;
+    return (error_status == 5 || error_status == 6 || error_status == 7); // TODO: also require "(error_status_B == 5 || error_status_B == 6 || error_status_B == 7)" once cuTENSOR and the reference implementation return the same error codes
 }
 
 bool test_error_aliasing_within_D()
@@ -3308,33 +3669,20 @@ bool test_error_aliasing_within_D()
     int signs[2] = {-1, 1};
     strides_D[scewed_index] = random_choice(2, signs) * (strides_D[scewed_index - 1] * extents_D[scewed_index - 1] - rand((int64_t)1, strides_D[scewed_index - 1] * extents_D[scewed_index - 1] - 1));
 
-    TAPP_tensor_info info_A;
-    TAPP_create_tensor_info(&info_A, TAPP_F32, nmode_A, extents_A, strides_A);
-    TAPP_tensor_info info_B;
-    TAPP_create_tensor_info(&info_B, TAPP_F32, nmode_B, extents_B, strides_B);
-    TAPP_tensor_info info_C;
-    TAPP_create_tensor_info(&info_C, TAPP_F32, nmode_C, extents_C, strides_C);
-    TAPP_tensor_info info_D;
-    TAPP_create_tensor_info(&info_D, TAPP_F32, nmode_D, extents_D, strides_D);
-
-    TAPP_tensor_product plan;
-    TAPP_handle handle;
-    TAPP_create_handle(&handle);
-    TAPP_create_tensor_product(&plan, handle, 0, info_A, idx_A, 0, info_B, idx_B, 0, info_C, idx_C, 0, info_D, idx_D, TAPP_DEFAULT_PREC);
-    TAPP_status status;
-
-    TAPP_executor exec;
-    TAPP_create_executor(&exec);
+    int op_A = TAPP_IDENTITY;
+    int op_B = TAPP_IDENTITY;
+    int op_C = TAPP_IDENTITY;
+    int op_D = TAPP_IDENTITY;
 
-    int error_status = TAPP_execute_product(plan, exec, &status, (void*)&alpha, (void*)A, (void*)B, (void*)&beta, (void*)C, (void*)D);
+    TAPP_error error_status = run_product(false,
+                                            nmode_A, extents_A, strides_A, A, op_A, idx_A,
+                                            nmode_B, extents_B, strides_B, B, op_B, idx_B,
+                                            nmode_C, extents_C, strides_C, C, op_C, idx_C,
+                                            nmode_D, extents_D, strides_D, D, op_D, idx_D,
+                                            alpha,
+                                            beta
+                                            );
 
-    TAPP_destroy_executor(exec);
-    TAPP_destroy_handle(handle);
-    TAPP_destroy_tensor_product(plan);
-    TAPP_destroy_tensor_info(info_A);
-    TAPP_destroy_tensor_info(info_B);
-    TAPP_destroy_tensor_info(info_C);
-    TAPP_destroy_tensor_info(info_D);
     delete[] extents_A;
     delete[] extents_B;
     delete[] extents_C;
@@ -3352,5 +3700,6 @@ bool test_error_aliasing_within_D()
     delete[] data_C;
     delete[] data_D;
 
-    return error_status == 8;
+    return error_status == 8; // TODO: also require "error_status_B == 8" once cuTENSOR and the reference implementation return the same error codes
 }
+#endif
\ No newline at end of file
diff --git a/test/test.h b/test/test.h
index bfcc50e..36019b3 100644
--- a/test/test.h
+++ b/test/test.h
@@ -1,8 +1,3 @@
-/*
- * Niklas Hörnblad
- * Paolo Bientinesi
- * Umeå University - November 2024
- */
 #include <iostream>
 #include <random>
 #include <tuple>
@@ -11,14 +6,109 @@
 #include <algorithm>
 #include <unordered_map>
 #include <type_traits>
-#include <cstring>
+#include <dlfcn.h>  // POSIX dynamic loading, TODO: fix for windows
 
+#ifndef TAPP_DYNAMIC_LAUNCH
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #include "tblis.h"
 #pragma GCC diagnostic pop
+#endif
 #include <tapp.h>
 
+#ifdef TAPP_DYNAMIC_LAUNCH
+inline const char* pathA = "./reference_implementation/libtapp-reference.so";  // relative path to the reference implementation's shared library; `inline` (C++17) so that including this header from several translation units yields a single definition
+inline const char* pathB = "./cutensor_bindings/libtapp-cutensor.so";  // relative path to the cuTENSOR bindings' shared library; `inline` for the same reason as pathA
+struct impl  // One TAPP implementation seen through function pointers: each member below mirrors the signature of the TAPP API function of the same name
+{
+    void* handle;  // presumably the dlopen() handle of the loaded shared library — TODO confirm against load_implementation()
+    TAPP_error (*TAPP_attr_set)(TAPP_attr attr, TAPP_key key, void* value);  // --- attribute accessors ---
+    TAPP_error (*TAPP_attr_get)(TAPP_attr attr, TAPP_key key, void** value);
+    TAPP_error (*TAPP_attr_clear)(TAPP_attr attr, TAPP_key key);
+    bool (*TAPP_check_success)(TAPP_error error);  // --- error inspection ---
+    size_t (*TAPP_explain_error)(TAPP_error error, size_t maxlen, char* message);
+    TAPP_error (*TAPP_create_executor)(TAPP_executor* exec);  // --- executor lifetime ---
+    TAPP_error (*TAPP_destroy_executor)(TAPP_executor exec);
+    TAPP_error (*TAPP_create_handle)(TAPP_handle* handle);  // --- library handle lifetime ---
+    TAPP_error (*TAPP_destroy_handle)(TAPP_handle handle);
+    TAPP_error (*TAPP_create_tensor_product)(TAPP_tensor_product* plan,  // --- product plan: one (op, info, idx) triple per tensor A, B, C, D ---
+                                             TAPP_handle handle,
+                                             TAPP_element_op op_A,
+                                             TAPP_tensor_info A,
+                                             const int64_t* idx_A,
+                                             TAPP_element_op op_B,
+                                             TAPP_tensor_info B,
+                                             const int64_t* idx_B,
+                                             TAPP_element_op op_C,
+                                             TAPP_tensor_info C,
+                                             const int64_t* idx_C,
+                                             TAPP_element_op op_D,
+                                             TAPP_tensor_info D,
+                                             const int64_t* idx_D,
+                                             TAPP_prectype prec);
+    TAPP_error (*TAPP_destroy_tensor_product)(TAPP_tensor_product plan);
+    TAPP_error (*TAPP_execute_product)(TAPP_tensor_product plan,  // --- execution: A, B, C are read-only inputs, D is the only writable buffer ---
+                                       TAPP_executor exec,
+                                       TAPP_status* status,
+                                       const void* alpha,
+                                       const void* A,
+                                       const void* B,
+                                       const void* beta,
+                                       const void* C,
+                                             void* D);
+    TAPP_error (*TAPP_execute_batched_product)(TAPP_tensor_product plan,  // batched variant: A, B, C, D are arrays of pointers, presumably num_batches entries each — TODO confirm
+                                               TAPP_executor exec,
+                                               TAPP_status* status,
+                                               int num_batches,
+                                               const void* alpha,
+                                               const void** A,
+                                               const void** B,
+                                               const void* beta,
+                                               const void** C,
+                                                     void** D);
+    TAPP_error (*TAPP_destroy_status)(TAPP_status status);
+    TAPP_error (*TAPP_create_tensor_info)(TAPP_tensor_info* info,  // --- tensor descriptor lifetime ---
+                                          TAPP_handle handle,
+                                          TAPP_datatype type,
+                                          int nmode,
+                                          const int64_t* extents,
+                                          const int64_t* strides);
+    TAPP_error (*TAPP_destroy_tensor_info)(TAPP_tensor_info info);
+    int (*TAPP_get_nmodes)(TAPP_tensor_info info);  // --- tensor descriptor getters/setters ---
+    TAPP_error (*TAPP_set_nmodes)(TAPP_tensor_info info, int nmodes);
+    void (*TAPP_get_extents)(TAPP_tensor_info info, int64_t* extents);  // writes through the caller-provided `extents` pointer
+    TAPP_error (*TAPP_set_extents)(TAPP_tensor_info info, const int64_t* extents);
+    void (*TAPP_get_strides)(TAPP_tensor_info info, int64_t* strides);  // writes through the caller-provided `strides` pointer
+    TAPP_error (*TAPP_set_strides)(TAPP_tensor_info info, const int64_t* strides);
+};
+
+int load_implementation(struct impl* impl, const char* path);
+void unload_implementation(struct impl* impl);
+#else
+template<typename T>
+T* run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A,
+                    int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B,
+                    int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C,
+                    int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D,
+                    T alpha, T beta);
+template<typename T>
+std::tuple<tblis::tblis_tensor*, tblis::label_type*, tblis::len_type*, tblis::stride_type*, T*> reduce_isolated_indices(tblis::tblis_tensor* tensor, tblis::label_type* idx, int nmode_X, tblis::label_type* idx_X, int nmode_Y, tblis::label_type* idx_Y);
+#endif
+
+template<typename T>
+TAPP_error run_product(  // Test helper taking raw extents/strides/data for A, B, C, D; the leading parameters depend on the build mode. Returns a TAPP_error that the error tests compare against expected codes.
+#ifdef TAPP_DYNAMIC_LAUNCH
+                  struct impl impl, bool use_device_memory,  // dynamic-launch build: implementation to call; the tests pass false together with implA and true together with implB
+#else
+                  bool use_tblis,  // static build: the tests pass false for the run written to D and true for the comparison run written to E
+#endif
+                  int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A,  // per tensor: mode count, extents, strides, data, element op, index labels
+                  int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B,
+                  int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C,
+                  int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D,  // the tests compare the buffer passed as D after the call, so it is presumably the output — TODO confirm
+                  T alpha, T beta
+                  );
+
 template<typename T>
 struct is_complex : std::false_type {};
 template<typename T>
@@ -30,14 +120,7 @@ template<typename T>
 T rand(T min, T max);
 template<typename T>
 T rand();
-template<typename T>
-void run_tblis_mult(int nmode_A, int64_t* extents_A, int64_t* strides_A, T* A, int op_A, int64_t* idx_A,
-                    int nmode_B, int64_t* extents_B, int64_t* strides_B, T* B, int op_B, int64_t* idx_B,
-                    int nmode_C, int64_t* extents_C, int64_t* strides_C, T* C, int op_C, int64_t* idx_C,
-                    int nmode_D, int64_t* extents_D, int64_t* strides_D, T* D, int op_D, int64_t* idx_D,
-                    T alpha, T beta);
-template<typename T>
-std::tuple<tblis::tblis_tensor*, tblis::label_type*, tblis::len_type*, tblis::stride_type*, T*> contract_unique_idx(tblis::tblis_tensor* tensor, tblis::label_type* idx, int nmode_1, tblis::label_type* idx_1, int nmode_2, tblis::label_type* idx_2);
+
 template<typename T, typename U>
 U* change_array_type(T* array, int size);
 template<typename T>
@@ -57,25 +140,25 @@ std::tuple<int, int64_t*, int64_t*, T*, int64_t*,
                                                                                  bool negative_strides_enabled = false, bool mixed_strides_enabled = false,
                                                                                  bool hadamard_indices_enabled = false, bool hadamard_only = false,
                                                                                  bool repeated_indices_enabled = false, bool isolated_indices_enabled = false);
-std::tuple<int, int, int, int,
+std::tuple<int, int, int,
            int, int, int, int,
            int, int, int, int> generate_index_configuration(int nmode_A = -1, int nmode_B = -1, int nmode_D = -1,
                                                             int contracted_indices = -1, int hadamard_indices = -1,
                                                             bool hadamard_only = false, bool hadamard_indices_enabled = false,
                                                             bool isolated_indices_enabled = false, bool repeated_indices_enabled = false);
 int* generate_unique_indices(int64_t total_unique_indices);
-std::tuple<int64_t*, int64_t*, int64_t*, int64_t*> assign_indices(int* unique_indices,
-                                                                  int contracted_modes, int hadamard_modes,
-                                                                  int free_indices_A, int free_indices_B,
-                                                                  int isolated_indices_A, int isolated_indices_B,
-                                                                  int repeated_indices_A, int repeated_indices_B);
+std::tuple<int64_t*, int64_t*, int64_t*> assign_indices(int* unique_indices,
+                                                        int contracted_modes, int hadamard_modes,
+                                                        int free_indices_A, int free_indices_B,
+                                                        int isolated_indices_A, int isolated_indices_B,
+                                                        int repeated_indices_A, int repeated_indices_B);
 std::unordered_map<int, int64_t> generate_index_extent_map(int64_t min_extent, int64_t max_extent,
                                                            bool equal_extents_only,
                                                            int64_t total_unique_indices, int* unique_indices);
-std::tuple<int64_t*, int64_t*, int64_t*, int64_t*> assign_extents(std::unordered_map<int, int64_t> index_extent_map,
-                                                                  int nmode_A, int64_t* idx_A,
-                                                                  int nmode_B, int64_t* idx_B,
-                                                                  int nmode_D, int64_t* idx_D);
+std::tuple<int64_t*, int64_t*, int64_t*> assign_extents(std::unordered_map<int, int64_t> index_extent_map,
+                                                        int nmode_A, int64_t* idx_A,
+                                                        int nmode_B, int64_t* idx_B,
+                                                        int nmode_D, int64_t* idx_D);
 int* choose_stride_signs(int nmode, bool negative_str, bool mixed_str);
 bool* choose_subtensor_dims(int nmode, int outer_nmode);
 int64_t* calculate_outer_extents(int outer_nmode, int64_t* extents, bool* subtensor_dims, bool lower_extents);
@@ -106,31 +189,134 @@ void add_incorrect_idx(int64_t max_idx, int* nmode, int64_t** idx, int64_t** ext
 void add_idx(int* nmode, int64_t** idx, int64_t** extents, int64_t** strides, int64_t additional_idx, int64_t additional_extents, int64_t additional_strides);
 
 // Tests
-bool test_hadamard_product();
-bool test_contraction();
-bool test_commutativity();
-bool test_permutations();
-bool test_equal_extents();
-bool test_outer_product();
-bool test_full_contraction();
-bool test_zero_dim_tensor_contraction();
-bool test_one_dim_tensor_contraction();
-bool test_subtensor_unchanged_nmode();
-bool test_subtensor_lower_nmode();
-bool test_negative_strides();
-bool test_negative_strides_subtensor_unchanged_nmode();
-bool test_negative_strides_subtensor_lower_nmode();
-bool test_mixed_strides();
-bool test_mixed_strides_subtensor_unchanged_nmode();
-bool test_mixed_strides_subtensor_lower_nmode();
-bool test_contraction_double_precision();
-bool test_contraction_complex();
-bool test_contraction_complex_double_precision();
-bool test_zero_stride();
-bool test_isolated_idx();
-bool test_repeated_idx();
-bool test_hadamard_and_free();
-bool test_hadamard_and_contraction();
+bool test_hadamard_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                           struct impl implA, struct impl implB
+#endif
+                           );
+bool test_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                      struct impl implA, struct impl implB
+#endif
+                      );
+bool test_commutativity(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                        struct impl implA, struct impl implB
+#endif
+                        );
+bool test_permutations(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                       struct impl implA, struct impl implB
+#endif
+                       );
+bool test_equal_extents(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                        struct impl implA, struct impl implB
+#endif
+                        );
+bool test_outer_product(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                        struct impl implA, struct impl implB
+#endif
+                        );
+bool test_full_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                           struct impl implA, struct impl implB
+#endif
+                           );
+bool test_zero_dim_tensor_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                      struct impl implA, struct impl implB
+#endif
+                                      );
+bool test_one_dim_tensor_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                     struct impl implA, struct impl implB
+#endif
+                                     );
+bool test_subtensor_same_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                             struct impl implA, struct impl implB
+#endif
+                             );
+bool test_subtensor_lower_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                              struct impl implA, struct impl implB
+#endif
+                              );
+bool test_negative_strides(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                           struct impl implA, struct impl implB
+#endif
+                           );
+bool test_negative_strides_subtensor_same_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                              struct impl implA, struct impl implB
+#endif
+                                              );
+bool test_negative_strides_subtensor_lower_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                               struct impl implA, struct impl implB
+#endif
+                                               );
+bool test_mixed_strides(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                        struct impl implA, struct impl implB
+#endif
+                        );
+bool test_mixed_strides_subtensor_same_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                           struct impl implA, struct impl implB
+#endif
+                                           );
+bool test_mixed_strides_subtensor_lower_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                            struct impl implA, struct impl implB
+#endif
+                                            );
+bool test_contraction_double_precision(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                       struct impl implA, struct impl implB
+#endif
+                                       );
+bool test_contraction_complex(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                              struct impl implA, struct impl implB
+#endif
+                              );
+bool test_contraction_complex_double_precision(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                               struct impl implA, struct impl implB
+#endif
+                                               );
+bool test_zero_stride(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                      struct impl implA, struct impl implB
+#endif
+                      );
+bool test_unique_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                     struct impl implA, struct impl implB
+#endif
+                     );
+bool test_repeated_idx(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                       struct impl implA, struct impl implB
+#endif
+                       );
+bool test_hadamard_and_free(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                            struct impl implA, struct impl implB
+#endif
+                            );
+bool test_hadamard_and_contraction(
+#ifdef TAPP_DYNAMIC_LAUNCH
+                                   struct impl implA, struct impl implB
+#endif
+                                   );
+
+#ifndef TAPP_DYNAMIC_LAUNCH // These tests do not make sense for implementations other than the reference
 bool test_error_non_matching_ext();
 bool test_error_C_other_structure();
 bool test_error_aliasing_within_D();
+#endif