From 0eeb7d0eb301b71ce63792eb1fe011e8a4d0209b Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Wed, 1 Jul 2026 17:39:54 +0800
Subject: [PATCH 01/15] Align runtime API with generated wrappers

---
 README.md                            |  10 +-
 scripts/generate_public_headers.py   | 281 +++++++++++++--------------
 src/native/ascend/runtime_.h         |  18 +-
 src/native/cambricon/runtime_.h      |  18 +-
 src/native/cpu/runtime_.h            |  91 +++++++--
 src/native/cuda/hygon/runtime_.h     |  22 ++-
 src/native/cuda/iluvatar/runtime_.h  |  14 +-
 src/native/cuda/metax/runtime_.h     |  16 +-
 src/native/cuda/moore/runtime_.h     |  16 +-
 src/native/cuda/nvidia/runtime_.h    |  14 +-
 src/native/cuda/runtime_.h           |  10 +-
 src/runtime.h                        |  28 +--
 tests/CMakeLists.txt                 |  20 +-
 tests/compile_install_consumer.cmake |  11 ++
 tests/install_consumer_smoke.cc      |  64 +++++-
 tests/test_cpu_runtime.cc            |  18 +-
 tests/test_nvidia_runtime.cc         |   6 +-
 tests/test_runtime_dispatch.cc       |  65 -------
 18 files changed, 416 insertions(+), 306 deletions(-)
 delete mode 100644 tests/test_runtime_dispatch.cc
diff --git a/README.md b/README.md
index b93b7be..d8883e3 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ cmake --install build
 #include <infini/rt.h>
 
 int main() {
-  infini::rt::SetDevice({infini::rt::Device::Type::kCpu, 0});
+  infini::rt::SetDevice(0);
 
   constexpr std::size_t size = 1024;
   void* ptr = nullptr;
@@ -79,12 +79,16 @@ int main() {
 }
 ```
 
-For NVIDIA:
+When a GPU backend is enabled, the top-level runtime API targets that backend,
+matching CUDA Runtime API behavior:
 
 ```cpp
-infini::rt::SetDevice({infini::rt::Device::Type::kNvidia, 0});
+infini::rt::SetDevice(0);
 ```
 
+Use `infini::rt::Runtime<infini::rt::Device::Type::kCpu>` when CPU runtime
+calls are needed explicitly in a build that also enables an accelerator backend.
+
 ## Using Installed InfiniRT From Another Project
 
 Downstream projects should consume the installed headers and libraries instead of depending on the InfiniRT source tree.
diff --git a/scripts/generate_public_headers.py b/scripts/generate_public_headers.py
index 93ceee3..74dde79 100644
--- a/scripts/generate_public_headers.py
+++ b/scripts/generate_public_headers.py
@@ -53,21 +53,23 @@
     "cpu": "Device::Type::kCpu",
     "nvidia": "Device::Type::kNvidia",
     "iluvatar": "Device::Type::kIluvatar",
+    "hygon": "Device::Type::kHygon",
     "metax": "Device::Type::kMetax",
     "moore": "Device::Type::kMoore",
     "cambricon": "Device::Type::kCambricon",
     "ascend": "Device::Type::kAscend",
 }
 
-_RUNTIME_HEADERS = {
-    "cpu": "native/cpu/runtime_.h",
-    "nvidia": "native/cuda/nvidia/runtime_.h",
-    "iluvatar": "native/cuda/iluvatar/runtime_.h",
-    "metax": "native/cuda/metax/runtime_.h",
-    "moore": "native/cuda/moore/runtime_.h",
-    "cambricon": "native/cambricon/runtime_.h",
-    "ascend": "native/ascend/runtime_.h",
-}
+_DEFAULT_DEVICE_PRIORITY = (
+    "nvidia",
+    "iluvatar",
+    "hygon",
+    "metax",
+    "moore",
+    "cambricon",
+    "ascend",
+    "cpu",
+)
 
 
 def _guard(path):
@@ -155,12 +157,16 @@ def _write_detail_headers(include_root, source_root, devices):
 
 
 def _write_generated_header(include_root, devices):
+    default_device = _default_device(devices)
+    default_device_type = _DEVICE_TYPES[default_device]
     includes = [
+        "#include <cstddef>",
         f"#include {_detail_include('data_type.h')}",
         f"#include {_detail_include('device.h')}",
         f"#include {_detail_include('hash.h')}",
         f"#include {_detail_include('runtime.h')}",
         f"#include {_detail_include('tensor_view.h')}",
+        f"#include <infini/rt/{default_device}/runtime_.h>",
     ]
 
     for device in devices:
@@ -174,6 +180,51 @@ def _write_generated_header(include_root, devices):
 
 {chr(10).join(includes)}
 
+namespace infini::rt {{
+namespace generated_detail {{
+
+using DefaultRuntime = Runtime<{default_device_type}>;
+
+}}  // namespace generated_detail
+
+using Error = typename generated_detail::DefaultRuntime::Error;
+
+using Stream = typename generated_detail::DefaultRuntime::Stream;
+
+inline constexpr Error kSuccess = generated_detail::DefaultRuntime::kSuccess;
+
+enum class MemcpyKind {{
+  kMemcpyHostToHost =
+      static_cast<int>(generated_detail::DefaultRuntime::kMemcpyHostToHost),
+  kMemcpyHostToDevice =
+      static_cast<int>(generated_detail::DefaultRuntime::kMemcpyHostToDevice),
+  kMemcpyDeviceToHost =
+      static_cast<int>(generated_detail::DefaultRuntime::kMemcpyDeviceToHost),
+  kMemcpyDeviceToDevice =
+      static_cast<int>(generated_detail::DefaultRuntime::kMemcpyDeviceToDevice),
+}};
+
+Error SetDevice(int device);
+
+Error GetDevice(int* device);
+
+Error GetDeviceCount(int* count);
+
+Error DeviceSynchronize();
+
+Error Malloc(void** ptr, std::size_t size);
+
+Error Free(void* ptr);
+
+Error Memset(void* ptr, int value, std::size_t count);
+
+Error Memcpy(void* dst, const void* src, std::size_t count, MemcpyKind kind);
+
+Error MemcpyAsync(void* dst, const void* src, std::size_t count,
+                  MemcpyKind kind, Stream stream);
+
+}}  // namespace infini::rt
+
 #endif
 """
     )
@@ -198,58 +249,65 @@ def params_decl(self):
         return ", ".join(f"{param.type} {param.name}" for param in self.params)
 
 
-def _parse_param(param):
-    param_type, param_name = param.strip().rsplit(" ", 1)
-
-    return _Param(param_type, param_name)
-
-
-def _parse_runtime_functions(runtime_header):
-    text = pathlib.Path(runtime_header).read_text()
-    return tuple(
-        _Function(
-            return_type,
-            name,
-            tuple(_parse_param(param) for param in params.split(", ") if param),
-        )
-        for return_type, name, params in re.findall(
-            r"^(void) ([A-Z]\w*)\(([^()]*)\);$", text, re.MULTILINE
-        )
-    )
-
-
-def _abort_statement(message):
-    return f"""      assert(false && "{message}");
-      std::abort();"""
-
-
-def _dispatch_cases(devices, statements):
-    return "\n".join(
-        f"""    case {_DEVICE_TYPES[device]}: {{
-{statements.replace("__DEVICE_TYPE__", _DEVICE_TYPES[device])}
-      return;
-    }}"""
-        for device in devices
-    )
+_PUBLIC_RUNTIME_FUNCTIONS = (
+    _Function("Error", "SetDevice", (_Param("int", "device"),)),
+    _Function("Error", "GetDevice", (_Param("int*", "device"),)),
+    _Function("Error", "GetDeviceCount", (_Param("int*", "count"),)),
+    _Function("Error", "DeviceSynchronize", ()),
+    _Function(
+        "Error",
+        "Malloc",
+        (_Param("void**", "ptr"), _Param("std::size_t", "size")),
+    ),
+    _Function("Error", "Free", (_Param("void*", "ptr"),)),
+    _Function(
+        "Error",
+        "Memset",
+        (
+            _Param("void*", "ptr"),
+            _Param("int", "value"),
+            _Param("std::size_t", "count"),
+        ),
+    ),
+    _Function(
+        "Error",
+        "Memcpy",
+        (
+            _Param("void*", "dst"),
+            _Param("const void*", "src"),
+            _Param("std::size_t", "count"),
+            _Param("MemcpyKind", "kind"),
+        ),
+    ),
+    _Function(
+        "Error",
+        "MemcpyAsync",
+        (
+            _Param("void*", "dst"),
+            _Param("const void*", "src"),
+            _Param("std::size_t", "count"),
+            _Param("MemcpyKind", "kind"),
+            _Param("Stream", "stream"),
+        ),
+    ),
+)
 
 
-def _selector(function):
-    for param in function.params:
-        if param.type == "Device":
-            return f"{param.name}.type()"
-        if param.type == "Device::Type":
-            return param.name
+def _default_device(devices):
+    for device in _DEFAULT_DEVICE_PRIORITY:
+        if device in devices:
+            return device
 
-    return "current_device.type()"
+    raise ValueError("at least one device is required")
 
 
 def _runtime_arg(param):
-    if param.type == "Device":
-        return f"{param.name}.index()"
-    if param.type == "Device::Type":
-        return None
     if param.type == "MemcpyKind":
-        return f"RuntimeMemcpyKind<__DEVICE_TYPE__>({param.name})"
+        return f"RuntimeMemcpyKind({param.name})"
+    if param.type == "Stream":
+        return (
+            f"reinterpret_cast<typename DefaultRuntime::Stream>({param.name})"
+        )
 
     return param.name
 
@@ -260,134 +318,67 @@ def _runtime_args(function):
     return ", ".join(arg for arg in args if arg is not None)
 
 
-def _preconditions(function):
-    required_pointer_names = {
-        "GetDevice": {"device"},
-        "GetDeviceCount": {"count"},
-    }
-    checks = []
-    for param in function.params:
-        if param.type.endswith("**") or param.name in required_pointer_names.get(
-            function.name, set()
-        ):
-            checks.append(f"  assert({param.name} != nullptr);")
-
-    return "\n".join(checks)
-
-
-def _post_dispatch(function):
-    if function.name == "SetDevice":
-        return "\n      current_device = device;"
-
-    return ""
-
-
 def _runtime_call(function):
     args = _runtime_args(function)
     if args:
-        return f"Runtime<__DEVICE_TYPE__>::{function.name}({args})"
+        return f"DefaultRuntime::{function.name}({args})"
 
-    return f"Runtime<__DEVICE_TYPE__>::{function.name}()"
+    return f"DefaultRuntime::{function.name}()"
 
 
-def _write_get_device(function, devices):
-    device_param = function.params[0].name
-    cases = _dispatch_cases(
-        devices,
-        f"""      int index = current_device.index();
-      CheckCall([&] {{ return Runtime<__DEVICE_TYPE__>::GetDevice(&index); }});
-      current_device = Device{{current_device.type(), index}};
-      *{device_param} = current_device;""",
-    )
-
-    return f"""void GetDevice(Device* {device_param}) {{
-  assert({device_param} != nullptr);
-
-  switch (current_device.type()) {{
-{cases}
-    default:
-{_abort_statement("runtime device is not enabled")}
-  }}
-}}
-"""
-
-
-def _write_dispatch_function(function, devices):
-    if function.name == "GetDevice":
-        return _write_get_device(function, devices)
-
-    cases = _dispatch_cases(
-        devices,
-        f"""      CheckCall([&] {{ return {_runtime_call(function)}; }});{_post_dispatch(function)}""",
-    )
-    preconditions = _preconditions(function)
-    if preconditions:
-        preconditions = f"{preconditions}\n\n"
-
+def _write_dispatch_function(function):
     return f"""{function.signature()} {{
-{preconditions}  switch ({_selector(function)}) {{
-{cases}
-    default:
-{_abort_statement("runtime device is not enabled")}
-  }}
+  return CheckCall([&] {{ return {_runtime_call(function)}; }});
 }}
 """
 
 
-def _write_runtime_dispatch(source_path, runtime_header, devices):
-    first_device_type = _DEVICE_TYPES[devices[0]]
-    includes = ['#include "runtime.h"']
-    includes.extend(f'#include "{_RUNTIME_HEADERS[device]}"' for device in devices)
-    functions = _parse_runtime_functions(runtime_header)
+def _write_runtime_dispatch(source_path):
+    functions = _PUBLIC_RUNTIME_FUNCTIONS
     dispatch_functions = "\n".join(
-        _write_dispatch_function(function, devices) for function in functions
+        _write_dispatch_function(function) for function in functions
     )
 
     source_path.parent.mkdir(parents=True, exist_ok=True)
     source_path.write_text(
         f"""#include <cassert>
-#include <cstdlib>
+#include <cstddef>
 #include <type_traits>
 #include <utility>
 
-{chr(10).join(includes)}
+#include <infini/rt/generated.h>
 
 namespace infini::rt {{
 namespace {{
 
-thread_local Device current_device{{{first_device_type}, 0}};
+using DefaultRuntime = generated_detail::DefaultRuntime;
 
 template <typename Func>
-void CheckCall(Func&& func) {{
+Error CheckCall(Func&& func) {{
   using ReturnType = decltype(std::forward<Func>(func)());
 
   if constexpr (std::is_void_v<ReturnType>) {{
     std::forward<Func>(func)();
+    return kSuccess;
   }} else {{
-    ReturnType status = std::forward<Func>(func)();
-    if (status != ReturnType{{}}) {{
-      assert(false && "runtime call failed");
-      std::abort();
-    }}
+    return static_cast<Error>(std::forward<Func>(func)());
   }}
 }}
 
-template <Device::Type kDev>
 auto RuntimeMemcpyKind(MemcpyKind kind) {{
   switch (kind) {{
-    case MemcpyKind::kHostToHost:
-      return Runtime<kDev>::MemcpyHostToHost;
-    case MemcpyKind::kHostToDevice:
-      return Runtime<kDev>::MemcpyHostToDevice;
-    case MemcpyKind::kDeviceToHost:
-      return Runtime<kDev>::MemcpyDeviceToHost;
-    case MemcpyKind::kDeviceToDevice:
-      return Runtime<kDev>::MemcpyDeviceToDevice;
+    case MemcpyKind::kMemcpyHostToHost:
+      return DefaultRuntime::kMemcpyHostToHost;
+    case MemcpyKind::kMemcpyHostToDevice:
+      return DefaultRuntime::kMemcpyHostToDevice;
+    case MemcpyKind::kMemcpyDeviceToHost:
+      return DefaultRuntime::kMemcpyDeviceToHost;
+    case MemcpyKind::kMemcpyDeviceToDevice:
+      return DefaultRuntime::kMemcpyDeviceToDevice;
   }}
 
   assert(false && "unsupported memcpy kind");
-  std::abort();
-  return Runtime<kDev>::MemcpyHostToHost;
+  return DefaultRuntime::kMemcpyHostToHost;
 }}
 
 }}  // namespace
@@ -422,9 +413,7 @@ def main():
             _write_wrapper(include_root, wrapper_device, header_name, target)
 
     _write_generated_header(include_root, devices)
-    _write_runtime_dispatch(
-        pathlib.Path(args.source_output), args.runtime_header, devices
-    )
+    _write_runtime_dispatch(pathlib.Path(args.source_output))
 
 
 if __name__ == "__main__":
diff --git a/src/native/ascend/runtime_.h b/src/native/ascend/runtime_.h
index 8b33e54..191beef 100644
--- a/src/native/ascend/runtime_.h
+++ b/src/native/ascend/runtime_.h
@@ -16,10 +16,14 @@ namespace infini::rt {
 template <>
 struct Runtime<Device::Type::kAscend>
     : DeviceRuntime<Runtime<Device::Type::kAscend>> {
+  using Error = aclError;
+
   using Stream = aclrtStream;
 
   static constexpr Device::Type kDeviceType = Device::Type::kAscend;
 
+  static constexpr Error kSuccess = ACL_SUCCESS;
+
   static constexpr auto SetDevice = aclrtSetDevice;
 
   static constexpr auto GetDevice = aclrtGetDevice;
@@ -45,13 +49,19 @@ struct Runtime<Device::Type::kAscend>
     return aclrtMemcpy(dst, count, src, count, kind);
   };
 
-  static constexpr auto MemcpyHostToHost = ACL_MEMCPY_HOST_TO_HOST;
+  static constexpr auto MemcpyAsync = [](void* dst, const void* src,
+                                         size_t count, aclrtMemcpyKind kind,
+                                         Stream stream) {
+    return aclrtMemcpyAsync(dst, count, src, count, kind, stream);
+  };
+
+  static constexpr auto kMemcpyHostToHost = ACL_MEMCPY_HOST_TO_HOST;
 
-  static constexpr auto MemcpyHostToDevice = ACL_MEMCPY_HOST_TO_DEVICE;
+  static constexpr auto kMemcpyHostToDevice = ACL_MEMCPY_HOST_TO_DEVICE;
 
-  static constexpr auto MemcpyDeviceToHost = ACL_MEMCPY_DEVICE_TO_HOST;
+  static constexpr auto kMemcpyDeviceToHost = ACL_MEMCPY_DEVICE_TO_HOST;
 
-  static constexpr auto MemcpyDeviceToDevice = ACL_MEMCPY_DEVICE_TO_DEVICE;
+  static constexpr auto kMemcpyDeviceToDevice = ACL_MEMCPY_DEVICE_TO_DEVICE;
 
   static constexpr auto Memset = [](void* ptr, int value, size_t count) {
     return aclrtMemset(ptr, count, value, count);
diff --git a/src/native/cambricon/runtime_.h b/src/native/cambricon/runtime_.h
index 4db4920..d892df8 100644
--- a/src/native/cambricon/runtime_.h
+++ b/src/native/cambricon/runtime_.h
@@ -14,10 +14,14 @@ namespace infini::rt {
 template <>
 struct Runtime<Device::Type::kCambricon>
     : DeviceRuntime<Runtime<Device::Type::kCambricon>> {
+  using Error = cnrtRet_t;
+
   using Stream = cnrtQueue_t;
 
   static constexpr Device::Type kDeviceType = Device::Type::kCambricon;
 
+  static constexpr Error kSuccess = CNRT_RET_SUCCESS;
+
   static constexpr auto SetDevice = cnrtSetDevice;
 
   static constexpr auto GetDevice = cnrtGetDevice;
@@ -41,13 +45,19 @@ struct Runtime<Device::Type::kCambricon>
     return cnrtMemcpy(dst, const_cast<void*>(src), size, kind);
   };
 
-  static constexpr auto MemcpyHostToHost = cnrtMemcpyHostToHost;
+  static constexpr auto MemcpyAsync = [](void* dst, const void* src,
+                                         std::size_t size, auto kind,
+                                         Stream stream) {
+    return cnrtMemcpyAsync(dst, const_cast<void*>(src), size, kind, stream);
+  };
+
+  static constexpr auto kMemcpyHostToHost = cnrtMemcpyHostToHost;
 
-  static constexpr auto MemcpyHostToDevice = cnrtMemcpyHostToDev;
+  static constexpr auto kMemcpyHostToDevice = cnrtMemcpyHostToDev;
 
-  static constexpr auto MemcpyDeviceToHost = cnrtMemcpyDevToHost;
+  static constexpr auto kMemcpyDeviceToHost = cnrtMemcpyDevToHost;
 
-  static constexpr auto MemcpyDeviceToDevice = cnrtMemcpyDevToDev;
+  static constexpr auto kMemcpyDeviceToDevice = cnrtMemcpyDevToDev;
 
   static constexpr auto Memset = cnrtMemset;
 };
diff --git a/src/native/cpu/runtime_.h b/src/native/cpu/runtime_.h
index bf5a81c..c2bcf4f 100644
--- a/src/native/cpu/runtime_.h
+++ b/src/native/cpu/runtime_.h
@@ -1,7 +1,6 @@
 #ifndef INFINI_RT_CPU_RUNTIME__H_
 #define INFINI_RT_CPU_RUNTIME__H_
 
-#include <cassert>
 #include <cstdlib>
 #include <cstring>
 
@@ -13,44 +12,98 @@ template <>
 struct Runtime<Device::Type::kCpu> : RuntimeBase<Runtime<Device::Type::kCpu>> {
   static constexpr Device::Type kDeviceType = Device::Type::kCpu;
 
-  static void SetDevice(int index) {
-    if (index != 0) {
-      assert(false && "CPU device index must be 0");
-      std::abort();
+  using Error = int;
+
+  using Stream = void*;
+
+  static constexpr Error kSuccess = 0;
+
+  static constexpr Error kErrorInvalidValue = 1;
+
+  static constexpr Error kErrorMemoryAllocation = 2;
+
+  static Error SetDevice(int device) {
+    if (device != 0) {
+      return kErrorInvalidValue;
     }
+
+    return kSuccess;
   }
 
-  static void GetDevice(int* index) {
-    assert(index != nullptr);
-    *index = 0;
+  static Error GetDevice(int* device) {
+    if (device == nullptr) {
+      return kErrorInvalidValue;
+    }
+
+    *device = 0;
+
+    return kSuccess;
   }
 
-  static void GetDeviceCount(int* count) {
-    assert(count != nullptr);
+  static Error GetDeviceCount(int* count) {
+    if (count == nullptr) {
+      return kErrorInvalidValue;
+    }
+
     *count = 1;
+
+    return kSuccess;
+  }
+
+  static Error DeviceSynchronize() { return kSuccess; }
+
+  static Error Malloc(void** ptr, std::size_t size) {
+    if (ptr == nullptr) {
+      return kErrorInvalidValue;
+    }
+
+    *ptr = std::malloc(size);
+
+    if (size != 0 && *ptr == nullptr) {
+      return kErrorMemoryAllocation;
+    }
+
+    return kSuccess;
   }
 
-  static void DeviceSynchronize() {}
+  static Error Free(void* ptr) {
+    std::free(ptr);
 
-  static void Malloc(void** ptr, std::size_t size) { *ptr = std::malloc(size); }
+    return kSuccess;
+  }
 
-  static void Free(void* ptr) { std::free(ptr); }
+  static Error Memcpy(void* dst, const void* src, std::size_t size, int) {
+    if ((dst == nullptr || src == nullptr) && size != 0) {
+      return kErrorInvalidValue;
+    }
 
-  static void Memcpy(void* dst, const void* src, std::size_t size, int) {
     std::memcpy(dst, src, size);
+
+    return kSuccess;
   }
 
-  static void Memset(void* ptr, int value, std::size_t count) {
+  static Error Memset(void* ptr, int value, std::size_t count) {
+    if (ptr == nullptr && count != 0) {
+      return kErrorInvalidValue;
+    }
+
     std::memset(ptr, value, count);
+
+    return kSuccess;
+  }
+
+  static Error MemcpyAsync(void* dst, const void* src, std::size_t size,
+                           int kind, Stream) {
+    return kErrorInvalidValue;
   }
 
-  static constexpr int MemcpyHostToHost = 0;
+  static constexpr int kMemcpyHostToHost = 0;
 
-  static constexpr int MemcpyHostToDevice = 0;
+  static constexpr int kMemcpyHostToDevice = 1;
 
-  static constexpr int MemcpyDeviceToHost = 1;
+  static constexpr int kMemcpyDeviceToHost = 2;
 
-  static constexpr int MemcpyDeviceToDevice = 0;
+  static constexpr int kMemcpyDeviceToDevice = 3;
 };
 
 static_assert(Runtime<Device::Type::kCpu>::Validate());
diff --git a/src/native/cuda/hygon/runtime_.h b/src/native/cuda/hygon/runtime_.h
index a9f41ec..520e8f4 100644
--- a/src/native/cuda/hygon/runtime_.h
+++ b/src/native/cuda/hygon/runtime_.h
@@ -15,23 +15,41 @@ namespace infini::rt {
 template <>
 struct Runtime<Device::Type::kHygon>
     : CudaRuntime<Runtime<Device::Type::kHygon>> {
+  using Error = cudaError_t;
+
   using Stream = cudaStream_t;
 
   static constexpr Device::Type kDeviceType = Device::Type::kHygon;
 
+  static constexpr Error kSuccess = cudaSuccess;
+
+  static constexpr auto SetDevice = cudaSetDevice;
+
+  static constexpr auto GetDevice = cudaGetDevice;
+
+  static constexpr auto GetDeviceCount = cudaGetDeviceCount;
+
+  static constexpr auto DeviceSynchronize = cudaDeviceSynchronize;
+
   static constexpr auto Malloc = [](auto&&... args) {
     return cudaMalloc(std::forward<decltype(args)>(args)...);
   };
 
   static constexpr auto Memcpy = cudaMemcpy;
 
+  static constexpr auto MemcpyAsync = cudaMemcpyAsync;
+
   static constexpr auto Free = [](auto&&... args) {
     return cudaFree(std::forward<decltype(args)>(args)...);
   };
 
-  static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice;
+  static constexpr auto kMemcpyHostToHost = cudaMemcpyHostToHost;
+
+  static constexpr auto kMemcpyHostToDevice = cudaMemcpyHostToDevice;
+
+  static constexpr auto kMemcpyDeviceToHost = cudaMemcpyDeviceToHost;
 
-  static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost;
+  static constexpr auto kMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
 
   static constexpr auto Memset = cudaMemset;
 };
diff --git a/src/native/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h
index 8a1b649..87558bf 100644
--- a/src/native/cuda/iluvatar/runtime_.h
+++ b/src/native/cuda/iluvatar/runtime_.h
@@ -15,10 +15,14 @@ namespace infini::rt {
 template <>
 struct Runtime<Device::Type::kIluvatar>
     : CudaRuntime<Runtime<Device::Type::kIluvatar>> {
+  using Error = cudaError_t;
+
   using Stream = cudaStream_t;
 
   static constexpr Device::Type kDeviceType = Device::Type::kIluvatar;
 
+  static constexpr Error kSuccess = cudaSuccess;
+
   static constexpr auto SetDevice = cudaSetDevice;
 
   static constexpr auto GetDevice = cudaGetDevice;
@@ -33,15 +37,17 @@ struct Runtime<Device::Type::kIluvatar>
 
   static constexpr auto Memcpy = cudaMemcpy;
 
+  static constexpr auto MemcpyAsync = cudaMemcpyAsync;
+
   static constexpr auto Free = cudaFree;
 
-  static constexpr auto MemcpyHostToHost = cudaMemcpyHostToHost;
+  static constexpr auto kMemcpyHostToHost = cudaMemcpyHostToHost;
 
-  static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice;
+  static constexpr auto kMemcpyHostToDevice = cudaMemcpyHostToDevice;
 
-  static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost;
+  static constexpr auto kMemcpyDeviceToHost = cudaMemcpyDeviceToHost;
 
-  static constexpr auto MemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
+  static constexpr auto kMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
 
   static constexpr auto Memset = cudaMemset;
 };
diff --git a/src/native/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h
index 5785a51..14ed18b 100644
--- a/src/native/cuda/metax/runtime_.h
+++ b/src/native/cuda/metax/runtime_.h
@@ -13,10 +13,14 @@ namespace infini::rt {
 template <>
 struct Runtime<Device::Type::kMetax>
     : CudaRuntime<Runtime<Device::Type::kMetax>> {
+  using Error = mcError_t;
+
   using Stream = mcStream_t;
 
   static constexpr Device::Type kDeviceType = Device::Type::kMetax;
 
+  static constexpr Error kSuccess = mcSuccess;
+
   static constexpr auto SetDevice = mcSetDevice;
 
   static constexpr auto GetDevice = mcGetDevice;
@@ -33,17 +37,21 @@ struct Runtime<Device::Type::kMetax>
     return mcMemcpy(std::forward<decltype(args)>(args)...);
   };
 
+  static constexpr auto MemcpyAsync = [](auto&&... args) {
+    return mcMemcpyAsync(std::forward<decltype(args)>(args)...);
+  };
+
   static constexpr auto Free = [](auto&&... args) {
     return mcFree(std::forward<decltype(args)>(args)...);
   };
 
-  static constexpr auto MemcpyHostToHost = mcMemcpyHostToHost;
+  static constexpr auto kMemcpyHostToHost = mcMemcpyHostToHost;
 
-  static constexpr auto MemcpyHostToDevice = mcMemcpyHostToDevice;
+  static constexpr auto kMemcpyHostToDevice = mcMemcpyHostToDevice;
 
-  static constexpr auto MemcpyDeviceToHost = mcMemcpyDeviceToHost;
+  static constexpr auto kMemcpyDeviceToHost = mcMemcpyDeviceToHost;
 
-  static constexpr auto MemcpyDeviceToDevice = mcMemcpyDeviceToDevice;
+  static constexpr auto kMemcpyDeviceToDevice = mcMemcpyDeviceToDevice;
 
   static constexpr auto Memset = mcMemset;
 };
diff --git a/src/native/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h
index 8ced2ed..fd0a215 100644
--- a/src/native/cuda/moore/runtime_.h
+++ b/src/native/cuda/moore/runtime_.h
@@ -13,10 +13,14 @@ namespace infini::rt {
 template <>
 struct Runtime<Device::Type::kMoore>
     : CudaRuntime<Runtime<Device::Type::kMoore>> {
+  using Error = musaError_t;
+
   using Stream = musaStream_t;
 
   static constexpr Device::Type kDeviceType = Device::Type::kMoore;
 
+  static constexpr Error kSuccess = musaSuccess;
+
   static constexpr auto SetDevice = musaSetDevice;
 
   static constexpr auto GetDevice = [](auto&&... args) {
@@ -39,17 +43,21 @@ struct Runtime<Device::Type::kMoore>
     return musaMemcpy(std::forward<decltype(args)>(args)...);
   };
 
+  static constexpr auto MemcpyAsync = [](auto&&... args) {
+    return musaMemcpyAsync(std::forward<decltype(args)>(args)...);
+  };
+
   static constexpr auto Free = [](auto&&... args) {
     return musaFree(std::forward<decltype(args)>(args)...);
   };
 
-  static constexpr auto MemcpyHostToHost = musaMemcpyHostToHost;
+  static constexpr auto kMemcpyHostToHost = musaMemcpyHostToHost;
 
-  static constexpr auto MemcpyHostToDevice = musaMemcpyHostToDevice;
+  static constexpr auto kMemcpyHostToDevice = musaMemcpyHostToDevice;
 
-  static constexpr auto MemcpyDeviceToHost = musaMemcpyDeviceToHost;
+  static constexpr auto kMemcpyDeviceToHost = musaMemcpyDeviceToHost;
 
-  static constexpr auto MemcpyDeviceToDevice = musaMemcpyDeviceToDevice;
+  static constexpr auto kMemcpyDeviceToDevice = musaMemcpyDeviceToDevice;
 
   static constexpr auto Memset = musaMemset;
 };
diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h
index f6a9f2d..c910198 100644
--- a/src/native/cuda/nvidia/runtime_.h
+++ b/src/native/cuda/nvidia/runtime_.h
@@ -15,10 +15,14 @@ namespace infini::rt {
 template <>
 struct Runtime<Device::Type::kNvidia>
     : CudaRuntime<Runtime<Device::Type::kNvidia>> {
+  using Error = cudaError_t;
+
   using Stream = cudaStream_t;
 
   static constexpr Device::Type kDeviceType = Device::Type::kNvidia;
 
+  static constexpr Error kSuccess = cudaSuccess;
+
   static constexpr auto SetDevice = cudaSetDevice;
 
   static constexpr auto GetDevice = cudaGetDevice;
@@ -33,15 +37,17 @@ struct Runtime<Device::Type::kNvidia>
 
   static constexpr auto Memcpy = cudaMemcpy;
 
+  static constexpr auto MemcpyAsync = cudaMemcpyAsync;
+
   static constexpr auto Free = cudaFree;
 
-  static constexpr auto MemcpyHostToHost = cudaMemcpyHostToHost;
+  static constexpr auto kMemcpyHostToHost = cudaMemcpyHostToHost;
 
-  static constexpr auto MemcpyHostToDevice = cudaMemcpyHostToDevice;
+  static constexpr auto kMemcpyHostToDevice = cudaMemcpyHostToDevice;
 
-  static constexpr auto MemcpyDeviceToHost = cudaMemcpyDeviceToHost;
+  static constexpr auto kMemcpyDeviceToHost = cudaMemcpyDeviceToHost;
 
-  static constexpr auto MemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
+  static constexpr auto kMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
 
   static constexpr auto Memset = cudaMemset;
 };
diff --git a/src/native/cuda/runtime_.h b/src/native/cuda/runtime_.h
index 8765a05..7d1899d 100644
--- a/src/native/cuda/runtime_.h
+++ b/src/native/cuda/runtime_.h
@@ -17,9 +17,15 @@ struct CudaRuntime : DeviceRuntime<Derived> {
     DeviceRuntime<Derived>::Validate();
     static_assert(
         std::is_invocable_v<decltype(Derived::Memcpy), void*, const void*,
-                            size_t, decltype(Derived::MemcpyHostToDevice)>,
+                            size_t, decltype(Derived::kMemcpyHostToDevice)>,
         "`Runtime::Memcpy` must be callable with "
-        "`(void*, const void*, size_t, MemcpyHostToDevice)`.");
+        "`(void*, const void*, size_t, kMemcpyHostToDevice)`.");
+    static_assert(
+        std::is_invocable_v<decltype(Derived::MemcpyAsync), void*, const void*,
+                            size_t, decltype(Derived::kMemcpyHostToDevice),
+                            typename Derived::Stream>,
+        "`Runtime::MemcpyAsync` must be callable with "
+        "`(void*, const void*, size_t, kMemcpyHostToDevice, Stream)`.");
     return true;
   }
 };
diff --git a/src/runtime.h b/src/runtime.h
index ebc2698..43ba9da 100644
--- a/src/runtime.h
+++ b/src/runtime.h
@@ -30,6 +30,11 @@ struct RuntimeBase {
         std::is_same_v<std::remove_cv_t<decltype(Derived::kDeviceType)>,
                        Device::Type>,
         "`Runtime` must define `static constexpr Device::Type kDeviceType`.");
+    static_assert(sizeof(typename Derived::Error) > 0,
+                  "`Runtime` must define an `Error` type alias.");
+    static_assert(std::is_same_v<std::remove_cv_t<decltype(Derived::kSuccess)>,
+                                 typename Derived::Error>,
+                  "`Runtime` must define `static constexpr Error kSuccess`.");
     return true;
   }
 };
@@ -51,29 +56,6 @@ struct DeviceRuntime : RuntimeBase<Derived> {
   }
 };
 
-enum class MemcpyKind {
-  kHostToHost,
-  kHostToDevice,
-  kDeviceToHost,
-  kDeviceToDevice,
-};
-
-void SetDevice(Device device);
-
-void GetDevice(Device* device);
-
-void GetDeviceCount(int* count, Device::Type type);
-
-void DeviceSynchronize();
-
-void Malloc(void** ptr, std::size_t size);
-
-void Free(void* ptr);
-
-void Memset(void* ptr, int value, std::size_t count);
-
-void Memcpy(void* dst, const void* src, std::size_t count, MemcpyKind kind);
-
 }  // namespace infini::rt
 
 #endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ab54530..2bb0812 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -11,10 +11,6 @@ if(WITH_CPU)
     add_infini_rt_test(test_cpu_runtime test_cpu_runtime.cc)
 endif()
 
-if(WITH_CPU OR WITH_NVIDIA)
-    add_infini_rt_test(test_runtime_dispatch test_runtime_dispatch.cc)
-endif()
-
 if(WITH_NVIDIA)
     add_infini_rt_test(test_nvidia_runtime test_nvidia_runtime.cc)
 endif()
@@ -24,12 +20,20 @@ set(INFINI_RT_TEST_INSTALL_PREFIX
 set(INFINI_RT_TEST_CONSUMER_BINARY
     "${CMAKE_CURRENT_BINARY_DIR}/install_consumer_smoke")
 set(INFINI_RT_TEST_EXTRA_LIBRARY_DIRS "")
+set(INFINI_RT_TEST_EXTRA_INCLUDE_DIRS "")
 set(INFINI_RT_TEST_CONSUMER_BACKEND NONE)
 
-if(WITH_CPU)
-    set(INFINI_RT_TEST_CONSUMER_BACKEND CPU)
-elseif(WITH_NVIDIA)
+if(WITH_NVIDIA)
     set(INFINI_RT_TEST_CONSUMER_BACKEND NVIDIA)
+    if(CUDAToolkit_INCLUDE_DIRS)
+        list(APPEND INFINI_RT_TEST_EXTRA_INCLUDE_DIRS
+             ${CUDAToolkit_INCLUDE_DIRS})
+    elseif(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES)
+        list(APPEND INFINI_RT_TEST_EXTRA_INCLUDE_DIRS
+             ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    endif()
+elseif(WITH_CPU)
+    set(INFINI_RT_TEST_CONSUMER_BACKEND CPU)
 endif()
 
 if(WITH_ASCEND)
@@ -44,6 +48,7 @@ if(WITH_ASCEND)
 endif()
 
 list(JOIN INFINI_RT_TEST_EXTRA_LIBRARY_DIRS ":" INFINI_RT_TEST_EXTRA_LIBRARY_PATHS)
+list(JOIN INFINI_RT_TEST_EXTRA_INCLUDE_DIRS ":" INFINI_RT_TEST_EXTRA_INCLUDE_PATHS)
 
 add_test(
     NAME test_install
@@ -59,6 +64,7 @@ add_test(
             "-DINFINI_RT_CONSUMER_BINARY=${INFINI_RT_TEST_CONSUMER_BINARY}"
             "-DINFINI_RT_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
             "-DINFINI_RT_EXTRA_LIBRARY_PATHS=${INFINI_RT_TEST_EXTRA_LIBRARY_PATHS}"
+            "-DINFINI_RT_EXTRA_INCLUDE_PATHS=${INFINI_RT_TEST_EXTRA_INCLUDE_PATHS}"
             "-DINFINI_RT_CONSUMER_BACKEND=${INFINI_RT_TEST_CONSUMER_BACKEND}"
             -P "${CMAKE_CURRENT_SOURCE_DIR}/compile_install_consumer.cmake")
 set_tests_properties(test_install_consumer PROPERTIES
diff --git a/tests/compile_install_consumer.cmake b/tests/compile_install_consumer.cmake
index 169311e..1bb96d1 100644
--- a/tests/compile_install_consumer.cmake
+++ b/tests/compile_install_consumer.cmake
@@ -17,6 +17,17 @@ set(INFINI_RT_EXTRA_LINK_ARGS "")
 set(INFINI_RT_EXTRA_COMPILE_ARGS "")
 set(INFINI_RT_LD_LIBRARY_PATH "${INFINI_RT_LIBRARY_DIR}")
 
+if(INFINI_RT_EXTRA_INCLUDE_PATHS)
+    string(REPLACE ":" ";" INFINI_RT_EXTRA_INCLUDE_DIRS
+           "${INFINI_RT_EXTRA_INCLUDE_PATHS}")
+    foreach(INFINI_RT_EXTRA_INCLUDE_DIR ${INFINI_RT_EXTRA_INCLUDE_DIRS})
+        if(EXISTS "${INFINI_RT_EXTRA_INCLUDE_DIR}")
+            list(APPEND INFINI_RT_EXTRA_COMPILE_ARGS
+                 "-I${INFINI_RT_EXTRA_INCLUDE_DIR}")
+        endif()
+    endforeach()
+endif()
+
 if(INFINI_RT_CONSUMER_BACKEND AND NOT INFINI_RT_CONSUMER_BACKEND STREQUAL "NONE")
     list(APPEND INFINI_RT_EXTRA_COMPILE_ARGS
          "-DINFINI_RT_CONSUMER_BACKEND_${INFINI_RT_CONSUMER_BACKEND}=1")
diff --git a/tests/install_consumer_smoke.cc b/tests/install_consumer_smoke.cc
index 97442b5..9414e71 100644
--- a/tests/install_consumer_smoke.cc
+++ b/tests/install_consumer_smoke.cc
@@ -1,5 +1,6 @@
 #include <infini/rt.h>
 
+#include <array>
 #include <cstddef>
 #include <cstdint>
 #include <vector>
@@ -20,19 +21,64 @@ int main() {
 
 #if defined(INFINI_RT_CONSUMER_BACKEND_CPU) || \
     defined(INFINI_RT_CONSUMER_BACKEND_NVIDIA)
-#if defined(INFINI_RT_CONSUMER_BACKEND_NVIDIA)
-  const infini::rt::Device runtime_device{infini::rt::Device::Type::kNvidia};
-#else
-  const infini::rt::Device runtime_device{infini::rt::Device::Type::kCpu};
-#endif
-
+  std::array<std::uint8_t, 4> input{1, 2, 3, 4};
+  std::array<std::uint8_t, 4> output{};
   void* ptr = nullptr;
-  infini::rt::SetDevice(runtime_device);
-  infini::rt::Malloc(&ptr, sizeof(std::uint32_t));
+  if (infini::rt::SetDevice(0) != infini::rt::kSuccess) {
+    return 1;
+  }
+  int current_device = -1;
+  if (infini::rt::GetDevice(&current_device) != infini::rt::kSuccess) {
+    return 1;
+  }
+  if (current_device != 0) {
+    return 1;
+  }
+  int device_count = 0;
+  if (infini::rt::GetDeviceCount(&device_count) != infini::rt::kSuccess) {
+    return 1;
+  }
+  if (device_count <= 0) {
+    return 1;
+  }
+  if (infini::rt::Malloc(&ptr, input.size()) != infini::rt::kSuccess) {
+    return 1;
+  }
   if (ptr == nullptr) {
     return 1;
   }
-  infini::rt::Free(ptr);
+  if (infini::rt::Memcpy(ptr, input.data(), input.size(),
+                         infini::rt::MemcpyKind::kMemcpyHostToDevice) !=
+      infini::rt::kSuccess) {
+    return 1;
+  }
+#if defined(INFINI_RT_CONSUMER_BACKEND_CPU)
+  if (infini::rt::MemcpyAsync(ptr, input.data(), input.size(),
+                              infini::rt::MemcpyKind::kMemcpyHostToDevice,
+                              nullptr) == infini::rt::kSuccess) {
+    return 1;
+  }
+#else
+  if (infini::rt::MemcpyAsync(ptr, input.data(), input.size(),
+                              infini::rt::MemcpyKind::kMemcpyHostToDevice,
+                              nullptr) != infini::rt::kSuccess) {
+    return 1;
+  }
+#endif
+  if (infini::rt::DeviceSynchronize() != infini::rt::kSuccess) {
+    return 1;
+  }
+  if (infini::rt::Memcpy(output.data(), ptr, output.size(),
+                         infini::rt::MemcpyKind::kMemcpyDeviceToHost) !=
+      infini::rt::kSuccess) {
+    return 1;
+  }
+  if (output != input) {
+    return 1;
+  }
+  if (infini::rt::Free(ptr) != infini::rt::kSuccess) {
+    return 1;
+  }
 #endif
 
   return 0;
diff --git a/tests/test_cpu_runtime.cc b/tests/test_cpu_runtime.cc
index 20edce9..d307f3c 100644
--- a/tests/test_cpu_runtime.cc
+++ b/tests/test_cpu_runtime.cc
@@ -32,15 +32,26 @@ void TestMemcpyRoundTrip(infini::rt::test::TestContext* context) {
   }
 
   CpuRuntime::Memcpy(ptr, input.data(), input.size(),
-                     CpuRuntime::MemcpyHostToDevice);
+                     CpuRuntime::kMemcpyHostToDevice);
   CpuRuntime::Memcpy(output.data(), ptr, output.size(),
-                     CpuRuntime::MemcpyDeviceToHost);
+                     CpuRuntime::kMemcpyDeviceToHost);
   CpuRuntime::Free(ptr);
 
   context->ExpectEqual(output, input,
                        "CPU runtime should copy data through runtime memory.");
 }
 
+void TestMemcpyAsyncUnsupported(infini::rt::test::TestContext* context) {
+  std::array<std::uint8_t, 1> input{1};
+  std::array<std::uint8_t, 1> output{};
+
+  context->Expect(CpuRuntime::MemcpyAsync(output.data(), input.data(),
+                                          input.size(),
+                                          CpuRuntime::kMemcpyHostToHost,
+                                          nullptr) != CpuRuntime::kSuccess,
+                  "CPU runtime should not report async memcpy success.");
+}
+
 void TestMemset(infini::rt::test::TestContext* context) {
   std::array<std::uint8_t, 4> output{};
   void* ptr = nullptr;
@@ -53,7 +64,7 @@ void TestMemset(infini::rt::test::TestContext* context) {
 
   CpuRuntime::Memset(ptr, 0x5A, output.size());
   CpuRuntime::Memcpy(output.data(), ptr, output.size(),
-                     CpuRuntime::MemcpyDeviceToHost);
+                     CpuRuntime::kMemcpyDeviceToHost);
   CpuRuntime::Free(ptr);
 
   for (const auto value : output) {
@@ -69,6 +80,7 @@ int main() {
 
   TestMallocAndFree(&context);
   TestMemcpyRoundTrip(&context);
+  TestMemcpyAsyncUnsupported(&context);
   TestMemset(&context);
 
   return context.ExitCode();
diff --git a/tests/test_nvidia_runtime.cc b/tests/test_nvidia_runtime.cc
index 2d3dac2..df5a038 100644
--- a/tests/test_nvidia_runtime.cc
+++ b/tests/test_nvidia_runtime.cc
@@ -41,11 +41,11 @@ void TestMemcpyRoundTrip(infini::rt::test::TestContext* context) {
 
   ExpectCudaSuccess(context,
                     NvidiaRuntime::Memcpy(ptr, input.data(), input.size(),
-                                          NvidiaRuntime::MemcpyHostToDevice),
+                                          NvidiaRuntime::kMemcpyHostToDevice),
                     "NVIDIA runtime should copy host data to device memory.");
   ExpectCudaSuccess(context,
                     NvidiaRuntime::Memcpy(output.data(), ptr, output.size(),
-                                          NvidiaRuntime::MemcpyDeviceToHost),
+                                          NvidiaRuntime::kMemcpyDeviceToHost),
                     "NVIDIA runtime should copy device data to host memory.");
   ExpectCudaSuccess(context, NvidiaRuntime::Free(ptr),
                     "NVIDIA runtime should free copy memory.");
@@ -68,7 +68,7 @@ void TestMemset(infini::rt::test::TestContext* context) {
                     "NVIDIA runtime should memset device memory.");
   ExpectCudaSuccess(context,
                     NvidiaRuntime::Memcpy(output.data(), ptr, output.size(),
-                                          NvidiaRuntime::MemcpyDeviceToHost),
+                                          NvidiaRuntime::kMemcpyDeviceToHost),
                     "NVIDIA runtime should copy memset data to host memory.");
   ExpectCudaSuccess(context, NvidiaRuntime::Free(ptr),
                     "NVIDIA runtime should free memset memory.");
diff --git a/tests/test_runtime_dispatch.cc b/tests/test_runtime_dispatch.cc
deleted file mode 100644
index 9be92a7..0000000
--- a/tests/test_runtime_dispatch.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <infini/rt.h>
-
-#include <array>
-#include <cstddef>
-#include <cstdint>
-
-#include "test_helper.h"
-
-namespace {
-
-infini::rt::Device RuntimeTestDevice() {
-#if defined(WITH_NVIDIA)
-  return infini::rt::Device{infini::rt::Device::Type::kNvidia};
-#else
-  return infini::rt::Device{infini::rt::Device::Type::kCpu};
-#endif
-}
-
-}  // namespace
-
-int main() {
-  infini::rt::test::TestContext context;
-  const infini::rt::Device device = RuntimeTestDevice();
-  std::array<std::uint8_t, 4> input{1, 2, 3, 4};
-  std::array<std::uint8_t, 4> output{};
-  void* ptr = nullptr;
-
-  infini::rt::SetDevice(device);
-
-  infini::rt::Device current_device;
-  infini::rt::GetDevice(&current_device);
-  context.ExpectEqual(current_device, device,
-                      "Runtime dispatch should keep the current device.");
-
-  int device_count = 0;
-  infini::rt::GetDeviceCount(&device_count, device.type());
-  context.Expect(device_count > 0,
-                 "Runtime dispatch should report at least one device.");
-
-  infini::rt::Malloc(&ptr, input.size());
-  context.Expect(ptr != nullptr, "Runtime dispatch should allocate memory.");
-  if (ptr == nullptr) {
-    return context.ExitCode();
-  }
-
-  infini::rt::Memcpy(ptr, input.data(), input.size(),
-                     infini::rt::MemcpyKind::kHostToDevice);
-  infini::rt::Memcpy(output.data(), ptr, output.size(),
-                     infini::rt::MemcpyKind::kDeviceToHost);
-  context.ExpectEqual(output, input,
-                      "Runtime dispatch should copy data through memory.");
-
-  infini::rt::Memset(ptr, 0x5A, output.size());
-  infini::rt::Memcpy(output.data(), ptr, output.size(),
-                     infini::rt::MemcpyKind::kDeviceToHost);
-  for (const auto value : output) {
-    context.ExpectEqual(value, static_cast<std::uint8_t>(0x5A),
-                        "Runtime dispatch should fill memory.");
-  }
-
-  infini::rt::DeviceSynchronize();
-  infini::rt::Free(ptr);
-
-  return context.ExitCode();
-}

From 310310100fc29f3ff853e0357dd06c4542f52404 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Wed, 1 Jul 2026 20:39:43 +0800
Subject: [PATCH 02/15] Add default runtime dispatch specialization

---
 README.md                          |  17 ++-
 scripts/generate_public_headers.py | 187 ++++++++++++++++++++++++-----
 src/runtime.h                      |   2 +-
 src/tensor_view.h                  |  22 +++-
 tests/CMakeLists.txt               |  12 ++
 tests/install_consumer_smoke.cc    |  14 +++
 tests/test_core.cc                 |   4 +
 tests/test_default_runtime.cc      | 126 +++++++++++++++++++
 8 files changed, 347 insertions(+), 37 deletions(-)
 create mode 100644 tests/test_default_runtime.cc

diff --git a/README.md b/README.md
index d8883e3..33fbe90 100644
--- a/README.md
+++ b/README.md
@@ -79,11 +79,22 @@ int main() {
 }
 ```
 
-When a GPU backend is enabled, the top-level runtime API targets that backend,
-matching CUDA Runtime API behavior:
+The top-level runtime API dispatches through `infini::rt::Runtime<>`, which is
+the `Runtime<Device::Type::kCount>` default specialization. A GPU backend is
+selected initially when one is enabled; otherwise CPU is selected.
+`SetDeviceType` accepts only backends enabled in the current build.
 
 ```cpp
-infini::rt::SetDevice(0);
+constexpr std::size_t size = 1024;
+void* ptr = nullptr;
+
+infini::rt::Runtime<>::SetDeviceType(infini::rt::Device::Type::kCpu);
+infini::rt::Malloc(&ptr, size);
+infini::rt::Free(ptr);
+
+infini::rt::Runtime<>::SetDeviceType(infini::rt::Device::Type::kNvidia);
+infini::rt::Malloc(&ptr, size);
+infini::rt::Free(ptr);
 ```
 
 Use `infini::rt::Runtime<infini::rt::Device::Type::kCpu>` when CPU runtime
diff --git a/scripts/generate_public_headers.py b/scripts/generate_public_headers.py
index 74dde79..755d098 100644
--- a/scripts/generate_public_headers.py
+++ b/scripts/generate_public_headers.py
@@ -166,12 +166,14 @@ def _write_generated_header(include_root, devices):
         f"#include {_detail_include('hash.h')}",
         f"#include {_detail_include('runtime.h')}",
         f"#include {_detail_include('tensor_view.h')}",
-        f"#include <infini/rt/{default_device}/runtime_.h>",
     ]
 
     for device in devices:
         includes.append(f"#include <infini/rt/{device}/device_.h>")
 
+    for device in devices:
+        includes.append(f"#include <infini/rt/{device}/runtime_.h>")
+
     path = include_root / "infini" / "rt" / "generated.h"
     path.parent.mkdir(parents=True, exist_ok=True)
     path.write_text(
@@ -183,27 +185,83 @@ def _write_generated_header(include_root, devices):
 namespace infini::rt {{
 namespace generated_detail {{
 
-using DefaultRuntime = Runtime<{default_device_type}>;
+using DefaultErrorRuntime = Runtime<{default_device_type}>;
+
+inline constexpr Device::Type kDefaultDeviceType = {default_device_type};
 
 }}  // namespace generated_detail
 
-using Error = typename generated_detail::DefaultRuntime::Error;
+using Error = typename generated_detail::DefaultErrorRuntime::Error;
 
-using Stream = typename generated_detail::DefaultRuntime::Stream;
+using Stream = typename generated_detail::DefaultErrorRuntime::Stream;
 
-inline constexpr Error kSuccess = generated_detail::DefaultRuntime::kSuccess;
+inline constexpr Error kSuccess = generated_detail::DefaultErrorRuntime::kSuccess;
 
 enum class MemcpyKind {{
   kMemcpyHostToHost =
-      static_cast<int>(generated_detail::DefaultRuntime::kMemcpyHostToHost),
+      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyHostToHost),
   kMemcpyHostToDevice =
-      static_cast<int>(generated_detail::DefaultRuntime::kMemcpyHostToDevice),
+      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyHostToDevice),
   kMemcpyDeviceToHost =
-      static_cast<int>(generated_detail::DefaultRuntime::kMemcpyDeviceToHost),
+      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyDeviceToHost),
   kMemcpyDeviceToDevice =
-      static_cast<int>(generated_detail::DefaultRuntime::kMemcpyDeviceToDevice),
+      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyDeviceToDevice),
+}};
+
+template <>
+struct Runtime<Device::Type::kCount>
+    : RuntimeBase<Runtime<Device::Type::kCount>> {{
+  using Error = infini::rt::Error;
+
+  using Stream = infini::rt::Stream;
+
+  static constexpr Device::Type kDeviceType = Device::Type::kCount;
+
+  static constexpr Error kSuccess = infini::rt::kSuccess;
+
+  static constexpr MemcpyKind kMemcpyHostToHost =
+      MemcpyKind::kMemcpyHostToHost;
+
+  static constexpr MemcpyKind kMemcpyHostToDevice =
+      MemcpyKind::kMemcpyHostToDevice;
+
+  static constexpr MemcpyKind kMemcpyDeviceToHost =
+      MemcpyKind::kMemcpyDeviceToHost;
+
+  static constexpr MemcpyKind kMemcpyDeviceToDevice =
+      MemcpyKind::kMemcpyDeviceToDevice;
+
+  static Error SetDeviceType(Device::Type device_type);
+
+  static Device::Type GetDeviceType();
+
+  static Error SetDevice(int device);
+
+  static Error GetDevice(int* device);
+
+  static Error GetDeviceCount(int* count);
+
+  static Error DeviceSynchronize();
+
+  static Error Malloc(void** ptr, std::size_t size);
+
+  static Error Free(void* ptr);
+
+  static Error Memset(void* ptr, int value, std::size_t count);
+
+  static Error Memcpy(void* dst, const void* src, std::size_t count,
+                      MemcpyKind kind);
+
+  static Error MemcpyAsync(void* dst, const void* src, std::size_t count,
+                           MemcpyKind kind, Stream stream);
+
+ private:
+  inline static thread_local Device::Type device_type_ =
+      generated_detail::kDefaultDeviceType;
 }};
 
+static_assert(Runtime<Device::Type::kCount>::Validate());
+
 Error SetDevice(int device);
 
 Error GetDevice(int* device);
@@ -301,42 +359,91 @@ def _default_device(devices):
     raise ValueError("at least one device is required")
 
 
-def _runtime_arg(param):
+def _runtime_arg(param, device):
+    device_type = _DEVICE_TYPES[device]
     if param.type == "MemcpyKind":
-        return f"RuntimeMemcpyKind({param.name})"
+        return f"RuntimeMemcpyKind<{device_type}>({param.name})"
     if param.type == "Stream":
         return (
-            f"reinterpret_cast<typename DefaultRuntime::Stream>({param.name})"
+            f"reinterpret_cast<typename Runtime<{device_type}>::Stream>"
+            f"({param.name})"
         )
 
     return param.name
 
 
-def _runtime_args(function):
-    args = (_runtime_arg(param) for param in function.params)
+def _runtime_args(function, device):
+    args = (_runtime_arg(param, device) for param in function.params)
 
     return ", ".join(arg for arg in args if arg is not None)
 
 
-def _runtime_call(function):
-    args = _runtime_args(function)
+def _runtime_call(function, device):
+    device_type = _DEVICE_TYPES[device]
+    args = _runtime_args(function, device)
     if args:
-        return f"DefaultRuntime::{function.name}({args})"
+        return f"Runtime<{device_type}>::{function.name}({args})"
+
+    return f"Runtime<{device_type}>::{function.name}()"
 
-    return f"DefaultRuntime::{function.name}()"
 
+def _call_args(function):
+    return ", ".join(param.name for param in function.params)
+
+
+def _member_signature(function):
+    return (
+        f"{function.return_type} Runtime<Device::Type::kCount>::"
+        f"{function.name}({function.params_decl()})"
+    )
+
+
+def _dispatch_cases(devices, function):
+    return "\n".join(
+        f"""    case {_DEVICE_TYPES[device]}:
+      return CheckCall([&] {{ return {_runtime_call(function, device)}; }});"""
+        for device in devices
+    )
+
+
+def _write_member_dispatch_function(function, devices):
+    return f"""{_member_signature(function)} {{
+  switch (device_type_) {{
+{_dispatch_cases(devices, function)}
+  }}
+
+  return InvalidValueError();
+}}
+"""
+
+
+def _write_public_dispatch_function(function):
+    args = _call_args(function)
+    if args:
+        call = f"Runtime<Device::Type::kCount>::{function.name}({args})"
+    else:
+        call = f"Runtime<Device::Type::kCount>::{function.name}()"
 
-def _write_dispatch_function(function):
     return f"""{function.signature()} {{
-  return CheckCall([&] {{ return {_runtime_call(function)}; }});
+  return {call};
 }}
 """
 
 
-def _write_runtime_dispatch(source_path):
+def _write_runtime_dispatch(source_path, devices):
     functions = _PUBLIC_RUNTIME_FUNCTIONS
-    dispatch_functions = "\n".join(
-        _write_dispatch_function(function) for function in functions
+    member_dispatch_functions = "\n".join(
+        _write_member_dispatch_function(function, devices=devices)
+        for function in functions
+    )
+    public_dispatch_functions = "\n".join(
+        _write_public_dispatch_function(function) for function in functions
+    )
+    set_device_type_cases = "\n".join(
+        f"""    case {_DEVICE_TYPES[device]}:
+      device_type_ = device_type;
+      return kSuccess;"""
+        for device in devices
     )
 
     source_path.parent.mkdir(parents=True, exist_ok=True)
@@ -351,8 +458,6 @@ def _write_runtime_dispatch(source_path):
 namespace infini::rt {{
 namespace {{
 
-using DefaultRuntime = generated_detail::DefaultRuntime;
-
 template <typename Func>
 Error CheckCall(Func&& func) {{
   using ReturnType = decltype(std::forward<Func>(func)());
@@ -365,25 +470,43 @@ def _write_runtime_dispatch(source_path):
   }}
 }}
 
+Error InvalidValueError() {{ return static_cast<Error>(1); }}
+
+template <Device::Type device_type>
 auto RuntimeMemcpyKind(MemcpyKind kind) {{
+  using DeviceRuntime = Runtime<device_type>;
+
   switch (kind) {{
     case MemcpyKind::kMemcpyHostToHost:
-      return DefaultRuntime::kMemcpyHostToHost;
+      return DeviceRuntime::kMemcpyHostToHost;
     case MemcpyKind::kMemcpyHostToDevice:
-      return DefaultRuntime::kMemcpyHostToDevice;
+      return DeviceRuntime::kMemcpyHostToDevice;
     case MemcpyKind::kMemcpyDeviceToHost:
-      return DefaultRuntime::kMemcpyDeviceToHost;
+      return DeviceRuntime::kMemcpyDeviceToHost;
     case MemcpyKind::kMemcpyDeviceToDevice:
-      return DefaultRuntime::kMemcpyDeviceToDevice;
+      return DeviceRuntime::kMemcpyDeviceToDevice;
   }}
 
   assert(false && "unsupported memcpy kind");
-  return DefaultRuntime::kMemcpyHostToHost;
+  return DeviceRuntime::kMemcpyHostToHost;
 }}
 
 }}  // namespace
 
-{dispatch_functions}
+Error Runtime<Device::Type::kCount>::SetDeviceType(Device::Type device_type) {{
+  switch (device_type) {{
+{set_device_type_cases}
+  }}
+
+  return InvalidValueError();
+}}
+
+Device::Type Runtime<Device::Type::kCount>::GetDeviceType() {{
+  return device_type_;
+}}
+
+{member_dispatch_functions}
+{public_dispatch_functions}
 }}  // namespace infini::rt
 """
     )
@@ -413,7 +536,7 @@ def main():
             _write_wrapper(include_root, wrapper_device, header_name, target)
 
     _write_generated_header(include_root, devices)
-    _write_runtime_dispatch(pathlib.Path(args.source_output))
+    _write_runtime_dispatch(pathlib.Path(args.source_output), devices)
 
 
 if __name__ == "__main__":
diff --git a/src/runtime.h b/src/runtime.h
index 43ba9da..104e160 100644
--- a/src/runtime.h
+++ b/src/runtime.h
@@ -8,7 +8,7 @@
 
 namespace infini::rt {
 
-template <Device::Type device_type>
+template <Device::Type device_type = Device::Type::kCount>
 struct Runtime;
 
 /// ## Interface enforcement via CRTP.
diff --git a/src/tensor_view.h b/src/tensor_view.h
index 0b6fc59..90ed0df 100644
--- a/src/tensor_view.h
+++ b/src/tensor_view.h
@@ -3,6 +3,8 @@
 
 #include <cstdint>
 #include <string>
+#include <type_traits>
+#include <utility>
 #include <vector>
 
 #include "data_type.h"
@@ -11,6 +13,22 @@
 
 namespace infini::rt {
 
+namespace tensor_view_detail {
+
+template <typename T, typename = void>
+struct IsTensorLike : std::false_type {};
+
+template <typename T>
+struct IsTensorLike<
+    T, std::void_t<decltype(std::declval<const T&>().data()),
+                   decltype(std::declval<const T&>().shape()),
+                   decltype(std::declval<const T&>().dtype()),
+                   decltype(std::declval<const T&>().device()),
+                   decltype(std::declval<const T&>().strides())>>
+    : std::true_type {};
+
+}  // namespace tensor_view_detail
+
 class TensorView {
  public:
   using Size = std::size_t;
@@ -23,7 +41,9 @@ class TensorView {
 
   using Strides = std::vector<Stride>;
 
-  template <typename TensorLike>
+  template <typename TensorLike,
+            typename = std::enable_if_t<
+                tensor_view_detail::IsTensorLike<TensorLike>::value>>
   TensorView(const TensorLike& tensor)
       : data_{const_cast<void*>(static_cast<const void*>(tensor.data()))},
         shape_{tensor.shape()},
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2bb0812..1aeca74 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -7,6 +7,18 @@ endfunction()
 add_infini_rt_test(test_smoke test_smoke.cc)
 add_infini_rt_test(test_core test_core.cc)
 
+if(WITH_CPU OR WITH_NVIDIA)
+    add_infini_rt_test(test_default_runtime test_default_runtime.cc)
+    if(WITH_CPU)
+        target_compile_definitions(test_default_runtime
+                                   PRIVATE INFINI_RT_TEST_WITH_CPU=1)
+    endif()
+    if(WITH_NVIDIA)
+        target_compile_definitions(test_default_runtime
+                                   PRIVATE INFINI_RT_TEST_WITH_NVIDIA=1)
+    endif()
+endif()
+
 if(WITH_CPU)
     add_infini_rt_test(test_cpu_runtime test_cpu_runtime.cc)
 endif()
diff --git a/tests/install_consumer_smoke.cc b/tests/install_consumer_smoke.cc
index 9414e71..b4e1ce6 100644
--- a/tests/install_consumer_smoke.cc
+++ b/tests/install_consumer_smoke.cc
@@ -21,6 +21,20 @@ int main() {
 
 #if defined(INFINI_RT_CONSUMER_BACKEND_CPU) || \
     defined(INFINI_RT_CONSUMER_BACKEND_NVIDIA)
+  using DefaultRuntime = infini::rt::Runtime<>;
+#if defined(INFINI_RT_CONSUMER_BACKEND_CPU)
+  constexpr auto kExpectedDeviceType = infini::rt::Device::Type::kCpu;
+#else
+  constexpr auto kExpectedDeviceType = infini::rt::Device::Type::kNvidia;
+#endif
+  if (DefaultRuntime::SetDeviceType(kExpectedDeviceType) !=
+      infini::rt::kSuccess) {
+    return 1;
+  }
+  if (DefaultRuntime::GetDeviceType() != kExpectedDeviceType) {
+    return 1;
+  }
+
   std::array<std::uint8_t, 4> input{1, 2, 3, 4};
   std::array<std::uint8_t, 4> output{};
   void* ptr = nullptr;
diff --git a/tests/test_core.cc b/tests/test_core.cc
index e8b8071..1c57c04 100644
--- a/tests/test_core.cc
+++ b/tests/test_core.cc
@@ -2,6 +2,7 @@
 
 #include <cstdint>
 #include <string>
+#include <type_traits>
 #include <vector>
 
 #include "test_helper.h"
@@ -12,6 +13,9 @@ using infini::rt::DataType;
 using infini::rt::Device;
 using infini::rt::TensorView;
 
+static_assert(!std::is_constructible_v<TensorView, std::vector<TensorView>>,
+              "TensorView should not treat tensor containers as tensor-like.");
+
 void TestDevice(infini::rt::test::TestContext* context) {
   const Device cpu{Device::Type::kCpu};
   const Device nvidia{Device::Type::kNvidia, 1};
diff --git a/tests/test_default_runtime.cc b/tests/test_default_runtime.cc
new file mode 100644
index 0000000..f2c6978
--- /dev/null
+++ b/tests/test_default_runtime.cc
@@ -0,0 +1,126 @@
+#include <infini/rt.h>
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+
+#include "test_helper.h"
+
+namespace {
+
+using DefaultRuntime = infini::rt::Runtime<>;
+
+void ExpectSuccess(infini::rt::test::TestContext* context,
+                   infini::rt::Error status, const char* message) {
+  context->Expect(status == infini::rt::kSuccess, message);
+}
+
+void TestInvalidDeviceType(infini::rt::test::TestContext* context) {
+  const auto before = DefaultRuntime::GetDeviceType();
+
+  context->Expect(DefaultRuntime::SetDeviceType(infini::rt::Device::Type::kQy) !=
+                      infini::rt::kSuccess,
+                  "Default runtime should reject disabled device types.");
+  context->Expect(DefaultRuntime::GetDeviceType() == before,
+                  "Rejected device type should not change dispatch target.");
+}
+
+#if defined(INFINI_RT_TEST_WITH_CPU)
+void TestCpuDispatch(infini::rt::test::TestContext* context) {
+  ExpectSuccess(context,
+                DefaultRuntime::SetDeviceType(infini::rt::Device::Type::kCpu),
+                "Default runtime should select CPU dispatch.");
+  context->Expect(DefaultRuntime::GetDeviceType() ==
+                      infini::rt::Device::Type::kCpu,
+                  "Default runtime should report CPU dispatch.");
+
+  std::array<std::uint8_t, 4> input{1, 2, 3, 4};
+  std::array<std::uint8_t, 4> output{};
+  void* ptr = nullptr;
+
+  ExpectSuccess(context, infini::rt::SetDevice(0),
+                "CPU dispatch should set device 0.");
+  ExpectSuccess(context, infini::rt::Malloc(&ptr, input.size()),
+                "CPU dispatch should allocate memory.");
+  if (ptr == nullptr) {
+    return;
+  }
+
+  ExpectSuccess(context,
+                infini::rt::Memcpy(ptr, input.data(), input.size(),
+                                   infini::rt::MemcpyKind::kMemcpyHostToDevice),
+                "CPU dispatch should copy host data to runtime memory.");
+  context->Expect(
+      infini::rt::MemcpyAsync(ptr, input.data(), input.size(),
+                              infini::rt::MemcpyKind::kMemcpyHostToDevice,
+                              nullptr) != infini::rt::kSuccess,
+      "CPU dispatch should not report async memcpy success.");
+  ExpectSuccess(context,
+                infini::rt::Memcpy(output.data(), ptr, output.size(),
+                                   infini::rt::MemcpyKind::kMemcpyDeviceToHost),
+                "CPU dispatch should copy runtime memory to host.");
+  ExpectSuccess(context, infini::rt::Free(ptr),
+                "CPU dispatch should free memory.");
+
+  context->ExpectEqual(output, input,
+                       "CPU dispatch should preserve copied bytes.");
+}
+#endif
+
+#if defined(INFINI_RT_TEST_WITH_NVIDIA)
+void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
+  ExpectSuccess(
+      context, DefaultRuntime::SetDeviceType(infini::rt::Device::Type::kNvidia),
+      "Default runtime should select NVIDIA dispatch.");
+  context->Expect(DefaultRuntime::GetDeviceType() ==
+                      infini::rt::Device::Type::kNvidia,
+                  "Default runtime should report NVIDIA dispatch.");
+
+  std::array<std::uint8_t, 4> input{5, 6, 7, 8};
+  std::array<std::uint8_t, 4> output{};
+  void* ptr = nullptr;
+
+  ExpectSuccess(context, infini::rt::SetDevice(0),
+                "NVIDIA dispatch should set device 0.");
+  ExpectSuccess(context, infini::rt::Malloc(&ptr, input.size()),
+                "NVIDIA dispatch should allocate memory.");
+  if (ptr == nullptr) {
+    return;
+  }
+
+  ExpectSuccess(context,
+                infini::rt::MemcpyAsync(
+                    ptr, input.data(), input.size(),
+                    infini::rt::MemcpyKind::kMemcpyHostToDevice, nullptr),
+                "NVIDIA dispatch should support async host-to-device copy.");
+  ExpectSuccess(context, infini::rt::DeviceSynchronize(),
+                "NVIDIA dispatch should synchronize the device.");
+  ExpectSuccess(context,
+                infini::rt::Memcpy(output.data(), ptr, output.size(),
+                                   infini::rt::MemcpyKind::kMemcpyDeviceToHost),
+                "NVIDIA dispatch should copy device data to host.");
+  ExpectSuccess(context, infini::rt::Free(ptr),
+                "NVIDIA dispatch should free memory.");
+
+  context->ExpectEqual(output, input,
+                       "NVIDIA dispatch should preserve copied bytes.");
+}
+#endif
+
+}  // namespace
+
+int main() {
+  infini::rt::test::TestContext context;
+
+  TestInvalidDeviceType(&context);
+
+#if defined(INFINI_RT_TEST_WITH_CPU)
+  TestCpuDispatch(&context);
+#endif
+
+#if defined(INFINI_RT_TEST_WITH_NVIDIA)
+  TestNvidiaDispatch(&context);
+#endif
+
+  return context.ExitCode();
+}

From 50fca829288405dde5ee1567e9a73947ab6a4e25 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Wed, 1 Jul 2026 23:49:56 +0800
Subject: [PATCH 03/15] Refactor runtime dispatch namespace

---
 scripts/generate_public_headers.py  | 145 ++++++++--------------------
 src/native/ascend/runtime_.h        |   4 +-
 src/native/cambricon/runtime_.h     |   4 +-
 src/native/cpu/runtime_.h           |   4 +-
 src/native/cuda/hygon/runtime_.h    |   4 +-
 src/native/cuda/iluvatar/runtime_.h |   4 +-
 src/native/cuda/metax/runtime_.h    |   4 +-
 src/native/cuda/moore/runtime_.h    |   4 +-
 src/native/cuda/nvidia/runtime_.h   |   4 +-
 src/native/cuda/runtime_.h          |   4 +-
 src/runtime.h                       |  11 ++-
 src/tensor_view.h                   |  12 +--
 tests/install_consumer_smoke.cc     |  45 ++++-----
 tests/test_cpu_runtime.cc           |  12 +--
 tests/test_default_runtime.cc       |  83 +++++++---------
 tests/test_nvidia_runtime.cc        |   3 +-
 16 files changed, 132 insertions(+), 215 deletions(-)

diff --git a/scripts/generate_public_headers.py b/scripts/generate_public_headers.py
index 755d098..957d0d8 100644
--- a/scripts/generate_public_headers.py
+++ b/scripts/generate_public_headers.py
@@ -183,6 +183,12 @@ def _write_generated_header(include_root, devices):
 {chr(10).join(includes)}
 
 namespace infini::rt {{
+
+void set_runtime_device_type(Device::Type device_type);
+
+Device::Type runtime_device_type();
+
+namespace runtime {{
 namespace generated_detail {{
 
 using DefaultErrorRuntime = Runtime<{default_device_type}>;
@@ -208,60 +214,6 @@ def _write_generated_header(include_root, devices):
       static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyDeviceToDevice),
 }};
 
-template <>
-struct Runtime<Device::Type::kCount>
-    : RuntimeBase<Runtime<Device::Type::kCount>> {{
-  using Error = infini::rt::Error;
-
-  using Stream = infini::rt::Stream;
-
-  static constexpr Device::Type kDeviceType = Device::Type::kCount;
-
-  static constexpr Error kSuccess = infini::rt::kSuccess;
-
-  static constexpr MemcpyKind kMemcpyHostToHost =
-      MemcpyKind::kMemcpyHostToHost;
-
-  static constexpr MemcpyKind kMemcpyHostToDevice =
-      MemcpyKind::kMemcpyHostToDevice;
-
-  static constexpr MemcpyKind kMemcpyDeviceToHost =
-      MemcpyKind::kMemcpyDeviceToHost;
-
-  static constexpr MemcpyKind kMemcpyDeviceToDevice =
-      MemcpyKind::kMemcpyDeviceToDevice;
-
-  static Error SetDeviceType(Device::Type device_type);
-
-  static Device::Type GetDeviceType();
-
-  static Error SetDevice(int device);
-
-  static Error GetDevice(int* device);
-
-  static Error GetDeviceCount(int* count);
-
-  static Error DeviceSynchronize();
-
-  static Error Malloc(void** ptr, std::size_t size);
-
-  static Error Free(void* ptr);
-
-  static Error Memset(void* ptr, int value, std::size_t count);
-
-  static Error Memcpy(void* dst, const void* src, std::size_t count,
-                      MemcpyKind kind);
-
-  static Error MemcpyAsync(void* dst, const void* src, std::size_t count,
-                           MemcpyKind kind, Stream stream);
-
- private:
-  inline static thread_local Device::Type device_type_ =
-      generated_detail::kDefaultDeviceType;
-}};
-
-static_assert(Runtime<Device::Type::kCount>::Validate());
-
 Error SetDevice(int device);
 
 Error GetDevice(int* device);
@@ -281,6 +233,7 @@ def _write_generated_header(include_root, devices):
 Error MemcpyAsync(void* dst, const void* src, std::size_t count,
                   MemcpyKind kind, Stream stream);
 
+}}  // namespace runtime
 }}  // namespace infini::rt
 
 #endif
@@ -365,8 +318,7 @@ def _runtime_arg(param, device):
         return f"RuntimeMemcpyKind<{device_type}>({param.name})"
     if param.type == "Stream":
         return (
-            f"reinterpret_cast<typename Runtime<{device_type}>::Stream>"
-            f"({param.name})"
+            f"reinterpret_cast<typename Runtime<{device_type}>::Stream>({param.name})"
         )
 
     return param.name
@@ -387,17 +339,6 @@ def _runtime_call(function, device):
     return f"Runtime<{device_type}>::{function.name}()"
 
 
-def _call_args(function):
-    return ", ".join(param.name for param in function.params)
-
-
-def _member_signature(function):
-    return (
-        f"{function.return_type} Runtime<Device::Type::kCount>::"
-        f"{function.name}({function.params_decl()})"
-    )
-
-
 def _dispatch_cases(devices, function):
     return "\n".join(
         f"""    case {_DEVICE_TYPES[device]}:
@@ -406,43 +347,28 @@ def _dispatch_cases(devices, function):
     )
 
 
-def _write_member_dispatch_function(function, devices):
-    return f"""{_member_signature(function)} {{
-  switch (device_type_) {{
+def _write_runtime_dispatch_function(function, devices):
+    return f"""{function.signature()} {{
+  switch (infini::rt::runtime_device_type()) {{
 {_dispatch_cases(devices, function)}
   }}
 
+  assert(false && "unsupported runtime device type");
   return InvalidValueError();
 }}
 """
 
 
-def _write_public_dispatch_function(function):
-    args = _call_args(function)
-    if args:
-        call = f"Runtime<Device::Type::kCount>::{function.name}({args})"
-    else:
-        call = f"Runtime<Device::Type::kCount>::{function.name}()"
-
-    return f"""{function.signature()} {{
-  return {call};
-}}
-"""
-
-
 def _write_runtime_dispatch(source_path, devices):
     functions = _PUBLIC_RUNTIME_FUNCTIONS
-    member_dispatch_functions = "\n".join(
-        _write_member_dispatch_function(function, devices=devices)
+    dispatch_functions = "\n".join(
+        _write_runtime_dispatch_function(function, devices=devices)
         for function in functions
     )
-    public_dispatch_functions = "\n".join(
-        _write_public_dispatch_function(function) for function in functions
-    )
     set_device_type_cases = "\n".join(
         f"""    case {_DEVICE_TYPES[device]}:
-      device_type_ = device_type;
-      return kSuccess;"""
+      runtime_device_type_ = device_type;
+      return;"""
         for device in devices
     )
 
@@ -458,6 +384,28 @@ def _write_runtime_dispatch(source_path, devices):
 namespace infini::rt {{
 namespace {{
 
+thread_local Device::Type runtime_device_type_ =
+    runtime::generated_detail::kDefaultDeviceType;
+
+}}  // namespace
+
+void set_runtime_device_type(Device::Type device_type) {{
+  switch (device_type) {{
+{set_device_type_cases}
+  }}
+
+  assert(false && "unsupported runtime device type");
+}}
+
+Device::Type runtime_device_type() {{
+  return runtime_device_type_;
+}}
+
+}}  // namespace infini::rt
+
+namespace infini::rt::runtime {{
+namespace {{
+
 template <typename Func>
 Error CheckCall(Func&& func) {{
   using ReturnType = decltype(std::forward<Func>(func)());
@@ -493,21 +441,8 @@ def _write_runtime_dispatch(source_path, devices):
 
 }}  // namespace
 
-Error Runtime<Device::Type::kCount>::SetDeviceType(Device::Type device_type) {{
-  switch (device_type) {{
-{set_device_type_cases}
-  }}
-
-  return InvalidValueError();
-}}
-
-Device::Type Runtime<Device::Type::kCount>::GetDeviceType() {{
-  return device_type_;
-}}
-
-{member_dispatch_functions}
-{public_dispatch_functions}
-}}  // namespace infini::rt
+{dispatch_functions}
+}}  // namespace infini::rt::runtime
 """
     )
 
diff --git a/src/native/ascend/runtime_.h b/src/native/ascend/runtime_.h
index 191beef..065a9d3 100644
--- a/src/native/ascend/runtime_.h
+++ b/src/native/ascend/runtime_.h
@@ -11,7 +11,7 @@
 #include "native/ascend/device_.h"
 #include "runtime.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
 template <>
 struct Runtime<Device::Type::kAscend>
@@ -70,6 +70,6 @@ struct Runtime<Device::Type::kAscend>
 
 static_assert(Runtime<Device::Type::kAscend>::Validate());
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/native/cambricon/runtime_.h b/src/native/cambricon/runtime_.h
index d892df8..927e2c5 100644
--- a/src/native/cambricon/runtime_.h
+++ b/src/native/cambricon/runtime_.h
@@ -9,7 +9,7 @@
 #include "native/cambricon/device_.h"
 #include "runtime.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
 template <>
 struct Runtime<Device::Type::kCambricon>
@@ -64,6 +64,6 @@ struct Runtime<Device::Type::kCambricon>
 
 static_assert(Runtime<Device::Type::kCambricon>::Validate());
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/native/cpu/runtime_.h b/src/native/cpu/runtime_.h
index c2bcf4f..f4b18bf 100644
--- a/src/native/cpu/runtime_.h
+++ b/src/native/cpu/runtime_.h
@@ -6,7 +6,7 @@
 
 #include "runtime.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
 template <>
 struct Runtime<Device::Type::kCpu> : RuntimeBase<Runtime<Device::Type::kCpu>> {
@@ -108,6 +108,6 @@ struct Runtime<Device::Type::kCpu> : RuntimeBase<Runtime<Device::Type::kCpu>> {
 
 static_assert(Runtime<Device::Type::kCpu>::Validate());
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/native/cuda/hygon/runtime_.h b/src/native/cuda/hygon/runtime_.h
index 520e8f4..52c47eb 100644
--- a/src/native/cuda/hygon/runtime_.h
+++ b/src/native/cuda/hygon/runtime_.h
@@ -10,7 +10,7 @@
 #include "native/cuda/hygon/device_.h"
 #include "native/cuda/runtime_.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
 template <>
 struct Runtime<Device::Type::kHygon>
@@ -56,6 +56,6 @@ struct Runtime<Device::Type::kHygon>
 
 static_assert(Runtime<Device::Type::kHygon>::Validate());
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/native/cuda/iluvatar/runtime_.h b/src/native/cuda/iluvatar/runtime_.h
index 87558bf..f49db23 100644
--- a/src/native/cuda/iluvatar/runtime_.h
+++ b/src/native/cuda/iluvatar/runtime_.h
@@ -10,7 +10,7 @@
 #include "native/cuda/iluvatar/device_.h"
 #include "native/cuda/runtime_.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
 template <>
 struct Runtime<Device::Type::kIluvatar>
@@ -54,6 +54,6 @@ struct Runtime<Device::Type::kIluvatar>
 
 static_assert(Runtime<Device::Type::kIluvatar>::Validate());
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/native/cuda/metax/runtime_.h b/src/native/cuda/metax/runtime_.h
index 14ed18b..c1f19c0 100644
--- a/src/native/cuda/metax/runtime_.h
+++ b/src/native/cuda/metax/runtime_.h
@@ -8,7 +8,7 @@
 #include "native/cuda/metax/device_.h"
 #include "native/cuda/runtime_.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
 template <>
 struct Runtime<Device::Type::kMetax>
@@ -58,6 +58,6 @@ struct Runtime<Device::Type::kMetax>
 
 static_assert(Runtime<Device::Type::kMetax>::Validate());
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/native/cuda/moore/runtime_.h b/src/native/cuda/moore/runtime_.h
index fd0a215..5beffcf 100644
--- a/src/native/cuda/moore/runtime_.h
+++ b/src/native/cuda/moore/runtime_.h
@@ -8,7 +8,7 @@
 #include "native/cuda/moore/device_.h"
 #include "native/cuda/runtime_.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
 template <>
 struct Runtime<Device::Type::kMoore>
@@ -64,6 +64,6 @@ struct Runtime<Device::Type::kMoore>
 
 static_assert(Runtime<Device::Type::kMoore>::Validate());
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h
index c910198..1786e08 100644
--- a/src/native/cuda/nvidia/runtime_.h
+++ b/src/native/cuda/nvidia/runtime_.h
@@ -10,7 +10,7 @@
 #include "native/cuda/nvidia/device_.h"
 #include "native/cuda/runtime_.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
 template <>
 struct Runtime<Device::Type::kNvidia>
@@ -54,6 +54,6 @@ struct Runtime<Device::Type::kNvidia>
 
 static_assert(Runtime<Device::Type::kNvidia>::Validate());
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/native/cuda/runtime_.h b/src/native/cuda/runtime_.h
index 7d1899d..59dd6bc 100644
--- a/src/native/cuda/runtime_.h
+++ b/src/native/cuda/runtime_.h
@@ -5,7 +5,7 @@
 
 #include "runtime.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
 /// ## CUDA-like runtime interface enforcement via CRTP.
 ///
@@ -30,6 +30,6 @@ struct CudaRuntime : DeviceRuntime<Derived> {
   }
 };
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/runtime.h b/src/runtime.h
index 104e160..c3e7640 100644
--- a/src/runtime.h
+++ b/src/runtime.h
@@ -6,16 +6,17 @@
 
 #include "device.h"
 
-namespace infini::rt {
+namespace infini::rt::runtime {
 
-template <Device::Type device_type = Device::Type::kCount>
+template <Device::Type device_type>
 struct Runtime;
 
 /// ## Interface enforcement via CRTP.
 ///
 /// Inherit from the appropriate base to declare which interface level a
-/// `Runtime` specialization implements. After the struct is fully defined, call
-/// `static_assert(Runtime<...>::Validate())`. The chained `Validate()` checks
+/// `runtime::Runtime` specialization implements. After the struct is fully
+/// defined, call `static_assert(Runtime<...>::Validate())`. The chained
+/// `Validate()` checks
 /// every required member's existence and signature at compile time, analogous
 /// to how `override` catches signature mismatches for virtual functions.
 ///
@@ -56,6 +57,6 @@ struct DeviceRuntime : RuntimeBase<Derived> {
   }
 };
 
-}  // namespace infini::rt
+}  // namespace infini::rt::runtime
 
 #endif
diff --git a/src/tensor_view.h b/src/tensor_view.h
index 90ed0df..dcf7cc9 100644
--- a/src/tensor_view.h
+++ b/src/tensor_view.h
@@ -19,12 +19,12 @@ template <typename T, typename = void>
 struct IsTensorLike : std::false_type {};
 
 template <typename T>
-struct IsTensorLike<
-    T, std::void_t<decltype(std::declval<const T&>().data()),
-                   decltype(std::declval<const T&>().shape()),
-                   decltype(std::declval<const T&>().dtype()),
-                   decltype(std::declval<const T&>().device()),
-                   decltype(std::declval<const T&>().strides())>>
+struct IsTensorLike<T,
+                    std::void_t<decltype(std::declval<const T&>().data()),
+                                decltype(std::declval<const T&>().shape()),
+                                decltype(std::declval<const T&>().dtype()),
+                                decltype(std::declval<const T&>().device()),
+                                decltype(std::declval<const T&>().strides())>>
     : std::true_type {};
 
 }  // namespace tensor_view_detail
diff --git a/tests/install_consumer_smoke.cc b/tests/install_consumer_smoke.cc
index b4e1ce6..14e6c0d 100644
--- a/tests/install_consumer_smoke.cc
+++ b/tests/install_consumer_smoke.cc
@@ -21,76 +21,73 @@ int main() {
 
 #if defined(INFINI_RT_CONSUMER_BACKEND_CPU) || \
     defined(INFINI_RT_CONSUMER_BACKEND_NVIDIA)
-  using DefaultRuntime = infini::rt::Runtime<>;
+  namespace runtime = infini::rt::runtime;
 #if defined(INFINI_RT_CONSUMER_BACKEND_CPU)
   constexpr auto kExpectedDeviceType = infini::rt::Device::Type::kCpu;
 #else
   constexpr auto kExpectedDeviceType = infini::rt::Device::Type::kNvidia;
 #endif
-  if (DefaultRuntime::SetDeviceType(kExpectedDeviceType) !=
-      infini::rt::kSuccess) {
-    return 1;
-  }
-  if (DefaultRuntime::GetDeviceType() != kExpectedDeviceType) {
+  infini::rt::set_runtime_device_type(kExpectedDeviceType);
+  if (infini::rt::runtime_device_type() != kExpectedDeviceType) {
     return 1;
   }
 
   std::array<std::uint8_t, 4> input{1, 2, 3, 4};
   std::array<std::uint8_t, 4> output{};
   void* ptr = nullptr;
-  if (infini::rt::SetDevice(0) != infini::rt::kSuccess) {
+  if (runtime::SetDevice(0) != runtime::kSuccess) {
     return 1;
   }
   int current_device = -1;
-  if (infini::rt::GetDevice(&current_device) != infini::rt::kSuccess) {
+  if (runtime::GetDevice(&current_device) != runtime::kSuccess) {
     return 1;
   }
   if (current_device != 0) {
     return 1;
   }
   int device_count = 0;
-  if (infini::rt::GetDeviceCount(&device_count) != infini::rt::kSuccess) {
+  if (runtime::GetDeviceCount(&device_count) != runtime::kSuccess) {
     return 1;
   }
   if (device_count <= 0) {
     return 1;
   }
-  if (infini::rt::Malloc(&ptr, input.size()) != infini::rt::kSuccess) {
+  if (runtime::Malloc(&ptr, input.size()) != runtime::kSuccess) {
     return 1;
   }
   if (ptr == nullptr) {
     return 1;
   }
-  if (infini::rt::Memcpy(ptr, input.data(), input.size(),
-                         infini::rt::MemcpyKind::kMemcpyHostToDevice) !=
-      infini::rt::kSuccess) {
+  if (runtime::Memcpy(ptr, input.data(), input.size(),
+                      runtime::MemcpyKind::kMemcpyHostToDevice) !=
+      runtime::kSuccess) {
     return 1;
   }
 #if defined(INFINI_RT_CONSUMER_BACKEND_CPU)
-  if (infini::rt::MemcpyAsync(ptr, input.data(), input.size(),
-                              infini::rt::MemcpyKind::kMemcpyHostToDevice,
-                              nullptr) == infini::rt::kSuccess) {
+  if (runtime::MemcpyAsync(ptr, input.data(), input.size(),
+                           runtime::MemcpyKind::kMemcpyHostToDevice,
+                           nullptr) == runtime::kSuccess) {
     return 1;
   }
 #else
-  if (infini::rt::MemcpyAsync(ptr, input.data(), input.size(),
-                              infini::rt::MemcpyKind::kMemcpyHostToDevice,
-                              nullptr) != infini::rt::kSuccess) {
+  if (runtime::MemcpyAsync(ptr, input.data(), input.size(),
+                           runtime::MemcpyKind::kMemcpyHostToDevice,
+                           nullptr) != runtime::kSuccess) {
     return 1;
   }
 #endif
-  if (infini::rt::DeviceSynchronize() != infini::rt::kSuccess) {
+  if (runtime::DeviceSynchronize() != runtime::kSuccess) {
     return 1;
   }
-  if (infini::rt::Memcpy(output.data(), ptr, output.size(),
-                         infini::rt::MemcpyKind::kMemcpyDeviceToHost) !=
-      infini::rt::kSuccess) {
+  if (runtime::Memcpy(output.data(), ptr, output.size(),
+                      runtime::MemcpyKind::kMemcpyDeviceToHost) !=
+      runtime::kSuccess) {
     return 1;
   }
   if (output != input) {
     return 1;
   }
-  if (infini::rt::Free(ptr) != infini::rt::kSuccess) {
+  if (runtime::Free(ptr) != runtime::kSuccess) {
     return 1;
   }
 #endif
diff --git a/tests/test_cpu_runtime.cc b/tests/test_cpu_runtime.cc
index d307f3c..e066c85 100644
--- a/tests/test_cpu_runtime.cc
+++ b/tests/test_cpu_runtime.cc
@@ -9,7 +9,7 @@
 
 namespace {
 
-using CpuRuntime = infini::rt::Runtime<infini::rt::Device::Type::kCpu>;
+using CpuRuntime = infini::rt::runtime::Runtime<infini::rt::Device::Type::kCpu>;
 
 void TestMallocAndFree(infini::rt::test::TestContext* context) {
   void* ptr = nullptr;
@@ -45,11 +45,11 @@ void TestMemcpyAsyncUnsupported(infini::rt::test::TestContext* context) {
   std::array<std::uint8_t, 1> input{1};
   std::array<std::uint8_t, 1> output{};
 
-  context->Expect(CpuRuntime::MemcpyAsync(output.data(), input.data(),
-                                          input.size(),
-                                          CpuRuntime::kMemcpyHostToHost,
-                                          nullptr) != CpuRuntime::kSuccess,
-                  "CPU runtime should not report async memcpy success.");
+  context->Expect(
+      CpuRuntime::MemcpyAsync(output.data(), input.data(), input.size(),
+                              CpuRuntime::kMemcpyHostToHost,
+                              nullptr) != CpuRuntime::kSuccess,
+      "CPU runtime should not report async memcpy success.");
 }
 
 void TestMemset(infini::rt::test::TestContext* context) {
diff --git a/tests/test_default_runtime.cc b/tests/test_default_runtime.cc
index f2c6978..bf76a4b 100644
--- a/tests/test_default_runtime.cc
+++ b/tests/test_default_runtime.cc
@@ -8,58 +8,45 @@
 
 namespace {
 
-using DefaultRuntime = infini::rt::Runtime<>;
+namespace runtime = infini::rt::runtime;
 
 void ExpectSuccess(infini::rt::test::TestContext* context,
-                   infini::rt::Error status, const char* message) {
-  context->Expect(status == infini::rt::kSuccess, message);
-}
-
-void TestInvalidDeviceType(infini::rt::test::TestContext* context) {
-  const auto before = DefaultRuntime::GetDeviceType();
-
-  context->Expect(DefaultRuntime::SetDeviceType(infini::rt::Device::Type::kQy) !=
-                      infini::rt::kSuccess,
-                  "Default runtime should reject disabled device types.");
-  context->Expect(DefaultRuntime::GetDeviceType() == before,
-                  "Rejected device type should not change dispatch target.");
+                   runtime::Error status, const char* message) {
+  context->Expect(status == runtime::kSuccess, message);
 }
 
 #if defined(INFINI_RT_TEST_WITH_CPU)
 void TestCpuDispatch(infini::rt::test::TestContext* context) {
-  ExpectSuccess(context,
-                DefaultRuntime::SetDeviceType(infini::rt::Device::Type::kCpu),
-                "Default runtime should select CPU dispatch.");
-  context->Expect(DefaultRuntime::GetDeviceType() ==
-                      infini::rt::Device::Type::kCpu,
-                  "Default runtime should report CPU dispatch.");
+  infini::rt::set_runtime_device_type(infini::rt::Device::Type::kCpu);
+  context->Expect(
+      infini::rt::runtime_device_type() == infini::rt::Device::Type::kCpu,
+      "Default runtime should report CPU dispatch.");
 
   std::array<std::uint8_t, 4> input{1, 2, 3, 4};
   std::array<std::uint8_t, 4> output{};
   void* ptr = nullptr;
 
-  ExpectSuccess(context, infini::rt::SetDevice(0),
+  ExpectSuccess(context, runtime::SetDevice(0),
                 "CPU dispatch should set device 0.");
-  ExpectSuccess(context, infini::rt::Malloc(&ptr, input.size()),
+  ExpectSuccess(context, runtime::Malloc(&ptr, input.size()),
                 "CPU dispatch should allocate memory.");
   if (ptr == nullptr) {
     return;
   }
 
   ExpectSuccess(context,
-                infini::rt::Memcpy(ptr, input.data(), input.size(),
-                                   infini::rt::MemcpyKind::kMemcpyHostToDevice),
+                runtime::Memcpy(ptr, input.data(), input.size(),
+                                runtime::MemcpyKind::kMemcpyHostToDevice),
                 "CPU dispatch should copy host data to runtime memory.");
-  context->Expect(
-      infini::rt::MemcpyAsync(ptr, input.data(), input.size(),
-                              infini::rt::MemcpyKind::kMemcpyHostToDevice,
-                              nullptr) != infini::rt::kSuccess,
-      "CPU dispatch should not report async memcpy success.");
+  context->Expect(runtime::MemcpyAsync(ptr, input.data(), input.size(),
+                                       runtime::MemcpyKind::kMemcpyHostToDevice,
+                                       nullptr) != runtime::kSuccess,
+                  "CPU dispatch should not report async memcpy success.");
   ExpectSuccess(context,
-                infini::rt::Memcpy(output.data(), ptr, output.size(),
-                                   infini::rt::MemcpyKind::kMemcpyDeviceToHost),
+                runtime::Memcpy(output.data(), ptr, output.size(),
+                                runtime::MemcpyKind::kMemcpyDeviceToHost),
                 "CPU dispatch should copy runtime memory to host.");
-  ExpectSuccess(context, infini::rt::Free(ptr),
+  ExpectSuccess(context, runtime::Free(ptr),
                 "CPU dispatch should free memory.");
 
   context->ExpectEqual(output, input,
@@ -69,37 +56,35 @@ void TestCpuDispatch(infini::rt::test::TestContext* context) {
 
 #if defined(INFINI_RT_TEST_WITH_NVIDIA)
 void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
-  ExpectSuccess(
-      context, DefaultRuntime::SetDeviceType(infini::rt::Device::Type::kNvidia),
-      "Default runtime should select NVIDIA dispatch.");
-  context->Expect(DefaultRuntime::GetDeviceType() ==
-                      infini::rt::Device::Type::kNvidia,
-                  "Default runtime should report NVIDIA dispatch.");
+  infini::rt::set_runtime_device_type(infini::rt::Device::Type::kNvidia);
+  context->Expect(
+      infini::rt::runtime_device_type() == infini::rt::Device::Type::kNvidia,
+      "Default runtime should report NVIDIA dispatch.");
 
   std::array<std::uint8_t, 4> input{5, 6, 7, 8};
   std::array<std::uint8_t, 4> output{};
   void* ptr = nullptr;
 
-  ExpectSuccess(context, infini::rt::SetDevice(0),
+  ExpectSuccess(context, runtime::SetDevice(0),
                 "NVIDIA dispatch should set device 0.");
-  ExpectSuccess(context, infini::rt::Malloc(&ptr, input.size()),
+  ExpectSuccess(context, runtime::Malloc(&ptr, input.size()),
                 "NVIDIA dispatch should allocate memory.");
   if (ptr == nullptr) {
     return;
   }
 
-  ExpectSuccess(context,
-                infini::rt::MemcpyAsync(
-                    ptr, input.data(), input.size(),
-                    infini::rt::MemcpyKind::kMemcpyHostToDevice, nullptr),
-                "NVIDIA dispatch should support async host-to-device copy.");
-  ExpectSuccess(context, infini::rt::DeviceSynchronize(),
+  ExpectSuccess(
+      context,
+      runtime::MemcpyAsync(ptr, input.data(), input.size(),
+                           runtime::MemcpyKind::kMemcpyHostToDevice, nullptr),
+      "NVIDIA dispatch should support async host-to-device copy.");
+  ExpectSuccess(context, runtime::DeviceSynchronize(),
                 "NVIDIA dispatch should synchronize the device.");
   ExpectSuccess(context,
-                infini::rt::Memcpy(output.data(), ptr, output.size(),
-                                   infini::rt::MemcpyKind::kMemcpyDeviceToHost),
+                runtime::Memcpy(output.data(), ptr, output.size(),
+                                runtime::MemcpyKind::kMemcpyDeviceToHost),
                 "NVIDIA dispatch should copy device data to host.");
-  ExpectSuccess(context, infini::rt::Free(ptr),
+  ExpectSuccess(context, runtime::Free(ptr),
                 "NVIDIA dispatch should free memory.");
 
   context->ExpectEqual(output, input,
@@ -112,8 +97,6 @@ void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
 int main() {
   infini::rt::test::TestContext context;
 
-  TestInvalidDeviceType(&context);
-
 #if defined(INFINI_RT_TEST_WITH_CPU)
   TestCpuDispatch(&context);
 #endif
diff --git a/tests/test_nvidia_runtime.cc b/tests/test_nvidia_runtime.cc
index df5a038..fa10858 100644
--- a/tests/test_nvidia_runtime.cc
+++ b/tests/test_nvidia_runtime.cc
@@ -9,7 +9,8 @@
 
 namespace {
 
-using NvidiaRuntime = infini::rt::Runtime<infini::rt::Device::Type::kNvidia>;
+using NvidiaRuntime =
+    infini::rt::runtime::Runtime<infini::rt::Device::Type::kNvidia>;
 
 void ExpectCudaSuccess(infini::rt::test::TestContext* context,
                        cudaError_t status, const char* message) {

From a26ddffad65819bc0fd2f7aa38bb8cf8a2c8667d Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 2 Jul 2026 09:22:10 +0800
Subject: [PATCH 04/15] Use Abseil status for runtime device API

---
 CMakeLists.txt                       |  22 ++++++
 scripts/generate_public_headers.py   |  24 +++---
 src/CMakeLists.txt                   |   2 +
 src/status.h                         |  21 +++++
 tests/compile_install_consumer.cmake | 111 ++++++++++++++++++++-------
 tests/install_consumer_smoke.cc      |   8 +-
 tests/test_core.cc                   |  14 ++++
 tests/test_default_runtime.cc        |  42 +++++++++-
 8 files changed, 203 insertions(+), 41 deletions(-)
 create mode 100644 src/status.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 38b8d72..525df0d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,7 @@ project(InfiniRT LANGUAGES CXX)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 set(_DEFAULT_HYGON_DTK_ROOT "/opt/dtk")
 
@@ -33,6 +34,27 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)
 
 option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
 option(INFINI_RT_BUILD_TESTING "Build InfiniRT tests" OFF)
+option(INFINI_RT_FETCH_ABSEIL "Fetch Abseil when no package is found" ON)
+
+find_package(absl CONFIG QUIET)
+if(NOT absl_FOUND)
+    if(NOT INFINI_RT_FETCH_ABSEIL)
+        message(FATAL_ERROR "Abseil was not found. Set absl_DIR or enable INFINI_RT_FETCH_ABSEIL.")
+    endif()
+
+    include(FetchContent)
+    set(ABSL_PROPAGATE_CXX_STD ON CACHE BOOL "" FORCE)
+    set(ABSL_ENABLE_INSTALL ON CACHE BOOL "" FORCE)
+    set(ABSL_BUILD_TESTING OFF CACHE BOOL "" FORCE)
+    FetchContent_Declare(
+        abseil
+        GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
+        GIT_TAG 20260526.0
+        GIT_SHALLOW TRUE
+        GIT_PROGRESS TRUE
+    )
+    FetchContent_MakeAvailable(abseil)
+endif()
 
 if(AUTO_DETECT_DEVICES)
     message(STATUS "Auto-detecting available devices...")
diff --git a/scripts/generate_public_headers.py b/scripts/generate_public_headers.py
index 957d0d8..9910d94 100644
--- a/scripts/generate_public_headers.py
+++ b/scripts/generate_public_headers.py
@@ -103,7 +103,7 @@ def _rewrite_detail_include(match):
 
 
 _DETAIL_INCLUDE_PATTERN = re.compile(
-    r'#include "((?:common|native)/[^"]+|data_type\.h|device\.h|dispatcher\.h|hash\.h|runtime\.h|tensor_view\.h)"'
+    r'#include "((?:common|native)/[^"]+|data_type\.h|device\.h|dispatcher\.h|hash\.h|runtime\.h|status\.h|tensor_view\.h)"'
 )
 
 
@@ -137,6 +137,7 @@ def _write_detail_headers(include_root, source_root, devices):
         "dispatcher.h",
         "hash.h",
         "runtime.h",
+        "status.h",
         "tensor_view.h",
     }
 
@@ -165,6 +166,7 @@ def _write_generated_header(include_root, devices):
         f"#include {_detail_include('device.h')}",
         f"#include {_detail_include('hash.h')}",
         f"#include {_detail_include('runtime.h')}",
+        f"#include {_detail_include('status.h')}",
         f"#include {_detail_include('tensor_view.h')}",
     ]
 
@@ -184,9 +186,9 @@ def _write_generated_header(include_root, devices):
 
 namespace infini::rt {{
 
-void set_runtime_device_type(Device::Type device_type);
+Status set_runtime_device_type(Device::Type device_type);
 
-Device::Type runtime_device_type();
+StatusOr<Device::Type> runtime_device_type();
 
 namespace runtime {{
 namespace generated_detail {{
@@ -349,11 +351,15 @@ def _dispatch_cases(devices, function):
 
 def _write_runtime_dispatch_function(function, devices):
     return f"""{function.signature()} {{
-  switch (infini::rt::runtime_device_type()) {{
+  auto device_type = infini::rt::runtime_device_type();
+  if (!device_type.ok()) {{
+    return InvalidValueError();
+  }}
+
+  switch (*device_type) {{
 {_dispatch_cases(devices, function)}
   }}
 
-  assert(false && "unsupported runtime device type");
   return InvalidValueError();
 }}
 """
@@ -368,7 +374,7 @@ def _write_runtime_dispatch(source_path, devices):
     set_device_type_cases = "\n".join(
         f"""    case {_DEVICE_TYPES[device]}:
       runtime_device_type_ = device_type;
-      return;"""
+      return OkStatus();"""
         for device in devices
     )
 
@@ -389,15 +395,15 @@ def _write_runtime_dispatch(source_path, devices):
 
 }}  // namespace
 
-void set_runtime_device_type(Device::Type device_type) {{
+Status set_runtime_device_type(Device::Type device_type) {{
   switch (device_type) {{
 {set_device_type_cases}
   }}
 
-  assert(false && "unsupported runtime device type");
+  return InvalidArgumentError("unsupported runtime device type");
 }}
 
-Device::Type runtime_device_type() {{
+StatusOr<Device::Type> runtime_device_type() {{
   return runtime_device_type_;
 }}
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index dd9c99f..8c0c856 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -7,6 +7,8 @@ target_sources(infinirt PRIVATE
     ${BASE_SRCS}
     ${PROJECT_SOURCE_DIR}/generated/src/runtime_dispatch.cc)
 
+target_link_libraries(infinirt PUBLIC absl::status absl::statusor)
+
 if(WITH_CPU)
     target_compile_definitions(infinirt PUBLIC WITH_CPU=1)
 
diff --git a/src/status.h b/src/status.h
new file mode 100644
index 0000000..3ca670a
--- /dev/null
+++ b/src/status.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_RT_STATUS_H_
+#define INFINI_RT_STATUS_H_
+
+#include <absl/status/status.h>
+#include <absl/status/statusor.h>
+
+namespace infini::rt {
+
+using Status = absl::Status;
+
+using StatusCode = absl::StatusCode;
+
+using absl::InvalidArgumentError;
+using absl::OkStatus;
+
+template <typename T>
+using StatusOr = absl::StatusOr<T>;
+
+}  // namespace infini::rt
+
+#endif
diff --git a/tests/compile_install_consumer.cmake b/tests/compile_install_consumer.cmake
index 1bb96d1..9f4107b 100644
--- a/tests/compile_install_consumer.cmake
+++ b/tests/compile_install_consumer.cmake
@@ -13,51 +13,110 @@ if(NOT EXISTS "${INFINI_RT_LIBRARY_DIR}/libinfinirt.so")
     message(FATAL_ERROR "The installed InfiniRT library was not found.")
 endif()
 
-set(INFINI_RT_EXTRA_LINK_ARGS "")
-set(INFINI_RT_EXTRA_COMPILE_ARGS "")
 set(INFINI_RT_LD_LIBRARY_PATH "${INFINI_RT_LIBRARY_DIR}")
 
+if(INFINI_RT_EXTRA_LIBRARY_PATHS)
+    string(REPLACE ":" ";" INFINI_RT_EXTRA_LIBRARY_DIRS
+           "${INFINI_RT_EXTRA_LIBRARY_PATHS}")
+    foreach(INFINI_RT_EXTRA_LIBRARY_DIR ${INFINI_RT_EXTRA_LIBRARY_DIRS})
+        if(EXISTS "${INFINI_RT_EXTRA_LIBRARY_DIR}")
+            set(INFINI_RT_LD_LIBRARY_PATH
+                "${INFINI_RT_LD_LIBRARY_PATH}:${INFINI_RT_EXTRA_LIBRARY_DIR}")
+        endif()
+    endforeach()
+endif()
+
+get_filename_component(INFINI_RT_CONSUMER_OUTPUT_DIR
+                       "${INFINI_RT_CONSUMER_BINARY}" DIRECTORY)
+get_filename_component(INFINI_RT_CONSUMER_OUTPUT_NAME
+                       "${INFINI_RT_CONSUMER_BINARY}" NAME)
+set(INFINI_RT_CONSUMER_PROJECT_DIR
+    "${INFINI_RT_CONSUMER_BINARY}.project")
+set(INFINI_RT_CONSUMER_BUILD_DIR
+    "${INFINI_RT_CONSUMER_BINARY}.build")
+
+file(MAKE_DIRECTORY "${INFINI_RT_CONSUMER_PROJECT_DIR}")
+file(WRITE "${INFINI_RT_CONSUMER_PROJECT_DIR}/CMakeLists.txt" [=[
+cmake_minimum_required(VERSION 3.18)
+project(InfiniRTInstallConsumer LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+find_package(absl CONFIG REQUIRED)
+
+add_library(infinirt SHARED IMPORTED GLOBAL)
+set_target_properties(infinirt PROPERTIES
+    IMPORTED_LOCATION "${INFINI_RT_LIBRARY_DIR}/libinfinirt.so")
+
+add_executable(install_consumer "${INFINI_RT_CONSUMER_SOURCE}")
+target_include_directories(install_consumer PRIVATE "${INFINI_RT_INCLUDE_DIR}")
+target_compile_options(install_consumer PRIVATE -Werror)
+target_link_libraries(install_consumer
+    PRIVATE
+        infinirt
+        absl::status
+        absl::statusor)
+set_target_properties(install_consumer PROPERTIES
+    OUTPUT_NAME "${INFINI_RT_CONSUMER_OUTPUT_NAME}"
+    RUNTIME_OUTPUT_DIRECTORY "${INFINI_RT_CONSUMER_OUTPUT_DIR}"
+    BUILD_RPATH "${INFINI_RT_LIBRARY_DIR}")
+
+if(INFINI_RT_CONSUMER_BACKEND AND
+   NOT INFINI_RT_CONSUMER_BACKEND STREQUAL "NONE")
+    target_compile_definitions(install_consumer
+        PRIVATE INFINI_RT_CONSUMER_BACKEND_${INFINI_RT_CONSUMER_BACKEND}=1)
+endif()
+
 if(INFINI_RT_EXTRA_INCLUDE_PATHS)
     string(REPLACE ":" ";" INFINI_RT_EXTRA_INCLUDE_DIRS
            "${INFINI_RT_EXTRA_INCLUDE_PATHS}")
-    foreach(INFINI_RT_EXTRA_INCLUDE_DIR ${INFINI_RT_EXTRA_INCLUDE_DIRS})
+    foreach(INFINI_RT_EXTRA_INCLUDE_DIR IN LISTS INFINI_RT_EXTRA_INCLUDE_DIRS)
         if(EXISTS "${INFINI_RT_EXTRA_INCLUDE_DIR}")
-            list(APPEND INFINI_RT_EXTRA_COMPILE_ARGS
-                 "-I${INFINI_RT_EXTRA_INCLUDE_DIR}")
+            target_include_directories(install_consumer
+                PRIVATE "${INFINI_RT_EXTRA_INCLUDE_DIR}")
         endif()
     endforeach()
 endif()
 
-if(INFINI_RT_CONSUMER_BACKEND AND NOT INFINI_RT_CONSUMER_BACKEND STREQUAL "NONE")
-    list(APPEND INFINI_RT_EXTRA_COMPILE_ARGS
-         "-DINFINI_RT_CONSUMER_BACKEND_${INFINI_RT_CONSUMER_BACKEND}=1")
-endif()
-
 if(INFINI_RT_EXTRA_LIBRARY_PATHS)
     string(REPLACE ":" ";" INFINI_RT_EXTRA_LIBRARY_DIRS
            "${INFINI_RT_EXTRA_LIBRARY_PATHS}")
-    foreach(INFINI_RT_EXTRA_LIBRARY_DIR ${INFINI_RT_EXTRA_LIBRARY_DIRS})
+    foreach(INFINI_RT_EXTRA_LIBRARY_DIR IN LISTS INFINI_RT_EXTRA_LIBRARY_DIRS)
         if(EXISTS "${INFINI_RT_EXTRA_LIBRARY_DIR}")
-            list(APPEND INFINI_RT_EXTRA_LINK_ARGS
-                 "-Wl,-rpath-link,${INFINI_RT_EXTRA_LIBRARY_DIR}")
-            set(INFINI_RT_LD_LIBRARY_PATH
-                "${INFINI_RT_LD_LIBRARY_PATH}:${INFINI_RT_EXTRA_LIBRARY_DIR}")
+            target_link_options(install_consumer
+                PRIVATE "-Wl,-rpath-link,${INFINI_RT_EXTRA_LIBRARY_DIR}")
         endif()
     endforeach()
 endif()
+]=])
+
+execute_process(
+    COMMAND "${CMAKE_COMMAND}"
+            -S "${INFINI_RT_CONSUMER_PROJECT_DIR}"
+            -B "${INFINI_RT_CONSUMER_BUILD_DIR}"
+            "-DCMAKE_CXX_COMPILER=${INFINI_RT_CXX_COMPILER}"
+            "-DCMAKE_PREFIX_PATH=${INFINI_RT_INSTALL_PREFIX}"
+            "-DINFINI_RT_INCLUDE_DIR=${INFINI_RT_INCLUDE_DIR}"
+            "-DINFINI_RT_LIBRARY_DIR=${INFINI_RT_LIBRARY_DIR}"
+            "-DINFINI_RT_CONSUMER_SOURCE=${INFINI_RT_CONSUMER_SOURCE}"
+            "-DINFINI_RT_CONSUMER_OUTPUT_DIR=${INFINI_RT_CONSUMER_OUTPUT_DIR}"
+            "-DINFINI_RT_CONSUMER_OUTPUT_NAME=${INFINI_RT_CONSUMER_OUTPUT_NAME}"
+            "-DINFINI_RT_EXTRA_INCLUDE_PATHS=${INFINI_RT_EXTRA_INCLUDE_PATHS}"
+            "-DINFINI_RT_EXTRA_LIBRARY_PATHS=${INFINI_RT_EXTRA_LIBRARY_PATHS}"
+            "-DINFINI_RT_CONSUMER_BACKEND=${INFINI_RT_CONSUMER_BACKEND}"
+    RESULT_VARIABLE INFINI_RT_COMPILE_RESULT
+    OUTPUT_VARIABLE INFINI_RT_COMPILE_OUTPUT
+    ERROR_VARIABLE INFINI_RT_COMPILE_ERROR)
+
+if(NOT INFINI_RT_COMPILE_RESULT EQUAL 0)
+    message(FATAL_ERROR
+            "Configuring the install consumer failed.\n"
+            "${INFINI_RT_COMPILE_OUTPUT}\n${INFINI_RT_COMPILE_ERROR}")
+endif()
 
 execute_process(
-    COMMAND "${INFINI_RT_CXX_COMPILER}"
-            -std=c++17
-            -Werror
-            "-I${INFINI_RT_INCLUDE_DIR}"
-            ${INFINI_RT_EXTRA_COMPILE_ARGS}
-            "${INFINI_RT_CONSUMER_SOURCE}"
-            "-L${INFINI_RT_LIBRARY_DIR}"
-            -linfinirt
-            "-Wl,-rpath,${INFINI_RT_LIBRARY_DIR}"
-            ${INFINI_RT_EXTRA_LINK_ARGS}
-            -o "${INFINI_RT_CONSUMER_BINARY}"
+    COMMAND "${CMAKE_COMMAND}" --build "${INFINI_RT_CONSUMER_BUILD_DIR}"
     RESULT_VARIABLE INFINI_RT_COMPILE_RESULT
     OUTPUT_VARIABLE INFINI_RT_COMPILE_OUTPUT
     ERROR_VARIABLE INFINI_RT_COMPILE_ERROR)
diff --git a/tests/install_consumer_smoke.cc b/tests/install_consumer_smoke.cc
index 14e6c0d..697cff0 100644
--- a/tests/install_consumer_smoke.cc
+++ b/tests/install_consumer_smoke.cc
@@ -27,8 +27,12 @@ int main() {
 #else
   constexpr auto kExpectedDeviceType = infini::rt::Device::Type::kNvidia;
 #endif
-  infini::rt::set_runtime_device_type(kExpectedDeviceType);
-  if (infini::rt::runtime_device_type() != kExpectedDeviceType) {
+  if (!infini::rt::set_runtime_device_type(kExpectedDeviceType).ok()) {
+    return 1;
+  }
+  const auto runtime_device_type = infini::rt::runtime_device_type();
+  if (!runtime_device_type.ok() ||
+      *runtime_device_type != kExpectedDeviceType) {
     return 1;
   }
 
diff --git a/tests/test_core.cc b/tests/test_core.cc
index 1c57c04..68092e1 100644
--- a/tests/test_core.cc
+++ b/tests/test_core.cc
@@ -45,6 +45,19 @@ void TestDataType(infini::rt::test::TestContext* context) {
                        DataType::kUInt16, "uint16 should parse by name.");
 }
 
+void TestStatus(infini::rt::test::TestContext* context) {
+  const infini::rt::Status ok_status = infini::rt::OkStatus();
+  context->Expect(ok_status.ok(), "OkStatus should report success.");
+
+  const infini::rt::Status invalid =
+      infini::rt::InvalidArgumentError("invalid argument");
+  context->Expect(!invalid.ok(), "InvalidArgumentError should report failure.");
+
+  const infini::rt::StatusOr<int> value{1};
+  context->Expect(value.ok(), "StatusOr should hold a value.");
+  context->ExpectEqual(*value, 1, "StatusOr should expose its value.");
+}
+
 void TestTensorView(infini::rt::test::TestContext* context) {
   std::vector<float> data{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   TensorView tensor{data.data(), std::vector<std::size_t>{2, 3},
@@ -89,6 +102,7 @@ int main() {
 
   TestDevice(&context);
   TestDataType(&context);
+  TestStatus(&context);
   TestTensorView(&context);
 
   return context.ExitCode();
diff --git a/tests/test_default_runtime.cc b/tests/test_default_runtime.cc
index bf76a4b..26e62cc 100644
--- a/tests/test_default_runtime.cc
+++ b/tests/test_default_runtime.cc
@@ -15,11 +15,39 @@ void ExpectSuccess(infini::rt::test::TestContext* context,
   context->Expect(status == runtime::kSuccess, message);
 }
 
+void ExpectStatusOk(infini::rt::test::TestContext* context,
+                    const infini::rt::Status& status, const char* message) {
+  context->Expect(status.ok(), message);
+}
+
+void TestInvalidDeviceType(infini::rt::test::TestContext* context) {
+  const auto before = infini::rt::runtime_device_type();
+  context->Expect(before.ok(),
+                  "Default runtime should report its initial device type.");
+
+  const auto status =
+      infini::rt::set_runtime_device_type(infini::rt::Device::Type::kQy);
+  context->Expect(!status.ok(),
+                  "Default runtime should reject disabled device types.");
+
+  const auto after = infini::rt::runtime_device_type();
+  context->Expect(after.ok(),
+                  "Rejected device type should keep runtime state valid.");
+  if (before.ok() && after.ok()) {
+    context->Expect(*after == *before,
+                    "Rejected device type should not change dispatch target.");
+  }
+}
+
 #if defined(INFINI_RT_TEST_WITH_CPU)
 void TestCpuDispatch(infini::rt::test::TestContext* context) {
-  infini::rt::set_runtime_device_type(infini::rt::Device::Type::kCpu);
+  ExpectStatusOk(
+      context,
+      infini::rt::set_runtime_device_type(infini::rt::Device::Type::kCpu),
+      "Default runtime should select CPU dispatch.");
+  const auto device_type = infini::rt::runtime_device_type();
   context->Expect(
-      infini::rt::runtime_device_type() == infini::rt::Device::Type::kCpu,
+      device_type.ok() && *device_type == infini::rt::Device::Type::kCpu,
       "Default runtime should report CPU dispatch.");
 
   std::array<std::uint8_t, 4> input{1, 2, 3, 4};
@@ -56,9 +84,13 @@ void TestCpuDispatch(infini::rt::test::TestContext* context) {
 
 #if defined(INFINI_RT_TEST_WITH_NVIDIA)
 void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
-  infini::rt::set_runtime_device_type(infini::rt::Device::Type::kNvidia);
+  ExpectStatusOk(
+      context,
+      infini::rt::set_runtime_device_type(infini::rt::Device::Type::kNvidia),
+      "Default runtime should select NVIDIA dispatch.");
+  const auto device_type = infini::rt::runtime_device_type();
   context->Expect(
-      infini::rt::runtime_device_type() == infini::rt::Device::Type::kNvidia,
+      device_type.ok() && *device_type == infini::rt::Device::Type::kNvidia,
       "Default runtime should report NVIDIA dispatch.");
 
   std::array<std::uint8_t, 4> input{5, 6, 7, 8};
@@ -97,6 +129,8 @@ void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
 int main() {
   infini::rt::test::TestContext context;
 
+  TestInvalidDeviceType(&context);
+
 #if defined(INFINI_RT_TEST_WITH_CPU)
   TestCpuDispatch(&context);
 #endif

From 1965f98db146772bcc747fed42e6af6b2b04efb4 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 2 Jul 2026 09:29:35 +0800
Subject: [PATCH 05/15] Revert "Use Abseil status for runtime device API"

This reverts commit a26ddffad65819bc0fd2f7aa38bb8cf8a2c8667d.
---
 CMakeLists.txt                       |  22 ------
 scripts/generate_public_headers.py   |  24 +++---
 src/CMakeLists.txt                   |   2 -
 src/status.h                         |  21 -----
 tests/compile_install_consumer.cmake | 111 +++++++--------------------
 tests/install_consumer_smoke.cc      |   8 +-
 tests/test_core.cc                   |  14 ----
 tests/test_default_runtime.cc        |  42 +---------
 8 files changed, 41 insertions(+), 203 deletions(-)
 delete mode 100644 src/status.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 525df0d..38b8d72 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,6 @@ project(InfiniRT LANGUAGES CXX)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 set(_DEFAULT_HYGON_DTK_ROOT "/opt/dtk")
 
@@ -34,27 +33,6 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)
 
 option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
 option(INFINI_RT_BUILD_TESTING "Build InfiniRT tests" OFF)
-option(INFINI_RT_FETCH_ABSEIL "Fetch Abseil when no package is found" ON)
-
-find_package(absl CONFIG QUIET)
-if(NOT absl_FOUND)
-    if(NOT INFINI_RT_FETCH_ABSEIL)
-        message(FATAL_ERROR "Abseil was not found. Set absl_DIR or enable INFINI_RT_FETCH_ABSEIL.")
-    endif()
-
-    include(FetchContent)
-    set(ABSL_PROPAGATE_CXX_STD ON CACHE BOOL "" FORCE)
-    set(ABSL_ENABLE_INSTALL ON CACHE BOOL "" FORCE)
-    set(ABSL_BUILD_TESTING OFF CACHE BOOL "" FORCE)
-    FetchContent_Declare(
-        abseil
-        GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
-        GIT_TAG 20260526.0
-        GIT_SHALLOW TRUE
-        GIT_PROGRESS TRUE
-    )
-    FetchContent_MakeAvailable(abseil)
-endif()
 
 if(AUTO_DETECT_DEVICES)
     message(STATUS "Auto-detecting available devices...")
diff --git a/scripts/generate_public_headers.py b/scripts/generate_public_headers.py
index 9910d94..957d0d8 100644
--- a/scripts/generate_public_headers.py
+++ b/scripts/generate_public_headers.py
@@ -103,7 +103,7 @@ def _rewrite_detail_include(match):
 
 
 _DETAIL_INCLUDE_PATTERN = re.compile(
-    r'#include "((?:common|native)/[^"]+|data_type\.h|device\.h|dispatcher\.h|hash\.h|runtime\.h|status\.h|tensor_view\.h)"'
+    r'#include "((?:common|native)/[^"]+|data_type\.h|device\.h|dispatcher\.h|hash\.h|runtime\.h|tensor_view\.h)"'
 )
 
 
@@ -137,7 +137,6 @@ def _write_detail_headers(include_root, source_root, devices):
         "dispatcher.h",
         "hash.h",
         "runtime.h",
-        "status.h",
         "tensor_view.h",
     }
 
@@ -166,7 +165,6 @@ def _write_generated_header(include_root, devices):
         f"#include {_detail_include('device.h')}",
         f"#include {_detail_include('hash.h')}",
         f"#include {_detail_include('runtime.h')}",
-        f"#include {_detail_include('status.h')}",
         f"#include {_detail_include('tensor_view.h')}",
     ]
 
@@ -186,9 +184,9 @@ def _write_generated_header(include_root, devices):
 
 namespace infini::rt {{
 
-Status set_runtime_device_type(Device::Type device_type);
+void set_runtime_device_type(Device::Type device_type);
 
-StatusOr<Device::Type> runtime_device_type();
+Device::Type runtime_device_type();
 
 namespace runtime {{
 namespace generated_detail {{
@@ -351,15 +349,11 @@ def _dispatch_cases(devices, function):
 
 def _write_runtime_dispatch_function(function, devices):
     return f"""{function.signature()} {{
-  auto device_type = infini::rt::runtime_device_type();
-  if (!device_type.ok()) {{
-    return InvalidValueError();
-  }}
-
-  switch (*device_type) {{
+  switch (infini::rt::runtime_device_type()) {{
 {_dispatch_cases(devices, function)}
   }}
 
+  assert(false && "unsupported runtime device type");
   return InvalidValueError();
 }}
 """
@@ -374,7 +368,7 @@ def _write_runtime_dispatch(source_path, devices):
     set_device_type_cases = "\n".join(
         f"""    case {_DEVICE_TYPES[device]}:
       runtime_device_type_ = device_type;
-      return OkStatus();"""
+      return;"""
         for device in devices
     )
 
@@ -395,15 +389,15 @@ def _write_runtime_dispatch(source_path, devices):
 
 }}  // namespace
 
-Status set_runtime_device_type(Device::Type device_type) {{
+void set_runtime_device_type(Device::Type device_type) {{
   switch (device_type) {{
 {set_device_type_cases}
   }}
 
-  return InvalidArgumentError("unsupported runtime device type");
+  assert(false && "unsupported runtime device type");
 }}
 
-StatusOr<Device::Type> runtime_device_type() {{
+Device::Type runtime_device_type() {{
   return runtime_device_type_;
 }}
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8c0c856..dd9c99f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -7,8 +7,6 @@ target_sources(infinirt PRIVATE
     ${BASE_SRCS}
     ${PROJECT_SOURCE_DIR}/generated/src/runtime_dispatch.cc)
 
-target_link_libraries(infinirt PUBLIC absl::status absl::statusor)
-
 if(WITH_CPU)
     target_compile_definitions(infinirt PUBLIC WITH_CPU=1)
 
diff --git a/src/status.h b/src/status.h
deleted file mode 100644
index 3ca670a..0000000
--- a/src/status.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef INFINI_RT_STATUS_H_
-#define INFINI_RT_STATUS_H_
-
-#include <absl/status/status.h>
-#include <absl/status/statusor.h>
-
-namespace infini::rt {
-
-using Status = absl::Status;
-
-using StatusCode = absl::StatusCode;
-
-using absl::InvalidArgumentError;
-using absl::OkStatus;
-
-template <typename T>
-using StatusOr = absl::StatusOr<T>;
-
-}  // namespace infini::rt
-
-#endif
diff --git a/tests/compile_install_consumer.cmake b/tests/compile_install_consumer.cmake
index 9f4107b..1bb96d1 100644
--- a/tests/compile_install_consumer.cmake
+++ b/tests/compile_install_consumer.cmake
@@ -13,110 +13,51 @@ if(NOT EXISTS "${INFINI_RT_LIBRARY_DIR}/libinfinirt.so")
     message(FATAL_ERROR "The installed InfiniRT library was not found.")
 endif()
 
+set(INFINI_RT_EXTRA_LINK_ARGS "")
+set(INFINI_RT_EXTRA_COMPILE_ARGS "")
 set(INFINI_RT_LD_LIBRARY_PATH "${INFINI_RT_LIBRARY_DIR}")
 
-if(INFINI_RT_EXTRA_LIBRARY_PATHS)
-    string(REPLACE ":" ";" INFINI_RT_EXTRA_LIBRARY_DIRS
-           "${INFINI_RT_EXTRA_LIBRARY_PATHS}")
-    foreach(INFINI_RT_EXTRA_LIBRARY_DIR ${INFINI_RT_EXTRA_LIBRARY_DIRS})
-        if(EXISTS "${INFINI_RT_EXTRA_LIBRARY_DIR}")
-            set(INFINI_RT_LD_LIBRARY_PATH
-                "${INFINI_RT_LD_LIBRARY_PATH}:${INFINI_RT_EXTRA_LIBRARY_DIR}")
-        endif()
-    endforeach()
-endif()
-
-get_filename_component(INFINI_RT_CONSUMER_OUTPUT_DIR
-                       "${INFINI_RT_CONSUMER_BINARY}" DIRECTORY)
-get_filename_component(INFINI_RT_CONSUMER_OUTPUT_NAME
-                       "${INFINI_RT_CONSUMER_BINARY}" NAME)
-set(INFINI_RT_CONSUMER_PROJECT_DIR
-    "${INFINI_RT_CONSUMER_BINARY}.project")
-set(INFINI_RT_CONSUMER_BUILD_DIR
-    "${INFINI_RT_CONSUMER_BINARY}.build")
-
-file(MAKE_DIRECTORY "${INFINI_RT_CONSUMER_PROJECT_DIR}")
-file(WRITE "${INFINI_RT_CONSUMER_PROJECT_DIR}/CMakeLists.txt" [=[
-cmake_minimum_required(VERSION 3.18)
-project(InfiniRTInstallConsumer LANGUAGES CXX)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-find_package(absl CONFIG REQUIRED)
-
-add_library(infinirt SHARED IMPORTED GLOBAL)
-set_target_properties(infinirt PROPERTIES
-    IMPORTED_LOCATION "${INFINI_RT_LIBRARY_DIR}/libinfinirt.so")
-
-add_executable(install_consumer "${INFINI_RT_CONSUMER_SOURCE}")
-target_include_directories(install_consumer PRIVATE "${INFINI_RT_INCLUDE_DIR}")
-target_compile_options(install_consumer PRIVATE -Werror)
-target_link_libraries(install_consumer
-    PRIVATE
-        infinirt
-        absl::status
-        absl::statusor)
-set_target_properties(install_consumer PROPERTIES
-    OUTPUT_NAME "${INFINI_RT_CONSUMER_OUTPUT_NAME}"
-    RUNTIME_OUTPUT_DIRECTORY "${INFINI_RT_CONSUMER_OUTPUT_DIR}"
-    BUILD_RPATH "${INFINI_RT_LIBRARY_DIR}")
-
-if(INFINI_RT_CONSUMER_BACKEND AND
-   NOT INFINI_RT_CONSUMER_BACKEND STREQUAL "NONE")
-    target_compile_definitions(install_consumer
-        PRIVATE INFINI_RT_CONSUMER_BACKEND_${INFINI_RT_CONSUMER_BACKEND}=1)
-endif()
-
 if(INFINI_RT_EXTRA_INCLUDE_PATHS)
     string(REPLACE ":" ";" INFINI_RT_EXTRA_INCLUDE_DIRS
            "${INFINI_RT_EXTRA_INCLUDE_PATHS}")
-    foreach(INFINI_RT_EXTRA_INCLUDE_DIR IN LISTS INFINI_RT_EXTRA_INCLUDE_DIRS)
+    foreach(INFINI_RT_EXTRA_INCLUDE_DIR ${INFINI_RT_EXTRA_INCLUDE_DIRS})
         if(EXISTS "${INFINI_RT_EXTRA_INCLUDE_DIR}")
-            target_include_directories(install_consumer
-                PRIVATE "${INFINI_RT_EXTRA_INCLUDE_DIR}")
+            list(APPEND INFINI_RT_EXTRA_COMPILE_ARGS
+                 "-I${INFINI_RT_EXTRA_INCLUDE_DIR}")
         endif()
     endforeach()
 endif()
 
+if(INFINI_RT_CONSUMER_BACKEND AND NOT INFINI_RT_CONSUMER_BACKEND STREQUAL "NONE")
+    list(APPEND INFINI_RT_EXTRA_COMPILE_ARGS
+         "-DINFINI_RT_CONSUMER_BACKEND_${INFINI_RT_CONSUMER_BACKEND}=1")
+endif()
+
 if(INFINI_RT_EXTRA_LIBRARY_PATHS)
     string(REPLACE ":" ";" INFINI_RT_EXTRA_LIBRARY_DIRS
            "${INFINI_RT_EXTRA_LIBRARY_PATHS}")
-    foreach(INFINI_RT_EXTRA_LIBRARY_DIR IN LISTS INFINI_RT_EXTRA_LIBRARY_DIRS)
+    foreach(INFINI_RT_EXTRA_LIBRARY_DIR ${INFINI_RT_EXTRA_LIBRARY_DIRS})
         if(EXISTS "${INFINI_RT_EXTRA_LIBRARY_DIR}")
-            target_link_options(install_consumer
-                PRIVATE "-Wl,-rpath-link,${INFINI_RT_EXTRA_LIBRARY_DIR}")
+            list(APPEND INFINI_RT_EXTRA_LINK_ARGS
+                 "-Wl,-rpath-link,${INFINI_RT_EXTRA_LIBRARY_DIR}")
+            set(INFINI_RT_LD_LIBRARY_PATH
+                "${INFINI_RT_LD_LIBRARY_PATH}:${INFINI_RT_EXTRA_LIBRARY_DIR}")
         endif()
     endforeach()
 endif()
-]=])
-
-execute_process(
-    COMMAND "${CMAKE_COMMAND}"
-            -S "${INFINI_RT_CONSUMER_PROJECT_DIR}"
-            -B "${INFINI_RT_CONSUMER_BUILD_DIR}"
-            "-DCMAKE_CXX_COMPILER=${INFINI_RT_CXX_COMPILER}"
-            "-DCMAKE_PREFIX_PATH=${INFINI_RT_INSTALL_PREFIX}"
-            "-DINFINI_RT_INCLUDE_DIR=${INFINI_RT_INCLUDE_DIR}"
-            "-DINFINI_RT_LIBRARY_DIR=${INFINI_RT_LIBRARY_DIR}"
-            "-DINFINI_RT_CONSUMER_SOURCE=${INFINI_RT_CONSUMER_SOURCE}"
-            "-DINFINI_RT_CONSUMER_OUTPUT_DIR=${INFINI_RT_CONSUMER_OUTPUT_DIR}"
-            "-DINFINI_RT_CONSUMER_OUTPUT_NAME=${INFINI_RT_CONSUMER_OUTPUT_NAME}"
-            "-DINFINI_RT_EXTRA_INCLUDE_PATHS=${INFINI_RT_EXTRA_INCLUDE_PATHS}"
-            "-DINFINI_RT_EXTRA_LIBRARY_PATHS=${INFINI_RT_EXTRA_LIBRARY_PATHS}"
-            "-DINFINI_RT_CONSUMER_BACKEND=${INFINI_RT_CONSUMER_BACKEND}"
-    RESULT_VARIABLE INFINI_RT_COMPILE_RESULT
-    OUTPUT_VARIABLE INFINI_RT_COMPILE_OUTPUT
-    ERROR_VARIABLE INFINI_RT_COMPILE_ERROR)
-
-if(NOT INFINI_RT_COMPILE_RESULT EQUAL 0)
-    message(FATAL_ERROR
-            "Configuring the install consumer failed.\n"
-            "${INFINI_RT_COMPILE_OUTPUT}\n${INFINI_RT_COMPILE_ERROR}")
-endif()
 
 execute_process(
-    COMMAND "${CMAKE_COMMAND}" --build "${INFINI_RT_CONSUMER_BUILD_DIR}"
+    COMMAND "${INFINI_RT_CXX_COMPILER}"
+            -std=c++17
+            -Werror
+            "-I${INFINI_RT_INCLUDE_DIR}"
+            ${INFINI_RT_EXTRA_COMPILE_ARGS}
+            "${INFINI_RT_CONSUMER_SOURCE}"
+            "-L${INFINI_RT_LIBRARY_DIR}"
+            -linfinirt
+            "-Wl,-rpath,${INFINI_RT_LIBRARY_DIR}"
+            ${INFINI_RT_EXTRA_LINK_ARGS}
+            -o "${INFINI_RT_CONSUMER_BINARY}"
     RESULT_VARIABLE INFINI_RT_COMPILE_RESULT
     OUTPUT_VARIABLE INFINI_RT_COMPILE_OUTPUT
     ERROR_VARIABLE INFINI_RT_COMPILE_ERROR)
diff --git a/tests/install_consumer_smoke.cc b/tests/install_consumer_smoke.cc
index 697cff0..14e6c0d 100644
--- a/tests/install_consumer_smoke.cc
+++ b/tests/install_consumer_smoke.cc
@@ -27,12 +27,8 @@ int main() {
 #else
   constexpr auto kExpectedDeviceType = infini::rt::Device::Type::kNvidia;
 #endif
-  if (!infini::rt::set_runtime_device_type(kExpectedDeviceType).ok()) {
-    return 1;
-  }
-  const auto runtime_device_type = infini::rt::runtime_device_type();
-  if (!runtime_device_type.ok() ||
-      *runtime_device_type != kExpectedDeviceType) {
+  infini::rt::set_runtime_device_type(kExpectedDeviceType);
+  if (infini::rt::runtime_device_type() != kExpectedDeviceType) {
     return 1;
   }
 
diff --git a/tests/test_core.cc b/tests/test_core.cc
index 68092e1..1c57c04 100644
--- a/tests/test_core.cc
+++ b/tests/test_core.cc
@@ -45,19 +45,6 @@ void TestDataType(infini::rt::test::TestContext* context) {
                        DataType::kUInt16, "uint16 should parse by name.");
 }
 
-void TestStatus(infini::rt::test::TestContext* context) {
-  const infini::rt::Status ok_status = infini::rt::OkStatus();
-  context->Expect(ok_status.ok(), "OkStatus should report success.");
-
-  const infini::rt::Status invalid =
-      infini::rt::InvalidArgumentError("invalid argument");
-  context->Expect(!invalid.ok(), "InvalidArgumentError should report failure.");
-
-  const infini::rt::StatusOr<int> value{1};
-  context->Expect(value.ok(), "StatusOr should hold a value.");
-  context->ExpectEqual(*value, 1, "StatusOr should expose its value.");
-}
-
 void TestTensorView(infini::rt::test::TestContext* context) {
   std::vector<float> data{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   TensorView tensor{data.data(), std::vector<std::size_t>{2, 3},
@@ -102,7 +89,6 @@ int main() {
 
   TestDevice(&context);
   TestDataType(&context);
-  TestStatus(&context);
   TestTensorView(&context);
 
   return context.ExitCode();
diff --git a/tests/test_default_runtime.cc b/tests/test_default_runtime.cc
index 26e62cc..bf76a4b 100644
--- a/tests/test_default_runtime.cc
+++ b/tests/test_default_runtime.cc
@@ -15,39 +15,11 @@ void ExpectSuccess(infini::rt::test::TestContext* context,
   context->Expect(status == runtime::kSuccess, message);
 }
 
-void ExpectStatusOk(infini::rt::test::TestContext* context,
-                    const infini::rt::Status& status, const char* message) {
-  context->Expect(status.ok(), message);
-}
-
-void TestInvalidDeviceType(infini::rt::test::TestContext* context) {
-  const auto before = infini::rt::runtime_device_type();
-  context->Expect(before.ok(),
-                  "Default runtime should report its initial device type.");
-
-  const auto status =
-      infini::rt::set_runtime_device_type(infini::rt::Device::Type::kQy);
-  context->Expect(!status.ok(),
-                  "Default runtime should reject disabled device types.");
-
-  const auto after = infini::rt::runtime_device_type();
-  context->Expect(after.ok(),
-                  "Rejected device type should keep runtime state valid.");
-  if (before.ok() && after.ok()) {
-    context->Expect(*after == *before,
-                    "Rejected device type should not change dispatch target.");
-  }
-}
-
 #if defined(INFINI_RT_TEST_WITH_CPU)
 void TestCpuDispatch(infini::rt::test::TestContext* context) {
-  ExpectStatusOk(
-      context,
-      infini::rt::set_runtime_device_type(infini::rt::Device::Type::kCpu),
-      "Default runtime should select CPU dispatch.");
-  const auto device_type = infini::rt::runtime_device_type();
+  infini::rt::set_runtime_device_type(infini::rt::Device::Type::kCpu);
   context->Expect(
-      device_type.ok() && *device_type == infini::rt::Device::Type::kCpu,
+      infini::rt::runtime_device_type() == infini::rt::Device::Type::kCpu,
       "Default runtime should report CPU dispatch.");
 
   std::array<std::uint8_t, 4> input{1, 2, 3, 4};
@@ -84,13 +56,9 @@ void TestCpuDispatch(infini::rt::test::TestContext* context) {
 
 #if defined(INFINI_RT_TEST_WITH_NVIDIA)
 void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
-  ExpectStatusOk(
-      context,
-      infini::rt::set_runtime_device_type(infini::rt::Device::Type::kNvidia),
-      "Default runtime should select NVIDIA dispatch.");
-  const auto device_type = infini::rt::runtime_device_type();
+  infini::rt::set_runtime_device_type(infini::rt::Device::Type::kNvidia);
   context->Expect(
-      device_type.ok() && *device_type == infini::rt::Device::Type::kNvidia,
+      infini::rt::runtime_device_type() == infini::rt::Device::Type::kNvidia,
       "Default runtime should report NVIDIA dispatch.");
 
   std::array<std::uint8_t, 4> input{5, 6, 7, 8};
@@ -129,8 +97,6 @@ void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
 int main() {
   infini::rt::test::TestContext context;
 
-  TestInvalidDeviceType(&context);
-
 #if defined(INFINI_RT_TEST_WITH_CPU)
   TestCpuDispatch(&context);
 #endif

From 1fd4ef4b7c674f774729753b0dc83c314dd9643a Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 2 Jul 2026 10:13:52 +0800
Subject: [PATCH 06/15] Address runtime dispatch review feedback

---
 README.md                                     |  38 ++--
 scripts/generate_public_headers.py            | 176 ++++++++++--------
 src/native/cuda/nvidia/runtime_.h             |  32 ++--
 src/tensor_view.h                             |  22 +--
 tests/CMakeLists.txt                          |   6 +-
 tests/test_core.cc                            |   4 -
 ...lt_runtime.cc => test_runtime_dispatch.cc} |  62 +++++-
 7 files changed, 200 insertions(+), 140 deletions(-)
 rename tests/{test_default_runtime.cc => test_runtime_dispatch.cc} (60%)

diff --git a/README.md b/README.md
index 33fbe90..51099fc 100644
--- a/README.md
+++ b/README.md
@@ -66,39 +66,45 @@ cmake --install build
 #include <infini/rt.h>
 
 int main() {
-  infini::rt::SetDevice(0);
+  namespace runtime = infini::rt::runtime;
+
+  runtime::SetDevice(0);
 
   constexpr std::size_t size = 1024;
   void* ptr = nullptr;
 
-  infini::rt::Malloc(&ptr, size);
-  infini::rt::Memset(ptr, 0, size);
-  infini::rt::Free(ptr);
+  runtime::Malloc(&ptr, size);
+  runtime::Memset(ptr, 0, size);
+  runtime::Free(ptr);
 
   return 0;
 }
 ```
 
-The top-level runtime API dispatches through `infini::rt::Runtime<>`, which is
-the `Runtime<Device::Type::kCount>` default specialization. A GPU backend is
-selected initially when one is enabled; otherwise CPU is selected.
-`SetDeviceType` accepts only backends enabled in the current build.
+The CUDA Runtime API-aligned layer lives under `infini::rt::runtime`. The
+top-level `infini::rt::set_runtime_device_type` and
+`infini::rt::runtime_device_type` APIs select which enabled backend receives
+those runtime calls. A GPU backend is selected initially when one is enabled;
+otherwise CPU is selected.
 
 ```cpp
+namespace runtime = infini::rt::runtime;
+
 constexpr std::size_t size = 1024;
 void* ptr = nullptr;
 
-infini::rt::Runtime<>::SetDeviceType(infini::rt::Device::Type::kCpu);
-infini::rt::Malloc(&ptr, size);
-infini::rt::Free(ptr);
+infini::rt::set_runtime_device_type(infini::rt::Device::Type::kCpu);
+runtime::Malloc(&ptr, size);
+runtime::Free(ptr);
 
-infini::rt::Runtime<>::SetDeviceType(infini::rt::Device::Type::kNvidia);
-infini::rt::Malloc(&ptr, size);
-infini::rt::Free(ptr);
+infini::rt::set_runtime_device_type(infini::rt::Device::Type::kNvidia);
+runtime::Malloc(&ptr, size);
+runtime::Free(ptr);
 ```
 
-Use `infini::rt::Runtime<infini::rt::Device::Type::kCpu>` when CPU runtime
-calls are needed explicitly in a build that also enables an accelerator backend.
+Use `infini::rt::runtime::Runtime<infini::rt::Device::Type::kCpu>` when CPU
+runtime calls are needed explicitly in a build that also enables an accelerator
+backend.
 
 ## Using Installed InfiniRT From Another Project
 
diff --git a/scripts/generate_public_headers.py b/scripts/generate_public_headers.py
index 957d0d8..d2eeca9 100644
--- a/scripts/generate_public_headers.py
+++ b/scripts/generate_public_headers.py
@@ -71,6 +71,13 @@
     "cpu",
 )
 
+_NVIDIA_RUNTIME_HEADER = "native/cuda/nvidia/runtime_.h"
+
+_PUBLIC_TYPE_NAMES = {
+    "cudaMemcpyKind": "MemcpyKind",
+    "size_t": "std::size_t",
+}
+
 
 def _guard(path):
     token = "_".join(path.parts).replace(".", "_").upper()
@@ -156,7 +163,8 @@ def _write_detail_headers(include_root, source_root, devices):
         _write_detail_header(include_root, source_root, relative_path)
 
 
-def _write_generated_header(include_root, devices):
+def _write_generated_header(include_root, source_root, devices):
+    runtime_api = _parse_nvidia_runtime_api(source_root)
     default_device = _default_device(devices)
     default_device_type = _DEVICE_TYPES[default_device]
     includes = [
@@ -174,6 +182,19 @@ def _write_generated_header(include_root, devices):
     for device in devices:
         includes.append(f"#include <infini/rt/{device}/runtime_.h>")
 
+    runtime_aliases = "\n\n".join(
+        f"using {alias} = typename generated_detail::DefaultRuntime::{alias};"
+        for alias in runtime_api.aliases
+    )
+    memcpy_kind_values = "\n".join(
+        f"""  {constant} =
+      static_cast<int>(generated_detail::DefaultRuntime::{constant}),"""
+        for constant in runtime_api.memcpy_kind_constants
+    )
+    runtime_declarations = "\n\n".join(
+        f"{function.signature()};" for function in runtime_api.functions
+    )
+
     path = include_root / "infini" / "rt" / "generated.h"
     path.parent.mkdir(parents=True, exist_ok=True)
     path.write_text(
@@ -191,47 +212,21 @@ def _write_generated_header(include_root, devices):
 namespace runtime {{
 namespace generated_detail {{
 
-using DefaultErrorRuntime = Runtime<{default_device_type}>;
+using DefaultRuntime = Runtime<{default_device_type}>;
 
 inline constexpr Device::Type kDefaultDeviceType = {default_device_type};
 
 }}  // namespace generated_detail
 
-using Error = typename generated_detail::DefaultErrorRuntime::Error;
-
-using Stream = typename generated_detail::DefaultErrorRuntime::Stream;
+{runtime_aliases}
 
-inline constexpr Error kSuccess = generated_detail::DefaultErrorRuntime::kSuccess;
+inline constexpr Error kSuccess = generated_detail::DefaultRuntime::kSuccess;
 
 enum class MemcpyKind {{
-  kMemcpyHostToHost =
-      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyHostToHost),
-  kMemcpyHostToDevice =
-      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyHostToDevice),
-  kMemcpyDeviceToHost =
-      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyDeviceToHost),
-  kMemcpyDeviceToDevice =
-      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyDeviceToDevice),
+{memcpy_kind_values}
 }};
 
-Error SetDevice(int device);
-
-Error GetDevice(int* device);
-
-Error GetDeviceCount(int* count);
-
-Error DeviceSynchronize();
-
-Error Malloc(void** ptr, std::size_t size);
-
-Error Free(void* ptr);
-
-Error Memset(void* ptr, int value, std::size_t count);
-
-Error Memcpy(void* dst, const void* src, std::size_t count, MemcpyKind kind);
-
-Error MemcpyAsync(void* dst, const void* src, std::size_t count,
-                  MemcpyKind kind, Stream stream);
+{runtime_declarations}
 
 }}  // namespace runtime
 }}  // namespace infini::rt
@@ -260,48 +255,73 @@ def params_decl(self):
         return ", ".join(f"{param.type} {param.name}" for param in self.params)
 
 
-_PUBLIC_RUNTIME_FUNCTIONS = (
-    _Function("Error", "SetDevice", (_Param("int", "device"),)),
-    _Function("Error", "GetDevice", (_Param("int*", "device"),)),
-    _Function("Error", "GetDeviceCount", (_Param("int*", "count"),)),
-    _Function("Error", "DeviceSynchronize", ()),
-    _Function(
-        "Error",
-        "Malloc",
-        (_Param("void**", "ptr"), _Param("std::size_t", "size")),
-    ),
-    _Function("Error", "Free", (_Param("void*", "ptr"),)),
-    _Function(
-        "Error",
-        "Memset",
-        (
-            _Param("void*", "ptr"),
-            _Param("int", "value"),
-            _Param("std::size_t", "count"),
-        ),
-    ),
-    _Function(
-        "Error",
-        "Memcpy",
-        (
-            _Param("void*", "dst"),
-            _Param("const void*", "src"),
-            _Param("std::size_t", "count"),
-            _Param("MemcpyKind", "kind"),
-        ),
-    ),
-    _Function(
-        "Error",
-        "MemcpyAsync",
-        (
-            _Param("void*", "dst"),
-            _Param("const void*", "src"),
-            _Param("std::size_t", "count"),
-            _Param("MemcpyKind", "kind"),
-            _Param("Stream", "stream"),
-        ),
-    ),
-)
+@dataclasses.dataclass(frozen=True)
+class _RuntimeApi:
+    aliases: tuple[str, ...]
+    memcpy_kind_constants: tuple[str, ...]
+    functions: tuple[_Function, ...]
+
+
+def _runtime_struct_body(source_root):
+    text = (source_root / _NVIDIA_RUNTIME_HEADER).read_text()
+    match = re.search(
+        r"struct Runtime<Device::Type::kNvidia>.*?\{\n(?P<body>.*?)\n\};",
+        text,
+        flags=re.DOTALL,
+    )
+    if match is None:
+        raise ValueError("could not find NVIDIA Runtime specialization")
+
+    return match.group("body")
+
+
+def _public_type_name(name):
+    return _PUBLIC_TYPE_NAMES.get(name, name)
+
+
+def _parse_param(param):
+    param_type, param_name = " ".join(param.strip().split()).rsplit(" ", 1)
+
+    return _Param(_public_type_name(param_type), param_name)
+
+
+def _parse_function(match):
+    params = match.group("params").strip()
+    return _Function(
+        _public_type_name(match.group("return_type")),
+        match.group("name"),
+        tuple(_parse_param(param) for param in re.split(r"\s*,\s*", params) if param),
+    )
+
+
+def _parse_nvidia_runtime_api(source_root):
+    body = _runtime_struct_body(source_root)
+    aliases = tuple(
+        alias
+        for alias in re.findall(r"^\s+using\s+(\w+)\s*=", body, flags=re.MULTILINE)
+        if alias in {"Error", "Stream"}
+    )
+    memcpy_kind_constants = tuple(
+        re.findall(
+            r"^\s+static constexpr auto (kMemcpy\w+)\s*=",
+            body,
+            flags=re.MULTILINE,
+        )
+    )
+    functions = tuple(
+        _parse_function(match)
+        for match in re.finditer(
+            r"^\s+static\s+(?P<return_type>\w+)\s+"
+            r"(?P<name>[A-Z]\w*)\((?P<params>[^)]*)\)\s*\{",
+            body,
+            flags=re.MULTILINE,
+        )
+    )
+
+    if not aliases or not memcpy_kind_constants or not functions:
+        raise ValueError("NVIDIA Runtime specialization has no public API")
+
+    return _RuntimeApi(aliases, memcpy_kind_constants, functions)
 
 
 def _default_device(devices):
@@ -359,8 +379,8 @@ def _write_runtime_dispatch_function(function, devices):
 """
 
 
-def _write_runtime_dispatch(source_path, devices):
-    functions = _PUBLIC_RUNTIME_FUNCTIONS
+def _write_runtime_dispatch(source_path, source_root, devices):
+    functions = _parse_nvidia_runtime_api(source_root).functions
     dispatch_functions = "\n".join(
         _write_runtime_dispatch_function(function, devices=devices)
         for function in functions
@@ -470,8 +490,8 @@ def main():
         for wrapper_device, header_name, target in _DEVICE_HEADERS[device]:
             _write_wrapper(include_root, wrapper_device, header_name, target)
 
-    _write_generated_header(include_root, devices)
-    _write_runtime_dispatch(pathlib.Path(args.source_output), devices)
+    _write_generated_header(include_root, source_root, devices)
+    _write_runtime_dispatch(pathlib.Path(args.source_output), source_root, devices)
 
 
 if __name__ == "__main__":
diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h
index 1786e08..1ce61f7 100644
--- a/src/native/cuda/nvidia/runtime_.h
+++ b/src/native/cuda/nvidia/runtime_.h
@@ -1,7 +1,7 @@
 #ifndef INFINI_RT_NVIDIA_RUNTIME__H_
 #define INFINI_RT_NVIDIA_RUNTIME__H_
 
-#include <utility>
+#include <cstddef>
 
 // clang-format off
 #include <cuda_runtime.h>
@@ -23,23 +23,29 @@ struct Runtime<Device::Type::kNvidia>
 
   static constexpr Error kSuccess = cudaSuccess;
 
-  static constexpr auto SetDevice = cudaSetDevice;
+  static Error SetDevice(int device) { return cudaSetDevice(device); }
 
-  static constexpr auto GetDevice = cudaGetDevice;
+  static Error GetDevice(int* device) { return cudaGetDevice(device); }
 
-  static constexpr auto GetDeviceCount = cudaGetDeviceCount;
+  static Error GetDeviceCount(int* count) { return cudaGetDeviceCount(count); }
 
-  static constexpr auto DeviceSynchronize = cudaDeviceSynchronize;
+  static Error DeviceSynchronize() { return cudaDeviceSynchronize(); }
 
-  static constexpr auto Malloc = [](auto&&... args) {
-    return cudaMalloc(std::forward<decltype(args)>(args)...);
-  };
+  static Error Malloc(void** ptr, std::size_t size) {
+    return cudaMalloc(ptr, size);
+  }
 
-  static constexpr auto Memcpy = cudaMemcpy;
+  static Error Memcpy(void* dst, const void* src, std::size_t count,
+                      cudaMemcpyKind kind) {
+    return cudaMemcpy(dst, src, count, kind);
+  }
 
-  static constexpr auto MemcpyAsync = cudaMemcpyAsync;
+  static Error MemcpyAsync(void* dst, const void* src, std::size_t count,
+                           cudaMemcpyKind kind, Stream stream) {
+    return cudaMemcpyAsync(dst, src, count, kind, stream);
+  }
 
-  static constexpr auto Free = cudaFree;
+  static Error Free(void* ptr) { return cudaFree(ptr); }
 
   static constexpr auto kMemcpyHostToHost = cudaMemcpyHostToHost;
 
@@ -49,7 +55,9 @@ struct Runtime<Device::Type::kNvidia>
 
   static constexpr auto kMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
 
-  static constexpr auto Memset = cudaMemset;
+  static Error Memset(void* ptr, int value, std::size_t count) {
+    return cudaMemset(ptr, value, count);
+  }
 };
 
 static_assert(Runtime<Device::Type::kNvidia>::Validate());
diff --git a/src/tensor_view.h b/src/tensor_view.h
index dcf7cc9..0b6fc59 100644
--- a/src/tensor_view.h
+++ b/src/tensor_view.h
@@ -3,8 +3,6 @@
 
 #include <cstdint>
 #include <string>
-#include <type_traits>
-#include <utility>
 #include <vector>
 
 #include "data_type.h"
@@ -13,22 +11,6 @@
 
 namespace infini::rt {
 
-namespace tensor_view_detail {
-
-template <typename T, typename = void>
-struct IsTensorLike : std::false_type {};
-
-template <typename T>
-struct IsTensorLike<T,
-                    std::void_t<decltype(std::declval<const T&>().data()),
-                                decltype(std::declval<const T&>().shape()),
-                                decltype(std::declval<const T&>().dtype()),
-                                decltype(std::declval<const T&>().device()),
-                                decltype(std::declval<const T&>().strides())>>
-    : std::true_type {};
-
-}  // namespace tensor_view_detail
-
 class TensorView {
  public:
   using Size = std::size_t;
@@ -41,9 +23,7 @@ class TensorView {
 
   using Strides = std::vector<Stride>;
 
-  template <typename TensorLike,
-            typename = std::enable_if_t<
-                tensor_view_detail::IsTensorLike<TensorLike>::value>>
+  template <typename TensorLike>
   TensorView(const TensorLike& tensor)
       : data_{const_cast<void*>(static_cast<const void*>(tensor.data()))},
         shape_{tensor.shape()},
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 1aeca74..f24954c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -8,13 +8,13 @@ add_infini_rt_test(test_smoke test_smoke.cc)
 add_infini_rt_test(test_core test_core.cc)
 
 if(WITH_CPU OR WITH_NVIDIA)
-    add_infini_rt_test(test_default_runtime test_default_runtime.cc)
+    add_infini_rt_test(test_runtime_dispatch test_runtime_dispatch.cc)
     if(WITH_CPU)
-        target_compile_definitions(test_default_runtime
+        target_compile_definitions(test_runtime_dispatch
                                    PRIVATE INFINI_RT_TEST_WITH_CPU=1)
     endif()
     if(WITH_NVIDIA)
-        target_compile_definitions(test_default_runtime
+        target_compile_definitions(test_runtime_dispatch
                                    PRIVATE INFINI_RT_TEST_WITH_NVIDIA=1)
     endif()
 endif()
diff --git a/tests/test_core.cc b/tests/test_core.cc
index 1c57c04..e8b8071 100644
--- a/tests/test_core.cc
+++ b/tests/test_core.cc
@@ -2,7 +2,6 @@
 
 #include <cstdint>
 #include <string>
-#include <type_traits>
 #include <vector>
 
 #include "test_helper.h"
@@ -13,9 +12,6 @@ using infini::rt::DataType;
 using infini::rt::Device;
 using infini::rt::TensorView;
 
-static_assert(!std::is_constructible_v<TensorView, std::vector<TensorView>>,
-              "TensorView should not treat tensor containers as tensor-like.");
-
 void TestDevice(infini::rt::test::TestContext* context) {
   const Device cpu{Device::Type::kCpu};
   const Device nvidia{Device::Type::kNvidia, 1};
diff --git a/tests/test_default_runtime.cc b/tests/test_runtime_dispatch.cc
similarity index 60%
rename from tests/test_default_runtime.cc
rename to tests/test_runtime_dispatch.cc
index bf76a4b..e48a276 100644
--- a/tests/test_default_runtime.cc
+++ b/tests/test_runtime_dispatch.cc
@@ -20,7 +20,7 @@ void TestCpuDispatch(infini::rt::test::TestContext* context) {
   infini::rt::set_runtime_device_type(infini::rt::Device::Type::kCpu);
   context->Expect(
       infini::rt::runtime_device_type() == infini::rt::Device::Type::kCpu,
-      "Default runtime should report CPU dispatch.");
+      "Runtime dispatch should report CPU dispatch.");
 
   std::array<std::uint8_t, 4> input{1, 2, 3, 4};
   std::array<std::uint8_t, 4> output{};
@@ -28,6 +28,18 @@ void TestCpuDispatch(infini::rt::test::TestContext* context) {
 
   ExpectSuccess(context, runtime::SetDevice(0),
                 "CPU dispatch should set device 0.");
+  int current_device = -1;
+  ExpectSuccess(context, runtime::GetDevice(&current_device),
+                "CPU dispatch should get the current device.");
+  context->ExpectEqual(current_device, 0,
+                       "CPU dispatch should keep the current device.");
+
+  int device_count = 0;
+  ExpectSuccess(context, runtime::GetDeviceCount(&device_count),
+                "CPU dispatch should get the device count.");
+  context->Expect(device_count > 0,
+                  "CPU dispatch should report at least one device.");
+
   ExpectSuccess(context, runtime::Malloc(&ptr, input.size()),
                 "CPU dispatch should allocate memory.");
   if (ptr == nullptr) {
@@ -46,11 +58,23 @@ void TestCpuDispatch(infini::rt::test::TestContext* context) {
                 runtime::Memcpy(output.data(), ptr, output.size(),
                                 runtime::MemcpyKind::kMemcpyDeviceToHost),
                 "CPU dispatch should copy runtime memory to host.");
-  ExpectSuccess(context, runtime::Free(ptr),
-                "CPU dispatch should free memory.");
 
   context->ExpectEqual(output, input,
                        "CPU dispatch should preserve copied bytes.");
+
+  ExpectSuccess(context, runtime::Memset(ptr, 0x5A, output.size()),
+                "CPU dispatch should fill runtime memory.");
+  ExpectSuccess(context,
+                runtime::Memcpy(output.data(), ptr, output.size(),
+                                runtime::MemcpyKind::kMemcpyDeviceToHost),
+                "CPU dispatch should copy filled memory to host.");
+  for (const auto value : output) {
+    context->ExpectEqual(value, static_cast<std::uint8_t>(0x5A),
+                         "CPU dispatch should preserve filled bytes.");
+  }
+
+  ExpectSuccess(context, runtime::Free(ptr),
+                "CPU dispatch should free memory.");
 }
 #endif
 
@@ -59,7 +83,7 @@ void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
   infini::rt::set_runtime_device_type(infini::rt::Device::Type::kNvidia);
   context->Expect(
       infini::rt::runtime_device_type() == infini::rt::Device::Type::kNvidia,
-      "Default runtime should report NVIDIA dispatch.");
+      "Runtime dispatch should report NVIDIA dispatch.");
 
   std::array<std::uint8_t, 4> input{5, 6, 7, 8};
   std::array<std::uint8_t, 4> output{};
@@ -67,6 +91,18 @@ void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
 
   ExpectSuccess(context, runtime::SetDevice(0),
                 "NVIDIA dispatch should set device 0.");
+  int current_device = -1;
+  ExpectSuccess(context, runtime::GetDevice(&current_device),
+                "NVIDIA dispatch should get the current device.");
+  context->ExpectEqual(current_device, 0,
+                       "NVIDIA dispatch should keep the current device.");
+
+  int device_count = 0;
+  ExpectSuccess(context, runtime::GetDeviceCount(&device_count),
+                "NVIDIA dispatch should get the device count.");
+  context->Expect(device_count > 0,
+                  "NVIDIA dispatch should report at least one device.");
+
   ExpectSuccess(context, runtime::Malloc(&ptr, input.size()),
                 "NVIDIA dispatch should allocate memory.");
   if (ptr == nullptr) {
@@ -84,11 +120,25 @@ void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
                 runtime::Memcpy(output.data(), ptr, output.size(),
                                 runtime::MemcpyKind::kMemcpyDeviceToHost),
                 "NVIDIA dispatch should copy device data to host.");
-  ExpectSuccess(context, runtime::Free(ptr),
-                "NVIDIA dispatch should free memory.");
 
   context->ExpectEqual(output, input,
                        "NVIDIA dispatch should preserve copied bytes.");
+
+  ExpectSuccess(context, runtime::Memset(ptr, 0x5A, output.size()),
+                "NVIDIA dispatch should fill runtime memory.");
+  ExpectSuccess(context, runtime::DeviceSynchronize(),
+                "NVIDIA dispatch should synchronize filled memory.");
+  ExpectSuccess(context,
+                runtime::Memcpy(output.data(), ptr, output.size(),
+                                runtime::MemcpyKind::kMemcpyDeviceToHost),
+                "NVIDIA dispatch should copy filled memory to host.");
+  for (const auto value : output) {
+    context->ExpectEqual(value, static_cast<std::uint8_t>(0x5A),
+                         "NVIDIA dispatch should preserve filled bytes.");
+  }
+
+  ExpectSuccess(context, runtime::Free(ptr),
+                "NVIDIA dispatch should free memory.");
 }
 #endif
 

From d854b8639ecb62403a1ada47abe0aab3f0db4ec8 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 2 Jul 2026 10:45:54 +0800
Subject: [PATCH 07/15] Keep runtime API list in generator

---
 scripts/generate_public_headers.py | 155 ++++++++++++-----------------
 src/native/cuda/nvidia/runtime_.h  |  32 +++---
 2 files changed, 73 insertions(+), 114 deletions(-)

diff --git a/scripts/generate_public_headers.py b/scripts/generate_public_headers.py
index d2eeca9..b10e7f4 100644
--- a/scripts/generate_public_headers.py
+++ b/scripts/generate_public_headers.py
@@ -71,13 +71,6 @@
     "cpu",
 )
 
-_NVIDIA_RUNTIME_HEADER = "native/cuda/nvidia/runtime_.h"
-
-_PUBLIC_TYPE_NAMES = {
-    "cudaMemcpyKind": "MemcpyKind",
-    "size_t": "std::size_t",
-}
-
 
 def _guard(path):
     token = "_".join(path.parts).replace(".", "_").upper()
@@ -163,8 +156,7 @@ def _write_detail_headers(include_root, source_root, devices):
         _write_detail_header(include_root, source_root, relative_path)
 
 
-def _write_generated_header(include_root, source_root, devices):
-    runtime_api = _parse_nvidia_runtime_api(source_root)
+def _write_generated_header(include_root, devices):
     default_device = _default_device(devices)
     default_device_type = _DEVICE_TYPES[default_device]
     includes = [
@@ -182,17 +174,8 @@ def _write_generated_header(include_root, source_root, devices):
     for device in devices:
         includes.append(f"#include <infini/rt/{device}/runtime_.h>")
 
-    runtime_aliases = "\n\n".join(
-        f"using {alias} = typename generated_detail::DefaultRuntime::{alias};"
-        for alias in runtime_api.aliases
-    )
-    memcpy_kind_values = "\n".join(
-        f"""  {constant} =
-      static_cast<int>(generated_detail::DefaultRuntime::{constant}),"""
-        for constant in runtime_api.memcpy_kind_constants
-    )
     runtime_declarations = "\n\n".join(
-        f"{function.signature()};" for function in runtime_api.functions
+        f"{function.signature()};" for function in _PUBLIC_RUNTIME_FUNCTIONS
     )
 
     path = include_root / "infini" / "rt" / "generated.h"
@@ -212,18 +195,27 @@ def _write_generated_header(include_root, source_root, devices):
 namespace runtime {{
 namespace generated_detail {{
 
-using DefaultRuntime = Runtime<{default_device_type}>;
+using DefaultErrorRuntime = Runtime<{default_device_type}>;
 
 inline constexpr Device::Type kDefaultDeviceType = {default_device_type};
 
 }}  // namespace generated_detail
 
-{runtime_aliases}
+using Error = typename generated_detail::DefaultErrorRuntime::Error;
+
+using Stream = typename generated_detail::DefaultErrorRuntime::Stream;
 
-inline constexpr Error kSuccess = generated_detail::DefaultRuntime::kSuccess;
+inline constexpr Error kSuccess = generated_detail::DefaultErrorRuntime::kSuccess;
 
 enum class MemcpyKind {{
-{memcpy_kind_values}
+  kMemcpyHostToHost =
+      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyHostToHost),
+  kMemcpyHostToDevice =
+      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyHostToDevice),
+  kMemcpyDeviceToHost =
+      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyDeviceToHost),
+  kMemcpyDeviceToDevice =
+      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyDeviceToDevice),
 }};
 
 {runtime_declarations}
@@ -255,73 +247,48 @@ def params_decl(self):
         return ", ".join(f"{param.type} {param.name}" for param in self.params)
 
 
-@dataclasses.dataclass(frozen=True)
-class _RuntimeApi:
-    aliases: tuple[str, ...]
-    memcpy_kind_constants: tuple[str, ...]
-    functions: tuple[_Function, ...]
-
-
-def _runtime_struct_body(source_root):
-    text = (source_root / _NVIDIA_RUNTIME_HEADER).read_text()
-    match = re.search(
-        r"struct Runtime<Device::Type::kNvidia>.*?\{\n(?P<body>.*?)\n\};",
-        text,
-        flags=re.DOTALL,
-    )
-    if match is None:
-        raise ValueError("could not find NVIDIA Runtime specialization")
-
-    return match.group("body")
-
-
-def _public_type_name(name):
-    return _PUBLIC_TYPE_NAMES.get(name, name)
-
-
-def _parse_param(param):
-    param_type, param_name = " ".join(param.strip().split()).rsplit(" ", 1)
-
-    return _Param(_public_type_name(param_type), param_name)
-
-
-def _parse_function(match):
-    params = match.group("params").strip()
-    return _Function(
-        _public_type_name(match.group("return_type")),
-        match.group("name"),
-        tuple(_parse_param(param) for param in re.split(r"\s*,\s*", params) if param),
-    )
-
-
-def _parse_nvidia_runtime_api(source_root):
-    body = _runtime_struct_body(source_root)
-    aliases = tuple(
-        alias
-        for alias in re.findall(r"^\s+using\s+(\w+)\s*=", body, flags=re.MULTILINE)
-        if alias in {"Error", "Stream"}
-    )
-    memcpy_kind_constants = tuple(
-        re.findall(
-            r"^\s+static constexpr auto (kMemcpy\w+)\s*=",
-            body,
-            flags=re.MULTILINE,
-        )
-    )
-    functions = tuple(
-        _parse_function(match)
-        for match in re.finditer(
-            r"^\s+static\s+(?P<return_type>\w+)\s+"
-            r"(?P<name>[A-Z]\w*)\((?P<params>[^)]*)\)\s*\{",
-            body,
-            flags=re.MULTILINE,
-        )
-    )
-
-    if not aliases or not memcpy_kind_constants or not functions:
-        raise ValueError("NVIDIA Runtime specialization has no public API")
-
-    return _RuntimeApi(aliases, memcpy_kind_constants, functions)
+_PUBLIC_RUNTIME_FUNCTIONS = (
+    _Function("Error", "SetDevice", (_Param("int", "device"),)),
+    _Function("Error", "GetDevice", (_Param("int*", "device"),)),
+    _Function("Error", "GetDeviceCount", (_Param("int*", "count"),)),
+    _Function("Error", "DeviceSynchronize", ()),
+    _Function(
+        "Error",
+        "Malloc",
+        (_Param("void**", "ptr"), _Param("std::size_t", "size")),
+    ),
+    _Function("Error", "Free", (_Param("void*", "ptr"),)),
+    _Function(
+        "Error",
+        "Memset",
+        (
+            _Param("void*", "ptr"),
+            _Param("int", "value"),
+            _Param("std::size_t", "count"),
+        ),
+    ),
+    _Function(
+        "Error",
+        "Memcpy",
+        (
+            _Param("void*", "dst"),
+            _Param("const void*", "src"),
+            _Param("std::size_t", "count"),
+            _Param("MemcpyKind", "kind"),
+        ),
+    ),
+    _Function(
+        "Error",
+        "MemcpyAsync",
+        (
+            _Param("void*", "dst"),
+            _Param("const void*", "src"),
+            _Param("std::size_t", "count"),
+            _Param("MemcpyKind", "kind"),
+            _Param("Stream", "stream"),
+        ),
+    ),
+)
 
 
 def _default_device(devices):
@@ -379,8 +346,8 @@ def _write_runtime_dispatch_function(function, devices):
 """
 
 
-def _write_runtime_dispatch(source_path, source_root, devices):
-    functions = _parse_nvidia_runtime_api(source_root).functions
+def _write_runtime_dispatch(source_path, devices):
+    functions = _PUBLIC_RUNTIME_FUNCTIONS
     dispatch_functions = "\n".join(
         _write_runtime_dispatch_function(function, devices=devices)
         for function in functions
@@ -490,8 +457,8 @@ def main():
         for wrapper_device, header_name, target in _DEVICE_HEADERS[device]:
             _write_wrapper(include_root, wrapper_device, header_name, target)
 
-    _write_generated_header(include_root, source_root, devices)
-    _write_runtime_dispatch(pathlib.Path(args.source_output), source_root, devices)
+    _write_generated_header(include_root, devices)
+    _write_runtime_dispatch(pathlib.Path(args.source_output), devices)
 
 
 if __name__ == "__main__":
diff --git a/src/native/cuda/nvidia/runtime_.h b/src/native/cuda/nvidia/runtime_.h
index 1ce61f7..1786e08 100644
--- a/src/native/cuda/nvidia/runtime_.h
+++ b/src/native/cuda/nvidia/runtime_.h
@@ -1,7 +1,7 @@
 #ifndef INFINI_RT_NVIDIA_RUNTIME__H_
 #define INFINI_RT_NVIDIA_RUNTIME__H_
 
-#include <cstddef>
+#include <utility>
 
 // clang-format off
 #include <cuda_runtime.h>
@@ -23,29 +23,23 @@ struct Runtime<Device::Type::kNvidia>
 
   static constexpr Error kSuccess = cudaSuccess;
 
-  static Error SetDevice(int device) { return cudaSetDevice(device); }
+  static constexpr auto SetDevice = cudaSetDevice;
 
-  static Error GetDevice(int* device) { return cudaGetDevice(device); }
+  static constexpr auto GetDevice = cudaGetDevice;
 
-  static Error GetDeviceCount(int* count) { return cudaGetDeviceCount(count); }
+  static constexpr auto GetDeviceCount = cudaGetDeviceCount;
 
-  static Error DeviceSynchronize() { return cudaDeviceSynchronize(); }
+  static constexpr auto DeviceSynchronize = cudaDeviceSynchronize;
 
-  static Error Malloc(void** ptr, std::size_t size) {
-    return cudaMalloc(ptr, size);
-  }
+  static constexpr auto Malloc = [](auto&&... args) {
+    return cudaMalloc(std::forward<decltype(args)>(args)...);
+  };
 
-  static Error Memcpy(void* dst, const void* src, std::size_t count,
-                      cudaMemcpyKind kind) {
-    return cudaMemcpy(dst, src, count, kind);
-  }
+  static constexpr auto Memcpy = cudaMemcpy;
 
-  static Error MemcpyAsync(void* dst, const void* src, std::size_t count,
-                           cudaMemcpyKind kind, Stream stream) {
-    return cudaMemcpyAsync(dst, src, count, kind, stream);
-  }
+  static constexpr auto MemcpyAsync = cudaMemcpyAsync;
 
-  static Error Free(void* ptr) { return cudaFree(ptr); }
+  static constexpr auto Free = cudaFree;
 
   static constexpr auto kMemcpyHostToHost = cudaMemcpyHostToHost;
 
@@ -55,9 +49,7 @@ struct Runtime<Device::Type::kNvidia>
 
   static constexpr auto kMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
 
-  static Error Memset(void* ptr, int value, std::size_t count) {
-    return cudaMemset(ptr, value, count);
-  }
+  static constexpr auto Memset = cudaMemset;
 };
 
 static_assert(Runtime<Device::Type::kNvidia>::Validate());

From 091d22bb7df85f248c311a6f1724716e9fbc4558 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <huangjiacheng0709@outlook.com>
Date: Thu, 2 Jul 2026 12:46:23 +0800
Subject: [PATCH 08/15] Add TensorView constructor guard test

---
 src/tensor_view.h         | 22 +++++++++++++-
 tests/CMakeLists.txt      |  1 +
 tests/test_core.cc        | 40 -------------------------
 tests/test_tensor_view.cc | 63 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 85 insertions(+), 41 deletions(-)
 create mode 100644 tests/test_tensor_view.cc

diff --git a/src/tensor_view.h b/src/tensor_view.h
index 0b6fc59..dcf7cc9 100644
--- a/src/tensor_view.h
+++ b/src/tensor_view.h
@@ -3,6 +3,8 @@
 
 #include <cstdint>
 #include <string>
+#include <type_traits>
+#include <utility>
 #include <vector>
 
 #include "data_type.h"
@@ -11,6 +13,22 @@
 
 namespace infini::rt {
 
+namespace tensor_view_detail {
+
+template <typename T, typename = void>
+struct IsTensorLike : std::false_type {};
+
+template <typename T>
+struct IsTensorLike<T,
+                    std::void_t<decltype(std::declval<const T&>().data()),
+                                decltype(std::declval<const T&>().shape()),
+                                decltype(std::declval<const T&>().dtype()),
+                                decltype(std::declval<const T&>().device()),
+                                decltype(std::declval<const T&>().strides())>>
+    : std::true_type {};
+
+}  // namespace tensor_view_detail
+
 class TensorView {
  public:
   using Size = std::size_t;
@@ -23,7 +41,9 @@ class TensorView {
 
   using Strides = std::vector<Stride>;
 
-  template <typename TensorLike>
+  template <typename TensorLike,
+            typename = std::enable_if_t<
+                tensor_view_detail::IsTensorLike<TensorLike>::value>>
   TensorView(const TensorLike& tensor)
       : data_{const_cast<void*>(static_cast<const void*>(tensor.data()))},
         shape_{tensor.shape()},
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f24954c..672d3b6 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,6 +6,7 @@ endfunction()
 
 add_infini_rt_test(test_smoke test_smoke.cc)
 add_infini_rt_test(test_core test_core.cc)
+add_infini_rt_test(test_tensor_view test_tensor_view.cc)
 
 if(WITH_CPU OR WITH_NVIDIA)
     add_infini_rt_test(test_runtime_dispatch test_runtime_dispatch.cc)
diff --git a/tests/test_core.cc b/tests/test_core.cc
index e8b8071..20be207 100644
--- a/tests/test_core.cc
+++ b/tests/test_core.cc
@@ -2,7 +2,6 @@
 
 #include <cstdint>
 #include <string>
-#include <vector>
 
 #include "test_helper.h"
 
@@ -10,7 +9,6 @@ namespace {
 
 using infini::rt::DataType;
 using infini::rt::Device;
-using infini::rt::TensorView;
 
 void TestDevice(infini::rt::test::TestContext* context) {
   const Device cpu{Device::Type::kCpu};
@@ -41,43 +39,6 @@ void TestDataType(infini::rt::test::TestContext* context) {
                        DataType::kUInt16, "uint16 should parse by name.");
 }
 
-void TestTensorView(infini::rt::test::TestContext* context) {
-  std::vector<float> data{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  TensorView tensor{data.data(), std::vector<std::size_t>{2, 3},
-                    DataType::kFloat32, Device{Device::Type::kCpu}};
-
-  context->ExpectEqual(tensor.ndim(), std::size_t{2},
-                       "TensorView should keep its rank.");
-  context->ExpectEqual(tensor.numel(), std::size_t{6},
-                       "TensorView should compute element count.");
-  context->ExpectEqual(tensor.element_size(), std::size_t{4},
-                       "TensorView should compute element size.");
-  context->ExpectEqual(tensor.size(0), std::size_t{2},
-                       "TensorView should expose dimension sizes.");
-  context->ExpectEqual(tensor.size(-1), std::size_t{3},
-                       "TensorView should support negative dimension sizes.");
-  context->ExpectEqual(tensor.stride(0), std::ptrdiff_t{3},
-                       "TensorView should compute default row-major strides.");
-  context->ExpectEqual(tensor.stride(1), std::ptrdiff_t{1},
-                       "TensorView should compute default innermost stride.");
-  context->Expect(tensor.IsContiguous(),
-                  "Default TensorView strides should be contiguous.");
-
-  TensorView transposed = tensor.T();
-  context->ExpectEqual(transposed.shape(), TensorView::Shape({3, 2}),
-                       "Transposed TensorView should swap shape.");
-  context->ExpectEqual(transposed.strides(), TensorView::Strides({1, 3}),
-                       "Transposed TensorView should swap strides.");
-  context->Expect(!transposed.IsContiguous(),
-                  "Transposed TensorView should not be contiguous.");
-
-  TensorView strided{data.data(), std::vector<std::size_t>{2, 3},
-                     DataType::kFloat32, Device{Device::Type::kCpu},
-                     std::vector<std::ptrdiff_t>{4, 1}};
-  context->Expect(!strided.IsContiguous(),
-                  "TensorView with row padding should not be contiguous.");
-}
-
 }  // namespace
 
 int main() {
@@ -85,7 +46,6 @@ int main() {
 
   TestDevice(&context);
   TestDataType(&context);
-  TestTensorView(&context);
 
   return context.ExitCode();
 }
diff --git a/tests/test_tensor_view.cc b/tests/test_tensor_view.cc
new file mode 100644
index 0000000..37b2c8d
--- /dev/null
+++ b/tests/test_tensor_view.cc
@@ -0,0 +1,63 @@
+#include <infini/rt.h>
+
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include "test_helper.h"
+
+namespace {
+
+using infini::rt::DataType;
+using infini::rt::Device;
+using infini::rt::TensorView;
+
+static_assert(!std::is_constructible_v<TensorView, std::vector<TensorView>>,
+              "TensorView should not treat tensor containers as tensor-like.");
+
+void TestTensorView(infini::rt::test::TestContext* context) {
+  std::vector<float> data{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  TensorView tensor{data.data(), std::vector<std::size_t>{2, 3},
+                    DataType::kFloat32, Device{Device::Type::kCpu}};
+
+  context->ExpectEqual(tensor.ndim(), std::size_t{2},
+                       "TensorView should keep its rank.");
+  context->ExpectEqual(tensor.numel(), std::size_t{6},
+                       "TensorView should compute element count.");
+  context->ExpectEqual(tensor.element_size(), std::size_t{4},
+                       "TensorView should compute element size.");
+  context->ExpectEqual(tensor.size(0), std::size_t{2},
+                       "TensorView should expose dimension sizes.");
+  context->ExpectEqual(tensor.size(-1), std::size_t{3},
+                       "TensorView should support negative dimension sizes.");
+  context->ExpectEqual(tensor.stride(0), std::ptrdiff_t{3},
+                       "TensorView should compute default row-major strides.");
+  context->ExpectEqual(tensor.stride(1), std::ptrdiff_t{1},
+                       "TensorView should compute default innermost stride.");
+  context->Expect(tensor.IsContiguous(),
+                  "Default TensorView strides should be contiguous.");
+
+  TensorView transposed = tensor.T();
+  context->ExpectEqual(transposed.shape(), TensorView::Shape({3, 2}),
+                       "Transposed TensorView should swap shape.");
+  context->ExpectEqual(transposed.strides(), TensorView::Strides({1, 3}),
+                       "Transposed TensorView should swap strides.");
+  context->Expect(!transposed.IsContiguous(),
+                  "Transposed TensorView should not be contiguous.");
+
+  TensorView strided{data.data(), std::vector<std::size_t>{2, 3},
+                     DataType::kFloat32, Device{Device::Type::kCpu},
+                     std::vector<std::ptrdiff_t>{4, 1}};
+  context->Expect(!strided.IsContiguous(),
+                  "TensorView with row padding should not be contiguous.");
+}
+
+}  // namespace
+
+int main() {
+  infini::rt::test::TestContext context;
+
+  TestTensorView(&context);
+
+  return context.ExitCode();
+}

From 8189784c52f8502c17a74a170ac3270854741e41 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <45955067+voltjia@users.noreply.github.com>
Date: Thu, 2 Jul 2026 16:25:49 +0800
Subject: [PATCH 09/15] Align runtime memcpy kind constants with CUDA API

---
 scripts/generate_public_headers.py | 33 +++++++++++++++++-------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/scripts/generate_public_headers.py b/scripts/generate_public_headers.py
index b10e7f4..e929682 100644
--- a/scripts/generate_public_headers.py
+++ b/scripts/generate_public_headers.py
@@ -161,6 +161,7 @@ def _write_generated_header(include_root, devices):
     default_device_type = _DEVICE_TYPES[default_device]
     includes = [
         "#include <cstddef>",
+        "#include <type_traits>",
         f"#include {_detail_include('data_type.h')}",
         f"#include {_detail_include('device.h')}",
         f"#include {_detail_include('hash.h')}",
@@ -205,18 +206,22 @@ def _write_generated_header(include_root, devices):
 
 using Stream = typename generated_detail::DefaultErrorRuntime::Stream;
 
+using MemcpyKind = std::remove_cv_t<
+    decltype(generated_detail::DefaultErrorRuntime::kMemcpyHostToHost)>;
+
 inline constexpr Error kSuccess = generated_detail::DefaultErrorRuntime::kSuccess;
 
-enum class MemcpyKind {{
-  kMemcpyHostToHost =
-      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyHostToHost),
-  kMemcpyHostToDevice =
-      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyHostToDevice),
-  kMemcpyDeviceToHost =
-      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyDeviceToHost),
-  kMemcpyDeviceToDevice =
-      static_cast<int>(generated_detail::DefaultErrorRuntime::kMemcpyDeviceToDevice),
-}};
+inline constexpr MemcpyKind kMemcpyHostToHost =
+    generated_detail::DefaultErrorRuntime::kMemcpyHostToHost;
+
+inline constexpr MemcpyKind kMemcpyHostToDevice =
+    generated_detail::DefaultErrorRuntime::kMemcpyHostToDevice;
+
+inline constexpr MemcpyKind kMemcpyDeviceToHost =
+    generated_detail::DefaultErrorRuntime::kMemcpyDeviceToHost;
+
+inline constexpr MemcpyKind kMemcpyDeviceToDevice =
+    generated_detail::DefaultErrorRuntime::kMemcpyDeviceToDevice;
 
 {runtime_declarations}
 
@@ -412,13 +417,13 @@ def _write_runtime_dispatch(source_path, devices):
   using DeviceRuntime = Runtime<device_type>;
 
   switch (kind) {{
-    case MemcpyKind::kMemcpyHostToHost:
+    case kMemcpyHostToHost:
       return DeviceRuntime::kMemcpyHostToHost;
-    case MemcpyKind::kMemcpyHostToDevice:
+    case kMemcpyHostToDevice:
       return DeviceRuntime::kMemcpyHostToDevice;
-    case MemcpyKind::kMemcpyDeviceToHost:
+    case kMemcpyDeviceToHost:
       return DeviceRuntime::kMemcpyDeviceToHost;
-    case MemcpyKind::kMemcpyDeviceToDevice:
+    case kMemcpyDeviceToDevice:
       return DeviceRuntime::kMemcpyDeviceToDevice;
   }}
 

From c35cd9f18e5f31ccbfd5713099f3a3e23ea31561 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <45955067+voltjia@users.noreply.github.com>
Date: Thu, 2 Jul 2026 16:26:40 +0800
Subject: [PATCH 10/15] Use CUDA-style runtime memcpy constants

---
 tests/test_runtime_dispatch.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/test_runtime_dispatch.cc b/tests/test_runtime_dispatch.cc
index e48a276..347d6e8 100644
--- a/tests/test_runtime_dispatch.cc
+++ b/tests/test_runtime_dispatch.cc
@@ -48,15 +48,15 @@ void TestCpuDispatch(infini::rt::test::TestContext* context) {
 
   ExpectSuccess(context,
                 runtime::Memcpy(ptr, input.data(), input.size(),
-                                runtime::MemcpyKind::kMemcpyHostToDevice),
+                                runtime::kMemcpyHostToDevice),
                 "CPU dispatch should copy host data to runtime memory.");
   context->Expect(runtime::MemcpyAsync(ptr, input.data(), input.size(),
-                                       runtime::MemcpyKind::kMemcpyHostToDevice,
+                                       runtime::kMemcpyHostToDevice,
                                        nullptr) != runtime::kSuccess,
                   "CPU dispatch should not report async memcpy success.");
   ExpectSuccess(context,
                 runtime::Memcpy(output.data(), ptr, output.size(),
-                                runtime::MemcpyKind::kMemcpyDeviceToHost),
+                                runtime::kMemcpyDeviceToHost),
                 "CPU dispatch should copy runtime memory to host.");
 
   context->ExpectEqual(output, input,
@@ -66,7 +66,7 @@ void TestCpuDispatch(infini::rt::test::TestContext* context) {
                 "CPU dispatch should fill runtime memory.");
   ExpectSuccess(context,
                 runtime::Memcpy(output.data(), ptr, output.size(),
-                                runtime::MemcpyKind::kMemcpyDeviceToHost),
+                                runtime::kMemcpyDeviceToHost),
                 "CPU dispatch should copy filled memory to host.");
   for (const auto value : output) {
     context->ExpectEqual(value, static_cast<std::uint8_t>(0x5A),
@@ -112,13 +112,13 @@ void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
   ExpectSuccess(
       context,
       runtime::MemcpyAsync(ptr, input.data(), input.size(),
-                           runtime::MemcpyKind::kMemcpyHostToDevice, nullptr),
+                           runtime::kMemcpyHostToDevice, nullptr),
       "NVIDIA dispatch should support async host-to-device copy.");
   ExpectSuccess(context, runtime::DeviceSynchronize(),
                 "NVIDIA dispatch should synchronize the device.");
   ExpectSuccess(context,
                 runtime::Memcpy(output.data(), ptr, output.size(),
-                                runtime::MemcpyKind::kMemcpyDeviceToHost),
+                                runtime::kMemcpyDeviceToHost),
                 "NVIDIA dispatch should copy device data to host.");
 
   context->ExpectEqual(output, input,
@@ -130,7 +130,7 @@ void TestNvidiaDispatch(infini::rt::test::TestContext* context) {
                 "NVIDIA dispatch should synchronize filled memory.");
   ExpectSuccess(context,
                 runtime::Memcpy(output.data(), ptr, output.size(),
-                                runtime::MemcpyKind::kMemcpyDeviceToHost),
+                                runtime::kMemcpyDeviceToHost),
                 "NVIDIA dispatch should copy filled memory to host.");
   for (const auto value : output) {
     context->ExpectEqual(value, static_cast<std::uint8_t>(0x5A),

From 7b5b7e34293f9f728692f5585d82bde411add2aa Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <45955067+voltjia@users.noreply.github.com>
Date: Thu, 2 Jul 2026 16:27:02 +0800
Subject: [PATCH 11/15] Use CUDA-style runtime memcpy constants

---
 tests/install_consumer_smoke.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/install_consumer_smoke.cc b/tests/install_consumer_smoke.cc
index 14e6c0d..0684763 100644
--- a/tests/install_consumer_smoke.cc
+++ b/tests/install_consumer_smoke.cc
@@ -59,19 +59,18 @@ int main() {
     return 1;
   }
   if (runtime::Memcpy(ptr, input.data(), input.size(),
-                      runtime::MemcpyKind::kMemcpyHostToDevice) !=
-      runtime::kSuccess) {
+                      runtime::kMemcpyHostToDevice) != runtime::kSuccess) {
     return 1;
   }
 #if defined(INFINI_RT_CONSUMER_BACKEND_CPU)
   if (runtime::MemcpyAsync(ptr, input.data(), input.size(),
-                           runtime::MemcpyKind::kMemcpyHostToDevice,
+                           runtime::kMemcpyHostToDevice,
                            nullptr) == runtime::kSuccess) {
     return 1;
   }
 #else
   if (runtime::MemcpyAsync(ptr, input.data(), input.size(),
-                           runtime::MemcpyKind::kMemcpyHostToDevice,
+                           runtime::kMemcpyHostToDevice,
                            nullptr) != runtime::kSuccess) {
     return 1;
   }
@@ -80,8 +79,7 @@ int main() {
     return 1;
   }
   if (runtime::Memcpy(output.data(), ptr, output.size(),
-                      runtime::MemcpyKind::kMemcpyDeviceToHost) !=
-      runtime::kSuccess) {
+                      runtime::kMemcpyDeviceToHost) != runtime::kSuccess) {
     return 1;
   }
   if (output != input) {

From 58e0c854d1c88af4937e04ec38c15f29c6c41570 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <45955067+voltjia@users.noreply.github.com>
Date: Thu, 2 Jul 2026 16:36:09 +0800
Subject: [PATCH 12/15] Move TensorView tests back into core test

---
 tests/test_core.cc | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tests/test_core.cc b/tests/test_core.cc
index 20be207..273ecef 100644
--- a/tests/test_core.cc
+++ b/tests/test_core.cc
@@ -1,7 +1,10 @@
 #include <infini/rt.h>
 
+#include <cstddef>
 #include <cstdint>
 #include <string>
+#include <type_traits>
+#include <vector>
 
 #include "test_helper.h"
 
@@ -9,6 +12,10 @@ namespace {
 
 using infini::rt::DataType;
 using infini::rt::Device;
+using infini::rt::TensorView;
+
+static_assert(!std::is_constructible_v<TensorView, std::vector<TensorView>>,
+              "TensorView should not treat tensor containers as tensor-like.");
 
 void TestDevice(infini::rt::test::TestContext* context) {
   const Device cpu{Device::Type::kCpu};
@@ -39,6 +46,43 @@ void TestDataType(infini::rt::test::TestContext* context) {
                        DataType::kUInt16, "uint16 should parse by name.");
 }
 
+void TestTensorView(infini::rt::test::TestContext* context) {
+  std::vector<float> data{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  TensorView tensor{data.data(), std::vector<std::size_t>{2, 3},
+                    DataType::kFloat32, Device{Device::Type::kCpu}};
+
+  context->ExpectEqual(tensor.ndim(), std::size_t{2},
+                       "TensorView should keep its rank.");
+  context->ExpectEqual(tensor.numel(), std::size_t{6},
+                       "TensorView should compute element count.");
+  context->ExpectEqual(tensor.element_size(), std::size_t{4},
+                       "TensorView should compute element size.");
+  context->ExpectEqual(tensor.size(0), std::size_t{2},
+                       "TensorView should expose dimension sizes.");
+  context->ExpectEqual(tensor.size(-1), std::size_t{3},
+                       "TensorView should support negative dimension sizes.");
+  context->ExpectEqual(tensor.stride(0), std::ptrdiff_t{3},
+                       "TensorView should compute default row-major strides.");
+  context->ExpectEqual(tensor.stride(1), std::ptrdiff_t{1},
+                       "TensorView should compute default innermost stride.");
+  context->Expect(tensor.IsContiguous(),
+                  "Default TensorView strides should be contiguous.");
+
+  TensorView transposed = tensor.T();
+  context->ExpectEqual(transposed.shape(), TensorView::Shape({3, 2}),
+                       "Transposed TensorView should swap shape.");
+  context->ExpectEqual(transposed.strides(), TensorView::Strides({1, 3}),
+                       "Transposed TensorView should swap strides.");
+  context->Expect(!transposed.IsContiguous(),
+                  "Transposed TensorView should not be contiguous.");
+
+  TensorView strided{data.data(), std::vector<std::size_t>{2, 3},
+                     DataType::kFloat32, Device{Device::Type::kCpu},
+                     std::vector<std::ptrdiff_t>{4, 1}};
+  context->Expect(!strided.IsContiguous(),
+                  "TensorView with row padding should not be contiguous.");
+}
+
 }  // namespace
 
 int main() {
@@ -46,6 +90,7 @@ int main() {
 
   TestDevice(&context);
   TestDataType(&context);
+  TestTensorView(&context);
 
   return context.ExitCode();
 }

From 4cdda6d183a3d024da15c8b6e02c2151f1709b42 Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <45955067+voltjia@users.noreply.github.com>
Date: Thu, 2 Jul 2026 16:37:09 +0800
Subject: [PATCH 13/15] Remove standalone TensorView test target

---
 tests/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 672d3b6..f24954c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,7 +6,6 @@ endfunction()
 
 add_infini_rt_test(test_smoke test_smoke.cc)
 add_infini_rt_test(test_core test_core.cc)
-add_infini_rt_test(test_tensor_view test_tensor_view.cc)
 
 if(WITH_CPU OR WITH_NVIDIA)
     add_infini_rt_test(test_runtime_dispatch test_runtime_dispatch.cc)

From a0679a71fa35f329480a6d9c3be3ab7944058c0b Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <45955067+voltjia@users.noreply.github.com>
Date: Thu, 2 Jul 2026 16:37:40 +0800
Subject: [PATCH 14/15] Remove standalone TensorView test file

---
 tests/test_tensor_view.cc | 63 ---------------------------------------
 1 file changed, 63 deletions(-)
 delete mode 100644 tests/test_tensor_view.cc

diff --git a/tests/test_tensor_view.cc b/tests/test_tensor_view.cc
deleted file mode 100644
index 37b2c8d..0000000
--- a/tests/test_tensor_view.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-#include <infini/rt.h>
-
-#include <cstdint>
-#include <type_traits>
-#include <vector>
-
-#include "test_helper.h"
-
-namespace {
-
-using infini::rt::DataType;
-using infini::rt::Device;
-using infini::rt::TensorView;
-
-static_assert(!std::is_constructible_v<TensorView, std::vector<TensorView>>,
-              "TensorView should not treat tensor containers as tensor-like.");
-
-void TestTensorView(infini::rt::test::TestContext* context) {
-  std::vector<float> data{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  TensorView tensor{data.data(), std::vector<std::size_t>{2, 3},
-                    DataType::kFloat32, Device{Device::Type::kCpu}};
-
-  context->ExpectEqual(tensor.ndim(), std::size_t{2},
-                       "TensorView should keep its rank.");
-  context->ExpectEqual(tensor.numel(), std::size_t{6},
-                       "TensorView should compute element count.");
-  context->ExpectEqual(tensor.element_size(), std::size_t{4},
-                       "TensorView should compute element size.");
-  context->ExpectEqual(tensor.size(0), std::size_t{2},
-                       "TensorView should expose dimension sizes.");
-  context->ExpectEqual(tensor.size(-1), std::size_t{3},
-                       "TensorView should support negative dimension sizes.");
-  context->ExpectEqual(tensor.stride(0), std::ptrdiff_t{3},
-                       "TensorView should compute default row-major strides.");
-  context->ExpectEqual(tensor.stride(1), std::ptrdiff_t{1},
-                       "TensorView should compute default innermost stride.");
-  context->Expect(tensor.IsContiguous(),
-                  "Default TensorView strides should be contiguous.");
-
-  TensorView transposed = tensor.T();
-  context->ExpectEqual(transposed.shape(), TensorView::Shape({3, 2}),
-                       "Transposed TensorView should swap shape.");
-  context->ExpectEqual(transposed.strides(), TensorView::Strides({1, 3}),
-                       "Transposed TensorView should swap strides.");
-  context->Expect(!transposed.IsContiguous(),
-                  "Transposed TensorView should not be contiguous.");
-
-  TensorView strided{data.data(), std::vector<std::size_t>{2, 3},
-                     DataType::kFloat32, Device{Device::Type::kCpu},
-                     std::vector<std::ptrdiff_t>{4, 1}};
-  context->Expect(!strided.IsContiguous(),
-                  "TensorView with row padding should not be contiguous.");
-}
-
-}  // namespace
-
-int main() {
-  infini::rt::test::TestContext context;
-
-  TestTensorView(&context);
-
-  return context.ExitCode();
-}

From 04c13489456aa62c2734ced4353d8237d961785b Mon Sep 17 00:00:00 2001
From: Jiacheng Huang <45955067+voltjia@users.noreply.github.com>
Date: Thu, 2 Jul 2026 16:38:06 +0800
Subject: [PATCH 15/15] Use fully qualified runtime API names in README

---
 README.md | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 51099fc..5d41768 100644
--- a/README.md
+++ b/README.md
@@ -66,16 +66,14 @@ cmake --install build
 #include <infini/rt.h>
 
 int main() {
-  namespace runtime = infini::rt::runtime;
-
-  runtime::SetDevice(0);
+  infini::rt::runtime::SetDevice(0);
 
   constexpr std::size_t size = 1024;
   void* ptr = nullptr;
 
-  runtime::Malloc(&ptr, size);
-  runtime::Memset(ptr, 0, size);
-  runtime::Free(ptr);
+  infini::rt::runtime::Malloc(&ptr, size);
+  infini::rt::runtime::Memset(ptr, 0, size);
+  infini::rt::runtime::Free(ptr);
 
   return 0;
 }
@@ -88,18 +86,16 @@ those runtime calls. A GPU backend is selected initially when one is enabled;
 otherwise CPU is selected.
 
 ```cpp
-namespace runtime = infini::rt::runtime;
-
 constexpr std::size_t size = 1024;
 void* ptr = nullptr;
 
 infini::rt::set_runtime_device_type(infini::rt::Device::Type::kCpu);
-runtime::Malloc(&ptr, size);
-runtime::Free(ptr);
+infini::rt::runtime::Malloc(&ptr, size);
+infini::rt::runtime::Free(ptr);
 
 infini::rt::set_runtime_device_type(infini::rt::Device::Type::kNvidia);
-runtime::Malloc(&ptr, size);
-runtime::Free(ptr);
+infini::rt::runtime::Malloc(&ptr, size);
+infini::rt::runtime::Free(ptr);
 ```
 
 Use `infini::rt::runtime::Runtime<infini::rt::Device::Type::kCpu>` when CPU