From 49c616bbfef19b37a5d4fddcb6c8fef4e8b51d4e Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Thu, 29 Jan 2026 00:51:20 -0600
Subject: [PATCH 01/20] TensorRT implementation
---
backend/src/nodes/impl/tensorrt/__init__.py | 37 +++
backend/src/nodes/impl/tensorrt/auto_split.py | 98 +++++++
.../src/nodes/impl/tensorrt/engine_builder.py | 268 ++++++++++++++++++
backend/src/nodes/impl/tensorrt/inference.py | 239 ++++++++++++++++
backend/src/nodes/impl/tensorrt/memory.py | 185 ++++++++++++
backend/src/nodes/impl/tensorrt/model.py | 56 ++++
.../src/nodes/properties/inputs/__init__.py | 4 +
.../nodes/properties/inputs/file_inputs.py | 14 +
.../properties/inputs/tensorrt_inputs.py | 57 ++++
.../src/nodes/properties/outputs/__init__.py | 4 +
.../properties/outputs/tensorrt_outputs.py | 55 ++++
.../packages/chaiNNer_tensorrt/__init__.py | 60 ++++
.../packages/chaiNNer_tensorrt/settings.py | 104 +++++++
.../chaiNNer_tensorrt/tensorrt/__init__.py | 10 +
.../chaiNNer_tensorrt/tensorrt/io/__init__.py | 1 +
.../tensorrt/io/build_engine_from_onnx.py | 199 +++++++++++++
.../tensorrt/io/load_engine.py | 114 ++++++++
.../tensorrt/io/save_engine.py | 36 +++
.../tensorrt/processing/__init__.py | 1 +
.../tensorrt/processing/upscale_image.py | 126 ++++++++
.../tensorrt/utility/__init__.py | 1 +
.../tensorrt/utility/get_engine_info.py | 60 ++++
backend/src/server.py | 1 +
src/common/types/chainner-scope.ts | 21 ++
src/renderer/colors.scss | 2 +
src/renderer/components/CustomIcons.tsx | 18 ++
src/renderer/helpers/accentColors.ts | 1 +
27 files changed, 1772 insertions(+)
create mode 100644 backend/src/nodes/impl/tensorrt/__init__.py
create mode 100644 backend/src/nodes/impl/tensorrt/auto_split.py
create mode 100644 backend/src/nodes/impl/tensorrt/engine_builder.py
create mode 100644 backend/src/nodes/impl/tensorrt/inference.py
create mode 100644 backend/src/nodes/impl/tensorrt/memory.py
create mode 100644 backend/src/nodes/impl/tensorrt/model.py
create mode 100644 backend/src/nodes/properties/inputs/tensorrt_inputs.py
create mode 100644 backend/src/nodes/properties/outputs/tensorrt_outputs.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/__init__.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/settings.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/__init__.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/io/__init__.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/io/build_engine_from_onnx.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/__init__.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/__init__.py
create mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py
diff --git a/backend/src/nodes/impl/tensorrt/__init__.py b/backend/src/nodes/impl/tensorrt/__init__.py
new file mode 100644
index 0000000000..a957ac846a
--- /dev/null
+++ b/backend/src/nodes/impl/tensorrt/__init__.py
@@ -0,0 +1,37 @@
+"""TensorRT implementation utilities."""
+
+from .auto_split import tensorrt_auto_split
+from .engine_builder import BuildConfig, build_engine_from_onnx
+from .inference import (
+ TensorRTSession,
+ clear_session_cache,
+ get_tensorrt_session,
+ run_inference,
+)
+from .memory import (
+ CudaBuffer,
+ CudaMemoryManager,
+ check_cuda_available,
+ cuda_memory_context,
+ get_cuda_compute_capability,
+ get_cuda_device_name,
+)
+from .model import TensorRTEngine, TensorRTEngineInfo
+
+__all__ = [
+ "BuildConfig",
+ "CudaBuffer",
+ "CudaMemoryManager",
+ "TensorRTEngine",
+ "TensorRTEngineInfo",
+ "TensorRTSession",
+ "build_engine_from_onnx",
+ "check_cuda_available",
+ "clear_session_cache",
+ "cuda_memory_context",
+ "get_cuda_compute_capability",
+ "get_cuda_device_name",
+ "get_tensorrt_session",
+ "run_inference",
+ "tensorrt_auto_split",
+]
diff --git a/backend/src/nodes/impl/tensorrt/auto_split.py b/backend/src/nodes/impl/tensorrt/auto_split.py
new file mode 100644
index 0000000000..079ba0c429
--- /dev/null
+++ b/backend/src/nodes/impl/tensorrt/auto_split.py
@@ -0,0 +1,98 @@
+"""Auto-tiling support for TensorRT inference."""
+
+from __future__ import annotations
+
+import gc
+
+import numpy as np
+
+from ..upscale.auto_split import Tiler, auto_split
+from .inference import get_tensorrt_session
+from .model import TensorRTEngine
+
+
+def _into_batched_form(img: np.ndarray) -> np.ndarray:
+ """Convert image to NCHW batched format."""
+ shape_size = len(img.shape)
+ if shape_size == 3:
+ # (H, W, C) -> (1, C, H, W)
+ return img.transpose((2, 0, 1))[np.newaxis, :]
+ elif shape_size == 2:
+ # (H, W) -> (1, 1, H, W)
+ return img[np.newaxis, np.newaxis, :, :]
+ else:
+ raise ValueError("Unsupported input tensor shape")
+
+
+def _into_standard_image_form(img: np.ndarray) -> np.ndarray:
+ """Convert NCHW output back to HWC format."""
+ shape_size = len(img.shape)
+ if shape_size == 4:
+ # (1, C, H, W) -> (H, W, C)
+ return img.squeeze(0).transpose(1, 2, 0)
+ elif shape_size == 3:
+ # (C, H, W) -> (H, W, C)
+ return img.transpose(1, 2, 0)
+ elif shape_size == 2:
+ # (H, W)
+ return img
+ else:
+ raise ValueError("Unsupported output tensor shape")
+
+
+def tensorrt_auto_split(
+ img: np.ndarray,
+ engine: TensorRTEngine,
+ tiler: Tiler,
+ gpu_index: int = 0,
+) -> np.ndarray:
+ """
+ Run TensorRT inference with automatic tiling for large images.
+
+ Args:
+ img: Input image in HWC format (float32, 0-1 range)
+ engine: TensorRT engine
+ tiler: Tiler configuration for splitting
+ gpu_index: GPU device index
+
+ Returns:
+ Upscaled image in HWC format
+ """
+ session = get_tensorrt_session(engine, gpu_index)
+ is_fp16 = engine.precision == "fp16"
+
+ def upscale(img: np.ndarray, _: object):
+ try:
+ # Convert to appropriate precision
+ lr_img = img.astype(np.float16) if is_fp16 else img.astype(np.float32)
+
+ # Convert to NCHW batched format
+ lr_img = _into_batched_form(lr_img)
+
+ # Run inference
+ output = session.infer(lr_img)
+
+ # Convert back to HWC format
+ output = _into_standard_image_form(output)
+
+ return output.astype(np.float32)
+
+ except Exception as e:
+ error_str = str(e).lower()
+ # Check for CUDA OOM errors
+ if (
+ "out of memory" in error_str
+                or ("cuda" in error_str and "memory" in error_str)
+ or "allocation" in error_str
+ ):
+ raise RuntimeError( # noqa: B904
+ "A VRAM out-of-memory error has occurred. Please try using a smaller tile size."
+ )
+ else:
+ # Re-raise the exception if not an OOM error
+ raise
+
+ try:
+ return auto_split(img, upscale, tiler)
+ finally:
+ gc.collect()
diff --git a/backend/src/nodes/impl/tensorrt/engine_builder.py b/backend/src/nodes/impl/tensorrt/engine_builder.py
new file mode 100644
index 0000000000..a736031126
--- /dev/null
+++ b/backend/src/nodes/impl/tensorrt/engine_builder.py
@@ -0,0 +1,268 @@
+"""TensorRT engine building utilities."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+from logger import logger
+
+from .memory import get_cuda_compute_capability, get_cuda_device_name
+from .model import TensorRTEngine, TensorRTEngineInfo
+
+
+@dataclass
+class BuildConfig:
+ """Configuration for TensorRT engine building."""
+
+ precision: Literal["fp32", "fp16"]
+ workspace_size_gb: float
+ min_shape: tuple[int, int] # (height, width)
+ opt_shape: tuple[int, int] # (height, width)
+ max_shape: tuple[int, int] # (height, width)
+ use_dynamic_shapes: bool
+
+
+class TrtLogger:
+ """Custom TensorRT logger that integrates with chaiNNer's logging."""
+
+ def __init__(self):
+ import tensorrt as trt
+
+ self.severity_map = {
+ trt.ILogger.Severity.INTERNAL_ERROR: logger.error,
+ trt.ILogger.Severity.ERROR: logger.error,
+ trt.ILogger.Severity.WARNING: logger.warning,
+ trt.ILogger.Severity.INFO: logger.info,
+ trt.ILogger.Severity.VERBOSE: logger.debug,
+ }
+
+ def log(self, severity, msg):
+ log_fn = self.severity_map.get(severity, logger.debug)
+ log_fn("[TensorRT] %s", msg)
+
+
+def get_trt_logger():
+ """Get a TensorRT logger instance."""
+ import tensorrt as trt
+
+ # Use a simple logger class
+ class SimpleLogger(trt.ILogger):
+ def log(self, severity, msg):
+ if severity <= trt.ILogger.Severity.WARNING:
+ logger.warning("[TensorRT] %s", msg)
+ elif severity <= trt.ILogger.Severity.INFO:
+ logger.info("[TensorRT] %s", msg)
+ else:
+ logger.debug("[TensorRT] %s", msg)
+
+ return SimpleLogger()
+
+
+def parse_onnx_model(
+ onnx_bytes: bytes,
+ network,
+ parser,
+) -> None:
+ """Parse an ONNX model into a TensorRT network."""
+ success = parser.parse(onnx_bytes)
+ if not success:
+ error_msgs = []
+ for i in range(parser.num_errors):
+ error = parser.get_error(i)
+ error_msgs.append(f" {error.desc()}")
+ errors_str = "\n".join(error_msgs)
+ raise RuntimeError(f"Failed to parse ONNX model:\n{errors_str}")
+
+
+def configure_builder_config(
+ builder,
+ config: BuildConfig,
+):
+ """Configure the TensorRT builder with the given settings."""
+ import tensorrt as trt
+
+ builder_config = builder.create_builder_config()
+
+ # Set workspace size
+ workspace_bytes = int(config.workspace_size_gb * (1024**3))
+ builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
+
+ # Set precision
+ if config.precision == "fp16":
+ if builder.platform_has_fast_fp16:
+ builder_config.set_flag(trt.BuilderFlag.FP16)
+ logger.info("FP16 mode enabled")
+ else:
+ logger.warning("FP16 not supported on this platform, falling back to FP32")
+
+ return builder_config
+
+
+def build_engine_from_onnx(
+ onnx_bytes: bytes,
+ config: BuildConfig,
+ gpu_index: int = 0,
+ timing_cache_path: str | None = None,
+) -> TensorRTEngine:
+ """
+ Build a TensorRT engine from an ONNX model.
+
+ Args:
+ onnx_bytes: The ONNX model as bytes
+ config: Build configuration
+ gpu_index: GPU device index to use
+ timing_cache_path: Optional path to timing cache for faster rebuilds
+
+ Returns:
+ A TensorRTEngine instance
+ """
+ import tensorrt as trt
+ from cuda import cudart
+
+ # Set the CUDA device
+ cudart.cudaSetDevice(gpu_index)
+
+ trt_logger = trt.Logger(trt.Logger.WARNING)
+
+ # Create builder and network
+ builder = trt.Builder(trt_logger)
+ network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(network_flags)
+ parser = trt.OnnxParser(network, trt_logger)
+
+ # Parse ONNX model
+ logger.info("Parsing ONNX model...")
+ parse_onnx_model(onnx_bytes, network, parser)
+
+ # Get input/output info
+ input_tensor = network.get_input(0)
+ output_tensor = network.get_output(0)
+
+ input_shape = input_tensor.shape
+ output_shape = output_tensor.shape
+
+ # Detect channels (assuming NCHW format)
+ input_channels = input_shape[1] if len(input_shape) >= 4 else 3
+ output_channels = output_shape[1] if len(output_shape) >= 4 else 3
+
+ # Calculate scale from input/output dimensions
+ scale = None
+ if len(input_shape) >= 4 and len(output_shape) >= 4:
+ in_h, in_w = input_shape[2], input_shape[3]
+ out_h, out_w = output_shape[2], output_shape[3]
+
+ if in_h > 0 and in_w > 0 and out_h > 0 and out_w > 0:
+ scale_h = out_h // in_h if out_h % in_h == 0 else None
+ scale_w = out_w // in_w if out_w % in_w == 0 else None
+ if scale_h == scale_w and scale_h is not None:
+ scale = scale_h
+
+ # Configure builder
+ builder_config = builder.create_builder_config()
+ workspace_bytes = int(config.workspace_size_gb * (1024**3))
+ builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
+
+ # Set precision
+ if config.precision == "fp16":
+ if builder.platform_has_fast_fp16:
+ builder_config.set_flag(trt.BuilderFlag.FP16)
+ logger.info("FP16 mode enabled")
+ else:
+ logger.warning("FP16 not supported on this platform, using FP32")
+
+ # Configure dynamic shapes if needed
+ has_dynamic = any(d == -1 for d in input_shape)
+ if has_dynamic or config.use_dynamic_shapes:
+ logger.info("Configuring dynamic shapes...")
+ profile = builder.create_optimization_profile()
+
+ min_h, min_w = config.min_shape
+ opt_h, opt_w = config.opt_shape
+ max_h, max_w = config.max_shape
+
+ # Set the optimization profile for the input tensor
+ min_dims = (1, input_channels, min_h, min_w)
+ opt_dims = (1, input_channels, opt_h, opt_w)
+ max_dims = (1, input_channels, max_h, max_w)
+
+ profile.set_shape(input_tensor.name, min_dims, opt_dims, max_dims)
+ builder_config.add_optimization_profile(profile)
+
+ # Load timing cache if available
+ if timing_cache_path:
+ try:
+ with open(timing_cache_path, "rb") as f:
+ cache_data = f.read()
+ timing_cache = builder_config.create_timing_cache(cache_data)
+ builder_config.set_timing_cache(timing_cache, ignore_mismatch=False)
+ logger.info("Loaded timing cache from %s", timing_cache_path)
+ except FileNotFoundError:
+ logger.debug("No timing cache found at %s", timing_cache_path)
+ except Exception as e:
+ logger.warning("Failed to load timing cache: %s", e)
+
+ # Build the engine
+ logger.info("Building TensorRT engine (this may take a while)...")
+ serialized_engine = builder.build_serialized_network(network, builder_config)
+
+ if serialized_engine is None:
+ raise RuntimeError("Failed to build TensorRT engine")
+
+ # Save timing cache
+ if timing_cache_path:
+ try:
+ timing_cache = builder_config.get_timing_cache()
+ if timing_cache:
+ cache_data = timing_cache.serialize()
+ with open(timing_cache_path, "wb") as f:
+ f.write(cache_data)
+ logger.info("Saved timing cache to %s", timing_cache_path)
+ except Exception as e:
+ logger.warning("Failed to save timing cache: %s", e)
+
+ # Get GPU info
+ major, minor = get_cuda_compute_capability(gpu_index)
+ gpu_arch = f"sm_{major}{minor}"
+ gpu_name = get_cuda_device_name(gpu_index)
+ logger.info("Built engine for %s (%s)", gpu_name, gpu_arch)
+
+ # Create engine info
+ info = TensorRTEngineInfo(
+ precision=config.precision,
+ input_channels=input_channels,
+ output_channels=output_channels,
+ scale=scale,
+ gpu_architecture=gpu_arch,
+ tensorrt_version=trt.__version__,
+ has_dynamic_shapes=has_dynamic or config.use_dynamic_shapes,
+        min_shape=(config.min_shape[1], config.min_shape[0]) if (has_dynamic or config.use_dynamic_shapes) else None,
+        opt_shape=(config.opt_shape[1], config.opt_shape[0]) if (has_dynamic or config.use_dynamic_shapes) else None,
+        max_shape=(config.max_shape[1], config.max_shape[0]) if (has_dynamic or config.use_dynamic_shapes) else None,
+ )
+
+ return TensorRTEngine(bytes(serialized_engine), info)
+
+
+def load_engine_from_bytes(
+ engine_bytes: bytes,
+ gpu_index: int = 0,
+) -> tuple[object, object]:
+ """
+ Load a TensorRT engine from bytes.
+
+ Returns the runtime and deserialized engine.
+ """
+ import tensorrt as trt
+ from cuda import cudart
+
+ cudart.cudaSetDevice(gpu_index)
+
+ trt_logger = trt.Logger(trt.Logger.WARNING)
+ runtime = trt.Runtime(trt_logger)
+
+ engine = runtime.deserialize_cuda_engine(engine_bytes)
+ if engine is None:
+ raise RuntimeError("Failed to deserialize TensorRT engine")
+
+ return runtime, engine
diff --git a/backend/src/nodes/impl/tensorrt/inference.py b/backend/src/nodes/impl/tensorrt/inference.py
new file mode 100644
index 0000000000..358d38c6aa
--- /dev/null
+++ b/backend/src/nodes/impl/tensorrt/inference.py
@@ -0,0 +1,239 @@
+"""TensorRT inference execution utilities."""
+
+from __future__ import annotations
+
+import numpy as np
+
+from logger import logger
+
+from .memory import CudaMemoryManager
+from .model import TensorRTEngine
+
+
+class TensorRTSession:
+ """
+ A session for running TensorRT inference.
+
+ Manages the execution context and memory buffers.
+ """
+
+ def __init__(
+ self,
+ engine: TensorRTEngine,
+ gpu_index: int = 0,
+ ):
+ self.engine = engine
+ self.gpu_index = gpu_index
+ self._trt_engine = None
+ self._runtime = None
+ self._context = None
+ self._memory_manager: CudaMemoryManager | None = None
+ self._stream = None
+ self._is_loaded = False
+
+ def load(self) -> None:
+ """Load the engine and create execution context."""
+ if self._is_loaded:
+ return
+
+ import tensorrt as trt
+ from cuda import cudart
+
+ cudart.cudaSetDevice(self.gpu_index)
+
+ trt_logger = trt.Logger(trt.Logger.WARNING)
+ self._runtime = trt.Runtime(trt_logger)
+ self._trt_engine = self._runtime.deserialize_cuda_engine(self.engine.bytes)
+
+ if self._trt_engine is None:
+ raise RuntimeError("Failed to deserialize TensorRT engine")
+
+ self._context = self._trt_engine.create_execution_context()
+ if self._context is None:
+ raise RuntimeError("Failed to create execution context")
+
+ self._memory_manager = CudaMemoryManager(self.gpu_index)
+ self._stream = self._memory_manager.create_stream()
+ self._is_loaded = True
+
+ def unload(self) -> None:
+ """Unload the engine and free resources."""
+ if not self._is_loaded:
+ return
+
+ if self._memory_manager:
+ self._memory_manager.cleanup()
+ self._memory_manager = None
+
+ self._context = None
+ self._trt_engine = None
+ self._runtime = None
+ self._stream = None
+ self._is_loaded = False
+
+ def __enter__(self):
+ self.load()
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.unload()
+ return False
+
+ @property
+ def is_loaded(self) -> bool:
+ return self._is_loaded
+
+ def _get_binding_info(self) -> tuple[str, str, tuple, tuple]:
+ """Get input and output binding information."""
+ input_name = self._trt_engine.get_tensor_name(0)
+ output_name = self._trt_engine.get_tensor_name(1)
+
+ input_shape = self._trt_engine.get_tensor_shape(input_name)
+ output_shape = self._trt_engine.get_tensor_shape(output_name)
+
+ return input_name, output_name, tuple(input_shape), tuple(output_shape)
+
+ def infer(self, input_array: np.ndarray) -> np.ndarray:
+ """
+ Run inference on an input array.
+
+ Args:
+ input_array: Input image in NCHW format (batch, channels, height, width)
+
+ Returns:
+ Output array in NCHW format
+ """
+ if not self._is_loaded:
+ raise RuntimeError("Session not loaded. Call load() first.")
+
+ import tensorrt as trt
+ from cuda import cudart
+
+ input_name, output_name, _, _ = self._get_binding_info()
+
+ # Get actual input shape
+ batch, channels, height, width = input_array.shape
+
+ # Set input shape for dynamic inputs
+ if self.engine.has_dynamic_shapes:
+ self._context.set_input_shape(input_name, (batch, channels, height, width))
+
+ # Get output shape (may depend on input shape for dynamic models)
+ output_shape = self._context.get_tensor_shape(output_name)
+
+ # Determine precision
+ is_fp16 = self.engine.precision == "fp16"
+ dtype = np.float16 if is_fp16 else np.float32
+
+ # Ensure input is contiguous and correct dtype
+ input_array = np.ascontiguousarray(input_array.astype(dtype))
+
+ # Allocate output array
+ output_array = np.zeros(output_shape, dtype=dtype)
+
+ # Allocate device memory
+ assert self._memory_manager is not None
+ input_buffer = self._memory_manager.allocate_like(input_array)
+ output_buffer = self._memory_manager.allocate_like(output_array)
+
+ try:
+ # Copy input to device
+ self._memory_manager.copy_to_device(input_array, input_buffer)
+
+ # Set tensor addresses
+ self._context.set_tensor_address(input_name, input_buffer.device_ptr)
+ self._context.set_tensor_address(output_name, output_buffer.device_ptr)
+
+ # Execute inference
+ success = self._context.execute_async_v3(self._stream)
+ if not success:
+ raise RuntimeError("TensorRT inference execution failed")
+
+ # Synchronize stream
+ self._memory_manager.synchronize_stream()
+
+ # Copy output to host
+ self._memory_manager.copy_to_host(output_buffer, output_array)
+
+ finally:
+ # Free temporary buffers
+ input_buffer.free()
+ output_buffer.free()
+ # Remove from tracked buffers
+ if input_buffer in self._memory_manager._buffers:
+ self._memory_manager._buffers.remove(input_buffer)
+ if output_buffer in self._memory_manager._buffers:
+ self._memory_manager._buffers.remove(output_buffer)
+
+ return output_array.astype(np.float32)
+
+
+# Session cache to avoid repeatedly loading engines
+_session_cache: dict[tuple[int, int], TensorRTSession] = {}
+
+
+def get_tensorrt_session(
+    engine: TensorRTEngine,
+    gpu_index: int = 0,
+) -> TensorRTSession:
+    """
+    Get a TensorRT session, using caching to avoid reloading.
+
+    The session is cached on the engine bytes identity and the GPU index.
+    """
+    cache_key = (id(engine.bytes), gpu_index)
+
+ if cache_key not in _session_cache:
+ session = TensorRTSession(engine, gpu_index)
+ session.load()
+ _session_cache[cache_key] = session
+
+ return _session_cache[cache_key]
+
+
+def clear_session_cache() -> None:
+ """Clear the session cache and unload all engines."""
+ global _session_cache
+ for session in _session_cache.values():
+ session.unload()
+ _session_cache.clear()
+
+
+def run_inference(
+ img: np.ndarray,
+ engine: TensorRTEngine,
+ gpu_index: int = 0,
+) -> np.ndarray:
+ """
+ Run TensorRT inference on an image.
+
+ Args:
+ img: Input image in HWC format (height, width, channels), float32, 0-1 range
+ engine: TensorRT engine
+ gpu_index: GPU device index
+
+ Returns:
+ Output image in HWC format
+ """
+ session = get_tensorrt_session(engine, gpu_index)
+
+ # Convert HWC to NCHW
+ if img.ndim == 2:
+ # Grayscale
+ img = img[np.newaxis, np.newaxis, :, :]
+ elif img.ndim == 3:
+ # HWC -> NCHW
+ img = img.transpose(2, 0, 1)[np.newaxis, :, :, :]
+ else:
+ raise ValueError(f"Unexpected image dimensions: {img.ndim}")
+
+ # Run inference
+ output = session.infer(img)
+
+ # Convert NCHW back to HWC
+ if output.ndim == 4:
+ output = output.squeeze(0).transpose(1, 2, 0)
+ elif output.ndim == 3:
+ output = output.transpose(1, 2, 0)
+
+ return output
diff --git a/backend/src/nodes/impl/tensorrt/memory.py b/backend/src/nodes/impl/tensorrt/memory.py
new file mode 100644
index 0000000000..d96a887860
--- /dev/null
+++ b/backend/src/nodes/impl/tensorrt/memory.py
@@ -0,0 +1,185 @@
+"""CUDA memory management utilities for TensorRT inference."""
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+if TYPE_CHECKING:
+ from cuda import cudart
+
+
+@dataclass
+class CudaBuffer:
+ """Represents a CUDA device memory buffer."""
+
+ device_ptr: int
+ size: int
+ dtype: np.dtype
+
+ def free(self) -> None:
+ """Free the device memory."""
+ from cuda import cudart
+
+ if self.device_ptr != 0:
+ cudart.cudaFree(self.device_ptr)
+ self.device_ptr = 0
+
+
+class CudaMemoryManager:
+ """
+ Manages CUDA memory allocation for TensorRT inference.
+
+ Uses RAII pattern with context managers to ensure proper cleanup.
+ """
+
+ def __init__(self, device_id: int = 0):
+ self.device_id = device_id
+ self._buffers: list[CudaBuffer] = []
+ self._stream: int | None = None
+
+ def _check_cuda_error(self, result: tuple) -> None:
+ """Check CUDA runtime API result for errors."""
+ err = result[0]
+ if err.value != 0:
+ from cuda import cudart
+
+            err_name = cudart.cudaGetErrorName(err)[1].decode()
+            err_string = cudart.cudaGetErrorString(err)[1].decode()
+ raise RuntimeError(f"CUDA Error {err_name}: {err_string}")
+
+ def allocate(self, size: int, dtype: np.dtype = np.float32) -> CudaBuffer:
+ """Allocate device memory."""
+ from cuda import cudart
+
+ result = cudart.cudaMalloc(size)
+ self._check_cuda_error(result)
+ device_ptr = result[1]
+ buffer = CudaBuffer(device_ptr, size, dtype)
+ self._buffers.append(buffer)
+ return buffer
+
+ def allocate_like(self, array: np.ndarray) -> CudaBuffer:
+ """Allocate device memory matching the size and dtype of an array."""
+ return self.allocate(array.nbytes, array.dtype)
+
+ def copy_to_device(self, host_array: np.ndarray, device_buffer: CudaBuffer) -> None:
+ """Copy data from host to device."""
+ from cuda import cudart
+
+ host_ptr = host_array.ctypes.data
+ self._check_cuda_error(
+ cudart.cudaMemcpy(
+ device_buffer.device_ptr,
+ host_ptr,
+ host_array.nbytes,
+ cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
+ )
+ )
+
+ def copy_to_host(
+ self, device_buffer: CudaBuffer, host_array: np.ndarray
+ ) -> np.ndarray:
+ """Copy data from device to host."""
+ from cuda import cudart
+
+ host_ptr = host_array.ctypes.data
+ self._check_cuda_error(
+ cudart.cudaMemcpy(
+ host_ptr,
+ device_buffer.device_ptr,
+ host_array.nbytes,
+ cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
+ )
+ )
+ return host_array
+
+ def create_stream(self) -> int:
+ """Create a CUDA stream."""
+ from cuda import cudart
+
+ result = cudart.cudaStreamCreate()
+ self._check_cuda_error(result)
+ self._stream = result[1]
+ return self._stream
+
+ def synchronize(self) -> None:
+ """Synchronize the CUDA device."""
+ from cuda import cudart
+
+ self._check_cuda_error(cudart.cudaDeviceSynchronize())
+
+ def synchronize_stream(self) -> None:
+ """Synchronize the CUDA stream."""
+ from cuda import cudart
+
+ if self._stream is not None:
+ self._check_cuda_error(cudart.cudaStreamSynchronize(self._stream))
+
+ def cleanup(self) -> None:
+ """Free all allocated resources."""
+ from cuda import cudart
+
+ for buffer in self._buffers:
+ buffer.free()
+ self._buffers.clear()
+
+ if self._stream is not None:
+ cudart.cudaStreamDestroy(self._stream)
+ self._stream = None
+
+
+@contextmanager
+def cuda_memory_context(device_id: int = 0):
+ """
+ Context manager for CUDA memory operations.
+
+ Ensures all allocated memory is freed when the context exits.
+ """
+ manager = CudaMemoryManager(device_id)
+ try:
+ yield manager
+ finally:
+ manager.cleanup()
+
+
+def check_cuda_available() -> bool:
+ """Check if CUDA is available."""
+ try:
+ from cuda import cudart
+
+ result = cudart.cudaGetDeviceCount()
+ return result[0].value == 0 and result[1] > 0
+ except ImportError:
+ return False
+ except Exception:
+ return False
+
+
+def get_cuda_device_name(device_id: int = 0) -> str:
+ """Get the name of a CUDA device."""
+ try:
+ from cuda import cudart
+
+ result = cudart.cudaGetDeviceProperties(device_id)
+ if result[0].value == 0:
+ return result[1].name.decode("utf-8") if isinstance(result[1].name, bytes) else result[1].name
+ return "Unknown"
+ except Exception:
+ return "Unknown"
+
+
+def get_cuda_compute_capability(device_id: int = 0) -> tuple[int, int]:
+ """Get the compute capability of a CUDA device."""
+ try:
+ from cuda import cudart
+
+ result = cudart.cudaGetDeviceProperties(device_id)
+ if result[0].value == 0:
+ return result[1].major, result[1].minor
+ return (0, 0)
+ except Exception:
+ return (0, 0)
diff --git a/backend/src/nodes/impl/tensorrt/model.py b/backend/src/nodes/impl/tensorrt/model.py
new file mode 100644
index 0000000000..8d0959c6b8
--- /dev/null
+++ b/backend/src/nodes/impl/tensorrt/model.py
@@ -0,0 +1,56 @@
+# This class defines an interface.
+# It is important that it does not contain types that depend on TensorRT.
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+
+@dataclass
+class TensorRTEngineInfo:
+ """Metadata about a TensorRT engine."""
+
+ precision: Literal["fp32", "fp16", "int8"]
+ input_channels: int
+ output_channels: int
+ scale: int | None
+ gpu_architecture: str
+ tensorrt_version: str
+ has_dynamic_shapes: bool
+ min_shape: tuple[int, int] | None # (width, height)
+ opt_shape: tuple[int, int] | None # (width, height)
+ max_shape: tuple[int, int] | None # (width, height)
+
+
+class TensorRTEngine:
+ """
+ A wrapper class for TensorRT engine data.
+
+ This class holds the serialized engine bytes and metadata without
+ requiring TensorRT to be imported. The actual engine loading and
+ inference are handled by separate utility functions.
+ """
+
+ def __init__(self, engine_bytes: bytes, info: TensorRTEngineInfo):
+ self.bytes: bytes = engine_bytes
+ self.info: TensorRTEngineInfo = info
+
+ @property
+ def precision(self) -> Literal["fp32", "fp16", "int8"]:
+ return self.info.precision
+
+ @property
+ def scale(self) -> int | None:
+ return self.info.scale
+
+ @property
+ def input_channels(self) -> int:
+ return self.info.input_channels
+
+ @property
+ def output_channels(self) -> int:
+ return self.info.output_channels
+
+ @property
+ def has_dynamic_shapes(self) -> bool:
+ return self.info.has_dynamic_shapes
diff --git a/backend/src/nodes/properties/inputs/__init__.py b/backend/src/nodes/properties/inputs/__init__.py
index ee136edcbd..ac63e30d46 100644
--- a/backend/src/nodes/properties/inputs/__init__.py
+++ b/backend/src/nodes/properties/inputs/__init__.py
@@ -16,3 +16,7 @@
from .pytorch_inputs import *
except Exception:
pass
+try:
+ from .tensorrt_inputs import *
+except Exception:
+ pass
diff --git a/backend/src/nodes/properties/inputs/file_inputs.py b/backend/src/nodes/properties/inputs/file_inputs.py
index e5df528a3b..4cd77d50c9 100644
--- a/backend/src/nodes/properties/inputs/file_inputs.py
+++ b/backend/src/nodes/properties/inputs/file_inputs.py
@@ -19,6 +19,7 @@
| Literal["param"]
| Literal["pt"]
| Literal["pth"]
+ | Literal["tensorrt"]
| Literal["video"]
)
@@ -188,6 +189,19 @@ def OnnxFileInput(primary_input: bool = False) -> FileInput:
)
+def TensorRTFileInput(primary_input: bool = False) -> FileInput:
+ """Input for submitting a local TensorRT engine file"""
+ return FileInput(
+ label="TensorRT Engine File",
+ file_kind="tensorrt",
+ filetypes=[".engine", ".trt", ".plan"],
+ primary_input=primary_input,
+ ).with_docs(
+ "TensorRT engine files (.engine, .trt, .plan) are pre-built for a specific GPU architecture.",
+ "Engines built on one GPU may not work on a different GPU architecture.",
+ )
+
+
_INVALID_PATH_CHARS = re.compile(r'[<>:"|?*\x00-\x1F]')
diff --git a/backend/src/nodes/properties/inputs/tensorrt_inputs.py b/backend/src/nodes/properties/inputs/tensorrt_inputs.py
new file mode 100644
index 0000000000..11cc440614
--- /dev/null
+++ b/backend/src/nodes/properties/inputs/tensorrt_inputs.py
@@ -0,0 +1,57 @@
+import navi
+from api import BaseInput
+
+from ...impl.tensorrt.model import TensorRTEngine
+from .generic_inputs import DropDownInput
+
+
+class TensorRTEngineInput(BaseInput):
+ """Input for TensorRT engine."""
+
+ def __init__(
+ self,
+ label: str = "TensorRT Engine",
+ input_type: navi.ExpressionJson = "TensorRTEngine",
+ ):
+ super().__init__(input_type, label)
+ self.associated_type = TensorRTEngine
+
+
+def TensorRTPrecisionDropdown() -> DropDownInput:
+ """Dropdown for selecting TensorRT precision mode."""
+ return DropDownInput(
+ input_type="TrtPrecision",
+ label="Precision",
+ options=[
+ {
+ "option": "FP32",
+ "value": "fp32",
+ "type": "TrtPrecision::fp32",
+ },
+ {
+ "option": "FP16",
+ "value": "fp16",
+ "type": "TrtPrecision::fp16",
+ },
+ ],
+ )
+
+
+def TensorRTShapeModeDropdown() -> DropDownInput:
+ """Dropdown for selecting TensorRT shape mode."""
+ return DropDownInput(
+ input_type="TrtShapeMode",
+ label="Shape Mode",
+ options=[
+ {
+ "option": "Fixed",
+ "value": "fixed",
+ "type": "TrtShapeMode::fixed",
+ },
+ {
+ "option": "Dynamic",
+ "value": "dynamic",
+ "type": "TrtShapeMode::dynamic",
+ },
+ ],
+ )
diff --git a/backend/src/nodes/properties/outputs/__init__.py b/backend/src/nodes/properties/outputs/__init__.py
index ea124b26db..006d46f166 100644
--- a/backend/src/nodes/properties/outputs/__init__.py
+++ b/backend/src/nodes/properties/outputs/__init__.py
@@ -14,3 +14,7 @@
from .pytorch_outputs import *
except Exception:
pass
+try:
+ from .tensorrt_outputs import *
+except Exception:
+ pass
diff --git a/backend/src/nodes/properties/outputs/tensorrt_outputs.py b/backend/src/nodes/properties/outputs/tensorrt_outputs.py
new file mode 100644
index 0000000000..55161c0c31
--- /dev/null
+++ b/backend/src/nodes/properties/outputs/tensorrt_outputs.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import navi
+from api import BaseOutput, OutputKind
+from nodes.utils.format import format_channel_numbers
+
+from ...impl.tensorrt.model import TensorRTEngine
+
+
+class TensorRTEngineOutput(BaseOutput):
+ """Output for TensorRT engine."""
+
+ def __init__(
+ self,
+ model_type: navi.ExpressionJson = "TensorRTEngine",
+ label: str = "TensorRT Engine",
+ kind: OutputKind = "generic",
+ ):
+ super().__init__(model_type, label, kind=kind, associated_type=TensorRTEngine)
+
+ def get_broadcast_data(self, value: TensorRTEngine):
+ i = value.info
+
+ tags: list[str] = []
+
+ # Add channel info
+ if i.input_channels is not None and i.output_channels is not None:
+ tags.append(format_channel_numbers(i.input_channels, i.output_channels))
+
+ # Add scale info
+ if i.scale is not None:
+ tags.append(f"{i.scale}x")
+
+ # Add precision
+ tags.append(i.precision.upper())
+
+ # Add architecture
+ tags.append(i.gpu_architecture)
+
+ return {"tags": tags}
+
+ def get_broadcast_type(self, value: TensorRTEngine):
+ fields: dict[str, navi.ExpressionJson] = {
+ "precision": navi.literal(value.info.precision),
+ }
+
+ i = value.info
+ if i.scale is not None:
+ fields["scale"] = i.scale
+ if i.input_channels is not None:
+ fields["inputChannels"] = i.input_channels
+ if i.output_channels is not None:
+ fields["outputChannels"] = i.output_channels
+
+ return navi.named("TensorRTEngine", fields)
diff --git a/backend/src/packages/chaiNNer_tensorrt/__init__.py b/backend/src/packages/chaiNNer_tensorrt/__init__.py
new file mode 100644
index 0000000000..7b2f3fee46
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/__init__.py
@@ -0,0 +1,60 @@
+from api import GB, MB, Dependency, add_package
+from gpu import nvidia
+from logger import logger
+from system import is_arm_mac
+
+package_description = (
+ "TensorRT provides native NVIDIA TensorRT support for optimized GPU inference. "
+ "It offers significant performance improvements over ONNX Runtime by using "
+ "NVIDIA's proprietary optimization engine. Engines are built specifically for "
+ "your GPU architecture for maximum performance."
+)
+
+install_hint = (
+ "TensorRT uses NVIDIA's TensorRT engine for GPU-accelerated inference. "
+ "Requires an NVIDIA GPU with CUDA support."
+)
+
+# Only define the package if an NVIDIA GPU is available and we are not on an ARM Mac
+if nvidia.is_available and not is_arm_mac:
+ package = add_package(
+ __file__,
+ id="chaiNNer_tensorrt",
+ name="TensorRT",
+ description=package_description,
+ dependencies=[
+ Dependency(
+ display_name="TensorRT",
+ pypi_name="tensorrt",
+ version="10.0.1",
+ size_estimate=int(1.2 * GB),
+ auto_update=False,
+ ),
+ Dependency(
+ display_name="CUDA Python",
+ pypi_name="cuda-python",
+ version="12.3.0",
+ size_estimate=20 * MB,
+ ),
+ ],
+ icon="Nvidia",
+ color="#76B900",
+ )
+
+ tensorrt_category = package.add_category(
+ name="TensorRT",
+ description="Nodes for using NVIDIA TensorRT for optimized GPU inference.",
+ icon="Nvidia",
+ color="#76B900",
+ install_hint=install_hint,
+ )
+
+ logger.debug("Loaded package %s", package.name)
+else:
+ # Create a dummy for imports to not fail
+ package = None # type: ignore
+ tensorrt_category = None # type: ignore
+ if is_arm_mac:
+ logger.debug("TensorRT package not available on ARM Mac")
+ else:
+ logger.debug("TensorRT package not available (no NVIDIA GPU detected)")
diff --git a/backend/src/packages/chaiNNer_tensorrt/settings.py b/backend/src/packages/chaiNNer_tensorrt/settings.py
new file mode 100644
index 0000000000..7495c2ff18
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/settings.py
@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+
+from api import CacheSetting, DropdownSetting, NodeContext, NumberSetting
+from gpu import nvidia
+from logger import logger
+
+from . import package
+
+if package is not None:
+ package.add_setting(
+ DropdownSetting(
+ label="GPU",
+ key="gpu_index",
+ description="Which GPU to use for TensorRT. This is only relevant if you have multiple GPUs.",
+ options=[{"label": d.name, "value": str(d.index)} for d in nvidia.devices],
+ default="0",
+ )
+ )
+
+ should_fp16 = nvidia.is_available and nvidia.all_support_fp16
+
+ package.add_setting(
+ DropdownSetting(
+ label="Default Precision",
+ key="default_precision",
+ description="Default precision for building new TensorRT engines. FP16 is faster on RTX GPUs.",
+ options=[
+ {"label": "FP32 (Higher Precision)", "value": "fp32"},
+ {"label": "FP16 (Faster on RTX GPUs)", "value": "fp16"},
+ ],
+ default="fp16" if should_fp16 else "fp32",
+ )
+ )
+
+ package.add_setting(
+ NumberSetting(
+ label="Workspace Size (GB)",
+ key="workspace_size",
+ description="Maximum GPU memory to use during engine building. Larger values may allow better optimizations.",
+ default=4.0,
+ min=1.0,
+ max=32.0,
+ )
+ )
+
+ package.add_setting(
+ CacheSetting(
+ label="Engine Cache",
+ key="engine_cache",
+ description="Directory to cache built TensorRT engines. Engines are specific to your GPU and TensorRT version.",
+ directory="tensorrt_engine_cache",
+ )
+ )
+
+ package.add_setting(
+ CacheSetting(
+ label="Timing Cache",
+ key="timing_cache",
+ description="Directory for TensorRT timing cache. Speeds up engine building for similar models.",
+ directory="tensorrt_timing_cache",
+ )
+ )
+
+
+@dataclass(frozen=True)
+class TensorRTSettings:
+ gpu_index: int
+ default_precision: str
+ workspace_size: float
+ engine_cache_path: str | None
+ timing_cache_path: str | None
+
+
+def _get_float(settings, key: str, default: float) -> float:
+    """Read a float setting via the parser's name-mangled private store, falling back to *default*."""
+ raw = settings._SettingsParser__settings.get(key, default)
+ if isinstance(raw, (int, float)):
+ return float(raw)
+ return default
+
+
+def get_settings(context: NodeContext) -> TensorRTSettings:
+ settings = context.settings
+
+ engine_cache_path = settings.get_cache_location("engine_cache")
+ if engine_cache_path and not os.path.exists(engine_cache_path):
+ os.makedirs(engine_cache_path)
+ logger.info("Created TensorRT engine cache at: %s", engine_cache_path)
+
+ timing_cache_path = settings.get_cache_location("timing_cache")
+ if timing_cache_path and not os.path.exists(timing_cache_path):
+ os.makedirs(timing_cache_path)
+ logger.info("Created TensorRT timing cache at: %s", timing_cache_path)
+
+ return TensorRTSettings(
+ gpu_index=settings.get_int("gpu_index", 0, parse_str=True),
+ default_precision=settings.get_str("default_precision", "fp32"),
+ workspace_size=_get_float(settings, "workspace_size", 4.0),
+ engine_cache_path=engine_cache_path,
+ timing_cache_path=timing_cache_path,
+ )
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/__init__.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/__init__.py
new file mode 100644
index 0000000000..17d4021625
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/__init__.py
@@ -0,0 +1,10 @@
+from .. import tensorrt_category
+
+if tensorrt_category is not None:
+ io_group = tensorrt_category.add_node_group("Input & Output")
+ processing_group = tensorrt_category.add_node_group("Processing")
+ utility_group = tensorrt_category.add_node_group("Utility")
+else:
+ io_group = None # type: ignore
+ processing_group = None # type: ignore
+ utility_group = None # type: ignore
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/__init__.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/__init__.py
new file mode 100644
index 0000000000..6e9c2cdebf
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/__init__.py
@@ -0,0 +1 @@
+# IO nodes for TensorRT
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/build_engine_from_onnx.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/build_engine_from_onnx.py
new file mode 100644
index 0000000000..b6321bfd8b
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/build_engine_from_onnx.py
@@ -0,0 +1,199 @@
+from __future__ import annotations
+
+from enum import Enum
+
+from api import NodeContext
+from logger import logger
+from nodes.groups import Condition, if_enum_group
+from nodes.impl.onnx.model import OnnxModel
+from nodes.impl.tensorrt.engine_builder import BuildConfig, build_engine_from_onnx
+from nodes.impl.tensorrt.model import TensorRTEngine
+from nodes.properties.inputs import (
+ EnumInput,
+ NumberInput,
+ OnnxModelInput,
+)
+from nodes.properties.outputs import TensorRTEngineOutput, TextOutput
+
+from ...settings import get_settings
+from .. import io_group
+
+
+class Precision(Enum):
+ FP32 = "fp32"
+ FP16 = "fp16"
+
+
+PRECISION_LABELS = {
+ Precision.FP32: "FP32 (Higher Precision)",
+ Precision.FP16: "FP16 (Faster on RTX GPUs)",
+}
+
+
+class ShapeMode(Enum):
+ FIXED = "fixed"
+ DYNAMIC = "dynamic"
+
+
+SHAPE_MODE_LABELS = {
+ ShapeMode.FIXED: "Fixed (Single Size)",
+ ShapeMode.DYNAMIC: "Dynamic (Variable Sizes)",
+}
+
+
+if io_group is not None:
+
+ @io_group.register(
+ schema_id="chainner:tensorrt:build_engine",
+ name="Build Engine from ONNX",
+ description=[
+ "Convert an ONNX model to a TensorRT engine.",
+ "Building an engine can take several minutes depending on the model size and optimization settings.",
+ "The built engine is optimized specifically for your GPU and TensorRT version.",
+ "It is recommended to save the built engine for reuse, as building is slow.",
+ ],
+ icon="Nvidia",
+ inputs=[
+ OnnxModelInput("ONNX Model"),
+ EnumInput(
+ Precision,
+ label="Precision",
+ default=Precision.FP16,
+ option_labels=PRECISION_LABELS,
+ ).with_docs(
+ "FP16 is faster on RTX GPUs and uses less VRAM.",
+ "FP32 provides higher precision but is slower.",
+ ),
+ EnumInput(
+ ShapeMode,
+ label="Shape Mode",
+ default=ShapeMode.FIXED,
+ option_labels=SHAPE_MODE_LABELS,
+ ).with_docs(
+ "Fixed: Build engine for a single input size. Fastest inference.",
+ "Dynamic: Build engine for variable input sizes. More flexible but slightly slower.",
+ ),
+ if_enum_group(2, ShapeMode.DYNAMIC)(
+ NumberInput(
+ "Min Height",
+ default=64,
+ min=16,
+ max=4096,
+ unit="px",
+ ).with_docs("Minimum input height for dynamic shapes."),
+ NumberInput(
+ "Min Width",
+ default=64,
+ min=16,
+ max=4096,
+ unit="px",
+ ).with_docs("Minimum input width for dynamic shapes."),
+ NumberInput(
+ "Optimal Height",
+ default=512,
+ min=16,
+ max=4096,
+ unit="px",
+ ).with_docs("Optimal input height (used for optimization)."),
+ NumberInput(
+ "Optimal Width",
+ default=512,
+ min=16,
+ max=4096,
+ unit="px",
+ ).with_docs("Optimal input width (used for optimization)."),
+ NumberInput(
+ "Max Height",
+ default=2048,
+ min=16,
+ max=8192,
+ unit="px",
+ ).with_docs("Maximum input height for dynamic shapes."),
+ NumberInput(
+ "Max Width",
+ default=2048,
+ min=16,
+ max=8192,
+ unit="px",
+ ).with_docs("Maximum input width for dynamic shapes."),
+ ),
+ NumberInput(
+ "Workspace (GB)",
+ default=4.0,
+ min=1.0,
+ max=32.0,
+ precision=1,
+ step=0.5,
+ ).with_docs(
+ "Maximum GPU memory for building. Larger values may allow better optimizations."
+ ),
+ ],
+ outputs=[
+ TensorRTEngineOutput(kind="tagged"),
+ TextOutput("Build Info"),
+ ],
+ node_context=True,
+ )
+ def build_engine_from_onnx_node(
+ context: NodeContext,
+ onnx_model: OnnxModel,
+ precision: Precision,
+ shape_mode: ShapeMode,
+ min_height: int,
+ min_width: int,
+ opt_height: int,
+ opt_width: int,
+ max_height: int,
+ max_width: int,
+ workspace: float,
+ ) -> tuple[TensorRTEngine, str]:
+ settings = get_settings(context)
+ gpu_index = settings.gpu_index
+
+ # Determine timing cache path
+ timing_cache_path = None
+ if settings.timing_cache_path:
+ import hashlib
+
+            # Cache key derived from only the first 1 KiB of the model bytes (fast, but may collide for models sharing a header)
+ model_hash = hashlib.md5(onnx_model.bytes[:1024]).hexdigest()[:8]
+ timing_cache_path = f"{settings.timing_cache_path}/timing_{model_hash}.cache"
+
+ use_dynamic = shape_mode == ShapeMode.DYNAMIC
+
+        # For fixed mode, the dynamic-range inputs are ignored; overwrite them with built-in defaults
+ if not use_dynamic:
+ min_height = min_width = 64
+ opt_height = opt_width = 256
+ max_height = max_width = 256
+
+ config = BuildConfig(
+ precision=precision.value,
+ workspace_size_gb=workspace,
+ min_shape=(min_height, min_width),
+ opt_shape=(opt_height, opt_width),
+ max_shape=(max_height, max_width),
+ use_dynamic_shapes=use_dynamic,
+ )
+
+ logger.info(
+ "Building TensorRT engine: precision=%s, dynamic=%s, workspace=%.1fGB",
+ precision.value,
+ use_dynamic,
+ workspace,
+ )
+
+ engine = build_engine_from_onnx(
+ onnx_model.bytes,
+ config,
+ gpu_index=gpu_index,
+ timing_cache_path=timing_cache_path,
+ )
+
+ build_info = (
+ f"Built {precision.value.upper()} engine for {engine.info.gpu_architecture}"
+ )
+ if use_dynamic:
+ build_info += f" (dynamic: {min_height}x{min_width} to {max_height}x{max_width})"
+
+ return engine, build_info
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py
new file mode 100644
index 0000000000..4ecb56241a
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py
@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+from api import NodeContext
+from logger import logger
+from nodes.impl.tensorrt.memory import get_cuda_compute_capability
+from nodes.impl.tensorrt.model import TensorRTEngine, TensorRTEngineInfo
+from nodes.properties.inputs import TensorRTFileInput
+from nodes.properties.outputs import DirectoryOutput, FileNameOutput, TensorRTEngineOutput
+from nodes.utils.utils import split_file_path
+
+from ...settings import get_settings
+from .. import io_group
+
+if io_group is not None:
+
+ @io_group.register(
+ schema_id="chainner:tensorrt:load_engine",
+ name="Load Engine",
+ description=(
+ "Load a TensorRT engine file (.engine, .trt, .plan). "
+ "TensorRT engines are built for a specific GPU architecture and may not work "
+ "on different GPUs. The node will warn you if there's a potential compatibility issue."
+ ),
+ icon="Nvidia",
+ inputs=[TensorRTFileInput(primary_input=True)],
+ outputs=[
+ TensorRTEngineOutput(kind="tagged").suggest(),
+ DirectoryOutput("Directory", of_input=0).with_id(2),
+ FileNameOutput("Name", of_input=0).with_id(1),
+ ],
+ side_effects=True,
+ node_context=True,
+ )
+ def load_engine_node(
+ context: NodeContext, path: Path
+ ) -> tuple[TensorRTEngine, Path, str]:
+ import tensorrt as trt
+
+ assert os.path.exists(path), f"Engine file at location {path} does not exist"
+ assert os.path.isfile(path), f"Path {path} is not a file"
+
+ logger.debug("Reading TensorRT engine from path: %s", path)
+
+ settings = get_settings(context)
+ gpu_index = settings.gpu_index
+
+        # Record the current GPU's compute architecture (no actual compatibility comparison is performed here)
+ current_major, current_minor = get_cuda_compute_capability(gpu_index)
+ current_arch = f"sm_{current_major}{current_minor}"
+
+ # Load engine bytes
+ with open(path, "rb") as f:
+ engine_bytes = f.read()
+
+ # Deserialize to get engine info
+ trt_logger = trt.Logger(trt.Logger.WARNING)
+ runtime = trt.Runtime(trt_logger)
+ engine = runtime.deserialize_cuda_engine(engine_bytes)
+
+ if engine is None:
+ raise RuntimeError(
+ f"Failed to deserialize TensorRT engine from {path}. "
+ "This may happen if the engine was built with an incompatible TensorRT version "
+ "or for a different GPU architecture."
+ )
+
+ # Extract engine information
+ input_name = engine.get_tensor_name(0)
+ output_name = engine.get_tensor_name(1)
+
+ input_shape = engine.get_tensor_shape(input_name)
+ output_shape = engine.get_tensor_shape(output_name)
+
+ # Detect channels
+ input_channels = input_shape[1] if len(input_shape) >= 4 else 3
+ output_channels = output_shape[1] if len(output_shape) >= 4 else 3
+
+ # Calculate scale
+ scale = None
+ if len(input_shape) >= 4 and len(output_shape) >= 4:
+ in_h, in_w = input_shape[2], input_shape[3]
+ out_h, out_w = output_shape[2], output_shape[3]
+ if in_h > 0 and in_w > 0 and out_h > 0 and out_w > 0:
+ scale_h = out_h // in_h if out_h % in_h == 0 else None
+ scale_w = out_w // in_w if out_w % in_w == 0 else None
+ if scale_h == scale_w and scale_h is not None:
+ scale = scale_h
+
+ # Check for dynamic shapes
+ has_dynamic = any(d == -1 for d in input_shape)
+
+        # TensorRT provides no direct API to query the precision an engine was built with,
+        # so we unconditionally assume FP32 for externally loaded engines.
+ precision = "fp32" # Default assumption
+
+ # Create info
+ info = TensorRTEngineInfo(
+ precision=precision,
+ input_channels=input_channels,
+ output_channels=output_channels,
+ scale=scale,
+ gpu_architecture=current_arch,
+ tensorrt_version=trt.__version__,
+ has_dynamic_shapes=has_dynamic,
+ min_shape=None,
+ opt_shape=None,
+ max_shape=None,
+ )
+
+ dirname, basename, _ = split_file_path(path)
+ return TensorRTEngine(engine_bytes, info), dirname, basename
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py
new file mode 100644
index 0000000000..fee3600295
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from logger import logger
+from nodes.impl.tensorrt.model import TensorRTEngine
+from nodes.properties.inputs import DirectoryInput, RelativePathInput, TensorRTEngineInput
+
+from .. import io_group
+
+if io_group is not None:
+
+ @io_group.register(
+ schema_id="chainner:tensorrt:save_engine",
+ name="Save Engine",
+ description=(
+ "Save a TensorRT engine to a file (.engine). "
+ "The saved engine is specific to your GPU architecture and TensorRT version."
+ ),
+ icon="MdSave",
+ inputs=[
+ TensorRTEngineInput(),
+ DirectoryInput(must_exist=False),
+ RelativePathInput("Engine Name"),
+ ],
+ outputs=[],
+ side_effects=True,
+ )
+ def save_engine_node(
+ engine: TensorRTEngine, directory: Path, engine_name: str
+ ) -> None:
+ full_path = (directory / f"{engine_name}.engine").resolve()
+ logger.debug("Writing TensorRT engine to path: %s", full_path)
+ full_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(full_path, "wb") as f:
+ f.write(engine.bytes)
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/__init__.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/__init__.py
new file mode 100644
index 0000000000..f82e4595cc
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/__init__.py
@@ -0,0 +1 @@
+# Processing nodes for TensorRT
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
new file mode 100644
index 0000000000..5723efd5a2
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
@@ -0,0 +1,126 @@
+from __future__ import annotations
+
+import numpy as np
+
+from api import NodeContext
+from logger import logger
+from nodes.groups import Condition, if_enum_group, if_group
+from nodes.impl.tensorrt.auto_split import tensorrt_auto_split
+from nodes.impl.tensorrt.model import TensorRTEngine
+from nodes.impl.upscale.auto_split_tiles import (
+ CUSTOM,
+ TILE_SIZE_256,
+ TileSize,
+ parse_tile_size_input,
+)
+from nodes.impl.upscale.convenient_upscale import convenient_upscale
+from nodes.impl.upscale.tiler import MaxTileSize
+from nodes.properties.inputs import (
+ BoolInput,
+ ImageInput,
+ NumberInput,
+ TensorRTEngineInput,
+ TileSizeDropdown,
+)
+from nodes.properties.outputs import ImageOutput
+from nodes.utils.utils import get_h_w_c
+
+from ...settings import get_settings
+from .. import processing_group
+
+
+def upscale(
+ img: np.ndarray,
+ engine: TensorRTEngine,
+ tile_size: TileSize,
+ gpu_index: int,
+) -> np.ndarray:
+ logger.debug("Upscaling image with TensorRT")
+
+ def estimate():
+ # Conservative estimate for TensorRT
+ return MaxTileSize(TILE_SIZE_256)
+
+ tiler = parse_tile_size_input(tile_size, estimate)
+
+ return tensorrt_auto_split(img, engine, tiler, gpu_index=gpu_index)
+
+
+if processing_group is not None:
+
+ @processing_group.register(
+ schema_id="chainner:tensorrt:upscale_image",
+ description=(
+ "Upscales an image using a TensorRT engine. TensorRT provides optimized GPU inference "
+ "for maximum performance. If you encounter out-of-memory errors, try using a smaller tile size."
+ ),
+ inputs=[
+ ImageInput().with_id(1),
+ TensorRTEngineInput().with_id(0),
+ TileSizeDropdown(estimate=False, default=TILE_SIZE_256)
+ .with_id(2)
+ .with_docs(
+ "Tiled upscaling is used to allow large images to be upscaled without hitting memory limits.",
+ "This works by splitting the image into tiles (with overlap), upscaling each tile individually, and seamlessly recombining them.",
+ "Generally it's recommended to use the largest tile size possible for best performance, but depending on the model and image size, this may not be possible.",
+ ),
+ if_enum_group(2, CUSTOM)(
+ NumberInput(
+ "Custom Tile Size",
+ min=1,
+ max=None,
+ default=TILE_SIZE_256,
+ unit="px",
+ )
+ ),
+ if_group(Condition.type(1, "Image { channels: 4 } "))(
+ BoolInput("Separate Alpha", default=False).with_docs(
+ "Upscale alpha separately from color. Enabling this option will cause the alpha of"
+ " the upscaled image to be less noisy and more accurate to the alpha of the original"
+ " image, but the image may suffer from dark borders near transparency edges"
+ " (transition from fully transparent to fully opaque).",
+ "Whether enabling this option will improve the upscaled image depends on the original"
+ " image. We generally recommend this option for images with smooth transitions between"
+ " transparent and opaque regions.",
+ )
+ ),
+ ],
+ outputs=[
+ ImageOutput(
+ "Image",
+ image_type="convenientUpscaleTrt(Input0, Input1)",
+ )
+ ],
+ name="Upscale Image",
+ icon="Nvidia",
+ node_context=True,
+ )
+ def upscale_image_node(
+ context: NodeContext,
+ img: np.ndarray,
+ engine: TensorRTEngine,
+ tile_size: TileSize,
+ custom_tile_size: int,
+ separate_alpha: bool,
+ ) -> np.ndarray:
+ settings = get_settings(context)
+ gpu_index = settings.gpu_index
+
+ in_nc = engine.input_channels
+ out_nc = engine.output_channels
+
+ h, w, c = get_h_w_c(img)
+ logger.debug("Image is %dx%dx%d", h, w, c)
+
+ return convenient_upscale(
+ img,
+ in_nc,
+ out_nc,
+ lambda i: upscale(
+ i,
+ engine,
+ TileSize(custom_tile_size) if tile_size == CUSTOM else tile_size,
+ gpu_index,
+ ),
+ separate_alpha,
+ )
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/__init__.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/__init__.py
new file mode 100644
index 0000000000..813067c4b9
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/__init__.py
@@ -0,0 +1 @@
+# Utility nodes for TensorRT
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py
new file mode 100644
index 0000000000..531cb6ac99
--- /dev/null
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from nodes.impl.tensorrt.model import TensorRTEngine
+from nodes.properties.inputs import TensorRTEngineInput
+from nodes.properties.outputs import NumberOutput, TextOutput
+
+from .. import utility_group
+
+if utility_group is not None:
+
+ @utility_group.register(
+ schema_id="chainner:tensorrt:engine_info",
+ name="Get Engine Info",
+ description="Returns information about a TensorRT engine.",
+ icon="ImInfo",
+ inputs=[TensorRTEngineInput("TensorRT Engine")],
+ outputs=[
+ NumberOutput(
+ "Scale",
+ output_type="""
+ if Input0.scale == null { 0 } else { Input0.scale }
+ """,
+ ),
+ TextOutput(
+ "Precision",
+ output_type="Input0.precision",
+ ),
+ TextOutput(
+ "GPU Architecture",
+ ),
+ TextOutput(
+ "TensorRT Version",
+ ),
+ TextOutput(
+ "Shape Info",
+ ),
+ ],
+ )
+ def get_engine_info_node(
+ engine: TensorRTEngine,
+ ) -> tuple[int, str, str, str, str]:
+ info = engine.info
+
+ scale = info.scale if info.scale is not None else 0
+ precision = info.precision.upper()
+ gpu_arch = info.gpu_architecture
+ trt_version = info.tensorrt_version
+
+ if info.has_dynamic_shapes:
+ if info.min_shape and info.max_shape:
+ shape_info = (
+ f"Dynamic: {info.min_shape[0]}x{info.min_shape[1]} to "
+ f"{info.max_shape[0]}x{info.max_shape[1]}"
+ )
+ else:
+ shape_info = "Dynamic shapes"
+ else:
+ shape_info = "Fixed shapes"
+
+ return scale, precision, gpu_arch, trt_version, shape_info
diff --git a/backend/src/server.py b/backend/src/server.py
index 6c6fbd944c..d90489768f 100644
--- a/backend/src/server.py
+++ b/backend/src/server.py
@@ -634,6 +634,7 @@ async def import_packages(
importlib.import_module("packages.chaiNNer_pytorch")
importlib.import_module("packages.chaiNNer_ncnn")
importlib.import_module("packages.chaiNNer_onnx")
+ importlib.import_module("packages.chaiNNer_tensorrt")
importlib.import_module("packages.chaiNNer_external")
logger.info("Loading Nodes...")
diff --git a/src/common/types/chainner-scope.ts b/src/common/types/chainner-scope.ts
index acee884469..dd1ef6548b 100644
--- a/src/common/types/chainner-scope.ts
+++ b/src/common/types/chainner-scope.ts
@@ -92,6 +92,16 @@ let OnnxGenericModel = OnnxModel {
subType: "Generic",
};
+struct TensorRTEngine {
+ scale: int(1..),
+ inputChannels: int(1..),
+ outputChannels: int(1..),
+ precision: string,
+}
+
+enum TrtPrecision { fp32, fp16 }
+enum TrtShapeMode { fixed, dynamic }
+
def pytorchToOnnx(model: PyTorchModel): OnnxModel {
OnnxModel {
scaleHeight: model.scale,
@@ -148,6 +158,17 @@ def convenientUpscaleOnnx(model: OnnxModel, image: Image) {
}
}
}
+def convenientUpscaleTrt(engine: TensorRTEngine, image: Image) {
+ Image {
+ width: engine.scale * image.width,
+ height: engine.scale * image.height,
+ channels: if engine.inputChannels == engine.outputChannels {
+ image.channels
+ } else {
+ engine.outputChannels
+ }
+ }
+}
struct SplitFilePath {
dir: Directory,
diff --git a/src/renderer/colors.scss b/src/renderer/colors.scss
index f99b79ee40..41f1b98dbe 100644
--- a/src/renderer/colors.scss
+++ b/src/renderer/colors.scss
@@ -35,6 +35,7 @@
--type-color-torch: #dd6b20;
--type-color-onnx: #63b3ed;
--type-color-ncnn: #ed64a6;
+ --type-color-tensorrt: #76b900;
}
:root[data-theme='dark'] {
@@ -373,4 +374,5 @@
--type-color-torch: #fbae2e;
--type-color-onnx: #ac9fdc;
--type-color-ncnn: #ae5354;
+ --type-color-tensorrt: #90c41c;
}
diff --git a/src/renderer/components/CustomIcons.tsx b/src/renderer/components/CustomIcons.tsx
index 0db24f8865..088f03f4de 100644
--- a/src/renderer/components/CustomIcons.tsx
+++ b/src/renderer/components/CustomIcons.tsx
@@ -68,6 +68,17 @@ export const NcnnIcon = createIcon({
),
});
+export const NvidiaIcon = createIcon({
+ displayName: 'NvidiaIcon',
+ viewBox: '0 0 24 24',
+ path: (
+
+ ),
+});
+
interface IconFactoryProps {
icon?: string | null;
accentColor?: string;
@@ -110,6 +121,13 @@ export const IconFactory = memo(({ icon, accentColor, boxSize = 4 }: IconFactory
transition="0.15s ease-in-out"
/>
);
+ case 'Nvidia':
+ return (
+
+ );
default:
break;
}
diff --git a/src/renderer/helpers/accentColors.ts b/src/renderer/helpers/accentColors.ts
index 38dc236113..337d7d5d73 100644
--- a/src/renderer/helpers/accentColors.ts
+++ b/src/renderer/helpers/accentColors.ts
@@ -30,6 +30,7 @@ const colorList = () => {
{ type: resolveName('PyTorchModel'), color: getComputedColor('--type-color-torch') },
{ type: resolveName('OnnxModel'), color: getComputedColor('--type-color-onnx') },
{ type: resolveName('NcnnNetwork'), color: getComputedColor('--type-color-ncnn') },
+ { type: resolveName('TensorRTEngine'), color: getComputedColor('--type-color-tensorrt') },
];
};
// eslint-disable-next-line @typescript-eslint/no-unused-vars
From ba0e4f192c30cd09781fe52aa43210c75e80e54f Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Thu, 29 Jan 2026 01:01:39 -0600
Subject: [PATCH 02/20] Fix inference
---
backend/src/nodes/impl/tensorrt/auto_split.py | 20 +++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/backend/src/nodes/impl/tensorrt/auto_split.py b/backend/src/nodes/impl/tensorrt/auto_split.py
index 079ba0c429..137d6530e0 100644
--- a/backend/src/nodes/impl/tensorrt/auto_split.py
+++ b/backend/src/nodes/impl/tensorrt/auto_split.py
@@ -40,6 +40,20 @@ def _into_standard_image_form(img: np.ndarray) -> np.ndarray:
raise ValueError("Unsupported output tensor shape")
+def _flip_r_b_channels(img: np.ndarray) -> np.ndarray:
+ """Flip R and B channels (RGB <-> BGR conversion)."""
+ shape_size = len(img.shape)
+ if shape_size != 3:
+ return img
+ if img.shape[2] == 3:
+ # (H, W, C) RGB -> BGR - use ascontiguousarray to avoid stride issues
+ return np.ascontiguousarray(np.flip(img, 2))
+ elif img.shape[2] == 4:
+ # (H, W, C) RGBA -> BGRA
+ return np.dstack((img[:, :, 2], img[:, :, 1], img[:, :, 0], img[:, :, 3]))
+ return img
+
+
def tensorrt_auto_split(
img: np.ndarray,
engine: TensorRTEngine,
@@ -66,6 +80,9 @@ def upscale(img: np.ndarray, _: object):
# Convert to appropriate precision
lr_img = img.astype(np.float16) if is_fp16 else img.astype(np.float32)
+ # Convert RGB to BGR (most models expect BGR)
+ lr_img = _flip_r_b_channels(lr_img)
+
# Convert to NCHW batched format
lr_img = _into_batched_form(lr_img)
@@ -75,6 +92,9 @@ def upscale(img: np.ndarray, _: object):
# Convert back to HWC format
output = _into_standard_image_form(output)
+ # Convert BGR back to RGB
+ output = _flip_r_b_channels(output)
+
return output.astype(np.float32)
except Exception as e:
From fa3e854d726998d3a954540a614944727b854e68 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Fri, 30 Jan 2026 00:33:01 -0600
Subject: [PATCH 03/20] Clear cache after inference
---
.../chaiNNer_tensorrt/tensorrt/processing/upscale_image.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
index 5723efd5a2..ad1677bdc0 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
@@ -6,6 +6,7 @@
from logger import logger
from nodes.groups import Condition, if_enum_group, if_group
from nodes.impl.tensorrt.auto_split import tensorrt_auto_split
+from nodes.impl.tensorrt.inference import clear_session_cache
from nodes.impl.tensorrt.model import TensorRTEngine
from nodes.impl.upscale.auto_split_tiles import (
CUSTOM,
@@ -106,6 +107,8 @@ def upscale_image_node(
settings = get_settings(context)
gpu_index = settings.gpu_index
+ context.add_cleanup(clear_session_cache, after="chain")
+
in_nc = engine.input_channels
out_nc = engine.output_channels
From f39445b796043f5c1a262692e037de7c3bce4c5c Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sat, 31 Jan 2026 16:29:30 -0600
Subject: [PATCH 04/20] TensorRT improvements
---
.../src/nodes/impl/tensorrt/engine_builder.py | 16 +++--
backend/src/nodes/impl/tensorrt/inference.py | 7 +-
backend/src/nodes/impl/tensorrt/memory.py | 32 +++++----
backend/src/nodes/impl/tensorrt/model.py | 6 +-
.../properties/outputs/tensorrt_outputs.py | 15 +++--
.../packages/chaiNNer_tensorrt/__init__.py | 8 +--
.../chaiNNer_tensorrt/tensorrt/io/__init__.py | 1 -
.../tensorrt/io/load_engine.py | 67 +++++++++++++++----
.../tensorrt/processing/__init__.py | 1 -
.../tensorrt/processing/upscale_image.py | 2 +-
.../tensorrt/utility/__init__.py | 1 -
.../build_engine.py} | 31 +++++----
src/renderer/components/CustomIcons.tsx | 18 -----
13 files changed, 122 insertions(+), 83 deletions(-)
delete mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/io/__init__.py
delete mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/__init__.py
delete mode 100644 backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/__init__.py
rename backend/src/packages/chaiNNer_tensorrt/tensorrt/{io/build_engine_from_onnx.py => utility/build_engine.py} (88%)
diff --git a/backend/src/nodes/impl/tensorrt/engine_builder.py b/backend/src/nodes/impl/tensorrt/engine_builder.py
index a736031126..37441ac7a7 100644
--- a/backend/src/nodes/impl/tensorrt/engine_builder.py
+++ b/backend/src/nodes/impl/tensorrt/engine_builder.py
@@ -118,7 +118,7 @@ def build_engine_from_onnx(
A TensorRTEngine instance
"""
import tensorrt as trt
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
# Set the CUDA device
cudart.cudaSetDevice(gpu_index)
@@ -236,9 +236,15 @@ def build_engine_from_onnx(
gpu_architecture=gpu_arch,
tensorrt_version=trt.__version__,
has_dynamic_shapes=has_dynamic or config.use_dynamic_shapes,
- min_shape=(config.min_shape[1], config.min_shape[0]) if config.use_dynamic_shapes else None,
- opt_shape=(config.opt_shape[1], config.opt_shape[0]) if config.use_dynamic_shapes else None,
- max_shape=(config.max_shape[1], config.max_shape[0]) if config.use_dynamic_shapes else None,
+ min_shape=(config.min_shape[1], config.min_shape[0])
+ if config.use_dynamic_shapes
+ else None,
+ opt_shape=(config.opt_shape[1], config.opt_shape[0])
+ if config.use_dynamic_shapes
+ else None,
+ max_shape=(config.max_shape[1], config.max_shape[0])
+ if config.use_dynamic_shapes
+ else None,
)
return TensorRTEngine(bytes(serialized_engine), info)
@@ -254,7 +260,7 @@ def load_engine_from_bytes(
Returns the runtime and deserialized engine.
"""
import tensorrt as trt
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
cudart.cudaSetDevice(gpu_index)
diff --git a/backend/src/nodes/impl/tensorrt/inference.py b/backend/src/nodes/impl/tensorrt/inference.py
index 358d38c6aa..9884c1e17a 100644
--- a/backend/src/nodes/impl/tensorrt/inference.py
+++ b/backend/src/nodes/impl/tensorrt/inference.py
@@ -4,8 +4,6 @@
import numpy as np
-from logger import logger
-
from .memory import CudaMemoryManager
from .model import TensorRTEngine
@@ -37,7 +35,7 @@ def load(self) -> None:
return
import tensorrt as trt
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
cudart.cudaSetDevice(self.gpu_index)
@@ -106,9 +104,6 @@ def infer(self, input_array: np.ndarray) -> np.ndarray:
if not self._is_loaded:
raise RuntimeError("Session not loaded. Call load() first.")
- import tensorrt as trt
- from cuda import cudart
-
input_name, output_name, _, _ = self._get_binding_info()
# Get actual input shape
diff --git a/backend/src/nodes/impl/tensorrt/memory.py b/backend/src/nodes/impl/tensorrt/memory.py
index d96a887860..9c7644dec5 100644
--- a/backend/src/nodes/impl/tensorrt/memory.py
+++ b/backend/src/nodes/impl/tensorrt/memory.py
@@ -9,7 +9,7 @@
import numpy as np
if TYPE_CHECKING:
- from cuda import cudart
+ pass
@dataclass
@@ -22,7 +22,7 @@ class CudaBuffer:
def free(self) -> None:
"""Free the device memory."""
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
if self.device_ptr != 0:
cudart.cudaFree(self.device_ptr)
@@ -45,7 +45,7 @@ def _check_cuda_error(self, result: tuple) -> None:
"""Check CUDA runtime API result for errors."""
err = result[0]
if err.value != 0:
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
err_name = cudart.cudaGetErrorName(err)[1]
err_string = cudart.cudaGetErrorString(err)[1]
@@ -53,7 +53,7 @@ def _check_cuda_error(self, result: tuple) -> None:
def allocate(self, size: int, dtype: np.dtype = np.float32) -> CudaBuffer:
"""Allocate device memory."""
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
result = cudart.cudaMalloc(size)
self._check_cuda_error(result)
@@ -68,7 +68,7 @@ def allocate_like(self, array: np.ndarray) -> CudaBuffer:
def copy_to_device(self, host_array: np.ndarray, device_buffer: CudaBuffer) -> None:
"""Copy data from host to device."""
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
host_ptr = host_array.ctypes.data
self._check_cuda_error(
@@ -84,7 +84,7 @@ def copy_to_host(
self, device_buffer: CudaBuffer, host_array: np.ndarray
) -> np.ndarray:
"""Copy data from device to host."""
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
host_ptr = host_array.ctypes.data
self._check_cuda_error(
@@ -99,7 +99,7 @@ def copy_to_host(
def create_stream(self) -> int:
"""Create a CUDA stream."""
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
result = cudart.cudaStreamCreate()
self._check_cuda_error(result)
@@ -108,20 +108,20 @@ def create_stream(self) -> int:
def synchronize(self) -> None:
"""Synchronize the CUDA device."""
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
self._check_cuda_error(cudart.cudaDeviceSynchronize())
def synchronize_stream(self) -> None:
"""Synchronize the CUDA stream."""
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
if self._stream is not None:
self._check_cuda_error(cudart.cudaStreamSynchronize(self._stream))
def cleanup(self) -> None:
"""Free all allocated resources."""
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
for buffer in self._buffers:
buffer.free()
@@ -149,7 +149,7 @@ def cuda_memory_context(device_id: int = 0):
def check_cuda_available() -> bool:
"""Check if CUDA is available."""
try:
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
result = cudart.cudaGetDeviceCount()
return result[0].value == 0 and result[1] > 0
@@ -162,11 +162,15 @@ def check_cuda_available() -> bool:
def get_cuda_device_name(device_id: int = 0) -> str:
"""Get the name of a CUDA device."""
try:
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
result = cudart.cudaGetDeviceProperties(device_id)
if result[0].value == 0:
- return result[1].name.decode("utf-8") if isinstance(result[1].name, bytes) else result[1].name
+ return (
+ result[1].name.decode("utf-8")
+ if isinstance(result[1].name, bytes)
+ else result[1].name
+ )
return "Unknown"
except Exception:
return "Unknown"
@@ -175,7 +179,7 @@ def get_cuda_device_name(device_id: int = 0) -> str:
def get_cuda_compute_capability(device_id: int = 0) -> tuple[int, int]:
"""Get the compute capability of a CUDA device."""
try:
- from cuda import cudart
+ from cuda.bindings import runtime as cudart
result = cudart.cudaGetDeviceProperties(device_id)
if result[0].value == 0:
diff --git a/backend/src/nodes/impl/tensorrt/model.py b/backend/src/nodes/impl/tensorrt/model.py
index 8d0959c6b8..1a9b5ca3dd 100644
--- a/backend/src/nodes/impl/tensorrt/model.py
+++ b/backend/src/nodes/impl/tensorrt/model.py
@@ -17,9 +17,9 @@ class TensorRTEngineInfo:
gpu_architecture: str
tensorrt_version: str
has_dynamic_shapes: bool
- min_shape: tuple[int, int] | None # (width, height)
- opt_shape: tuple[int, int] | None # (width, height)
- max_shape: tuple[int, int] | None # (width, height)
+ min_shape: tuple[int, int, int, int] | None # (batch, channels, height, width)
+ opt_shape: tuple[int, int, int, int] | None # (batch, channels, height, width)
+ max_shape: tuple[int, int, int, int] | None # (batch, channels, height, width)
class TensorRTEngine:
diff --git a/backend/src/nodes/properties/outputs/tensorrt_outputs.py b/backend/src/nodes/properties/outputs/tensorrt_outputs.py
index 55161c0c31..a31b294d31 100644
--- a/backend/src/nodes/properties/outputs/tensorrt_outputs.py
+++ b/backend/src/nodes/properties/outputs/tensorrt_outputs.py
@@ -24,8 +24,7 @@ def get_broadcast_data(self, value: TensorRTEngine):
tags: list[str] = []
# Add channel info
- if i.input_channels is not None and i.output_channels is not None:
- tags.append(format_channel_numbers(i.input_channels, i.output_channels))
+ tags.append(format_channel_numbers(i.input_channels, i.output_channels))
# Add scale info
if i.scale is not None:
@@ -37,6 +36,14 @@ def get_broadcast_data(self, value: TensorRTEngine):
# Add architecture
tags.append(i.gpu_architecture)
+ # Add dynamic shape info
+ if i.min_shape is not None:
+ tags.append("Min: " + str(i.min_shape[2]) + "x" + str(i.min_shape[3]))
+ if i.opt_shape is not None:
+ tags.append("Opt: " + str(i.opt_shape[2]) + "x" + str(i.opt_shape[3]))
+ if i.max_shape is not None:
+ tags.append("Max: " + str(i.max_shape[2]) + "x" + str(i.max_shape[3]))
+
return {"tags": tags}
def get_broadcast_type(self, value: TensorRTEngine):
@@ -47,9 +54,9 @@ def get_broadcast_type(self, value: TensorRTEngine):
i = value.info
if i.scale is not None:
fields["scale"] = i.scale
- if i.input_channels is not None:
+ if i.input_channels:
fields["inputChannels"] = i.input_channels
- if i.output_channels is not None:
+ if i.output_channels:
fields["outputChannels"] = i.output_channels
return navi.named("TensorRTEngine", fields)
diff --git a/backend/src/packages/chaiNNer_tensorrt/__init__.py b/backend/src/packages/chaiNNer_tensorrt/__init__.py
index 7b2f3fee46..8aa3f0cd99 100644
--- a/backend/src/packages/chaiNNer_tensorrt/__init__.py
+++ b/backend/src/packages/chaiNNer_tensorrt/__init__.py
@@ -26,25 +26,25 @@
Dependency(
display_name="TensorRT",
pypi_name="tensorrt",
- version="10.0.1",
+ version="10.15.1.29",
size_estimate=int(1.2 * GB),
auto_update=False,
),
Dependency(
display_name="CUDA Python",
pypi_name="cuda-python",
- version="12.3.0",
+ version="13.1.1",
size_estimate=20 * MB,
),
],
- icon="Nvidia",
+ icon="BsNvidia",
color="#76B900",
)
tensorrt_category = package.add_category(
name="TensorRT",
description="Nodes for using NVIDIA TensorRT for optimized GPU inference.",
- icon="Nvidia",
+ icon="BsNvidia",
color="#76B900",
install_hint=install_hint,
)
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/__init__.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/__init__.py
deleted file mode 100644
index 6e9c2cdebf..0000000000
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# IO nodes for TensorRT
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py
index 4ecb56241a..05794591fd 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py
@@ -3,12 +3,18 @@
import os
from pathlib import Path
+import tensorrt as trt
+
from api import NodeContext
from logger import logger
from nodes.impl.tensorrt.memory import get_cuda_compute_capability
from nodes.impl.tensorrt.model import TensorRTEngine, TensorRTEngineInfo
from nodes.properties.inputs import TensorRTFileInput
-from nodes.properties.outputs import DirectoryOutput, FileNameOutput, TensorRTEngineOutput
+from nodes.properties.outputs import (
+ DirectoryOutput,
+ FileNameOutput,
+ TensorRTEngineOutput,
+)
from nodes.utils.utils import split_file_path
from ...settings import get_settings
@@ -24,7 +30,7 @@
"TensorRT engines are built for a specific GPU architecture and may not work "
"on different GPUs. The node will warn you if there's a potential compatibility issue."
),
- icon="Nvidia",
+ icon="BsNvidia",
inputs=[TensorRTFileInput(primary_input=True)],
outputs=[
TensorRTEngineOutput(kind="tagged").suggest(),
@@ -37,8 +43,6 @@
def load_engine_node(
context: NodeContext, path: Path
) -> tuple[TensorRTEngine, Path, str]:
- import tensorrt as trt
-
assert os.path.exists(path), f"Engine file at location {path} does not exist"
assert os.path.isfile(path), f"Path {path} is not a file"
@@ -56,8 +60,8 @@ def load_engine_node(
engine_bytes = f.read()
# Deserialize to get engine info
- trt_logger = trt.Logger(trt.Logger.WARNING)
- runtime = trt.Runtime(trt_logger)
+ trt_logger = trt.Logger(trt.Logger.WARNING) # type: ignore
+ runtime = trt.Runtime(trt_logger) # type: ignore
engine = runtime.deserialize_cuda_engine(engine_bytes)
if engine is None:
@@ -72,7 +76,9 @@ def load_engine_node(
output_name = engine.get_tensor_name(1)
input_shape = engine.get_tensor_shape(input_name)
+ logger.info("Input shape: %s", input_shape)
output_shape = engine.get_tensor_shape(output_name)
+ logger.info("Output shape: %s", output_shape)
# Detect channels
input_channels = input_shape[1] if len(input_shape) >= 4 else 3
@@ -92,9 +98,46 @@ def load_engine_node(
# Check for dynamic shapes
has_dynamic = any(d == -1 for d in input_shape)
- # Detect precision from the engine (this is a heuristic)
- # TensorRT doesn't provide a direct way to query the precision
- precision = "fp32" # Default assumption
+ # Detect precision from the engine
+ precision = "fp16"
+
+ # log out all properties of engine
+ # logger.info(dir(engine))
+ # 'device_memory_size', 'device_memory_size_v2', 'engine_capability', 'error_recorder', 'get_aliased_input_tensor', 'get_device_memory_size_for_profile', 'get_device_memory_size_for_profile_v2', 'get_engine_stat', 'get_tensor_bytes_per_component', 'get_tensor_components_per_element', 'get_tensor_dtype', 'get_tensor_format', 'get_tensor_format_desc', 'get_tensor_location', 'get_tensor_mode', 'get_tensor_name', 'get_tensor_profile_shape', 'get_tensor_profile_values', 'get_tensor_shape', 'get_tensor_vectorized_dim', 'get_weight_streaming_automatic_budget'
+ logger.info("device_memory_size: %s", engine.device_memory_size)
+ logger.info("engine_capability: %s", engine.engine_capability)
+ logger.info("get_engine_stat: %s", engine.get_engine_stat())
+ logger.info(
+ "get_tensor_dtype for input '%s': %s",
+ input_name,
+ engine.get_tensor_dtype(input_name),
+ )
+ logger.info(
+ "get_tensor_dtype for output '%s': %s",
+ output_name,
+ engine.get_tensor_dtype(output_name),
+ )
+ logger.info(
+ "get_tensor_format for input '%s': %s",
+ input_name,
+ engine.get_tensor_format(input_name),
+ )
+ logger.info(
+ "get_tensor_format for output '%s': %s",
+ output_name,
+ engine.get_tensor_format(output_name),
+ )
+
+ precision = (
+ "fp16"
+ if engine.get_tensor_dtype(input_name) == trt.DataType.HALF
+ else "fp32"
+ )
+
+ tensor_profile_name = input_name
+ min_shape, opt_shape, max_shape = engine.get_tensor_profile_shape(
+ tensor_profile_name, 0
+ )
# Create info
info = TensorRTEngineInfo(
@@ -105,9 +148,9 @@ def load_engine_node(
gpu_architecture=current_arch,
tensorrt_version=trt.__version__,
has_dynamic_shapes=has_dynamic,
- min_shape=None,
- opt_shape=None,
- max_shape=None,
+ min_shape=min_shape,
+ opt_shape=opt_shape,
+ max_shape=max_shape,
)
dirname, basename, _ = split_file_path(path)
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/__init__.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/__init__.py
deleted file mode 100644
index f82e4595cc..0000000000
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Processing nodes for TensorRT
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
index ad1677bdc0..1f9a083a7a 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
@@ -93,7 +93,7 @@ def estimate():
)
],
name="Upscale Image",
- icon="Nvidia",
+ icon="BsNvidia",
node_context=True,
)
def upscale_image_node(
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/__init__.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/__init__.py
deleted file mode 100644
index 813067c4b9..0000000000
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Utility nodes for TensorRT
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/build_engine_from_onnx.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py
similarity index 88%
rename from backend/src/packages/chaiNNer_tensorrt/tensorrt/io/build_engine_from_onnx.py
rename to backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py
index b6321bfd8b..1e8f83fdb0 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/build_engine_from_onnx.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py
@@ -4,7 +4,7 @@
from api import NodeContext
from logger import logger
-from nodes.groups import Condition, if_enum_group
+from nodes.groups import if_enum_group
from nodes.impl.onnx.model import OnnxModel
from nodes.impl.tensorrt.engine_builder import BuildConfig, build_engine_from_onnx
from nodes.impl.tensorrt.model import TensorRTEngine
@@ -16,7 +16,7 @@
from nodes.properties.outputs import TensorRTEngineOutput, TextOutput
from ...settings import get_settings
-from .. import io_group
+from .. import utility_group
class Precision(Enum):
@@ -26,7 +26,7 @@ class Precision(Enum):
PRECISION_LABELS = {
Precision.FP32: "FP32 (Higher Precision)",
- Precision.FP16: "FP16 (Faster on RTX GPUs)",
+ Precision.FP16: "FP16 (Faster)",
}
@@ -41,18 +41,18 @@ class ShapeMode(Enum):
}
-if io_group is not None:
+if utility_group is not None:
- @io_group.register(
+ @utility_group.register(
schema_id="chainner:tensorrt:build_engine",
- name="Build Engine from ONNX",
+ name="Build Engine",
description=[
"Convert an ONNX model to a TensorRT engine.",
"Building an engine can take several minutes depending on the model size and optimization settings.",
"The built engine is optimized specifically for your GPU and TensorRT version.",
"It is recommended to save the built engine for reuse, as building is slow.",
],
- icon="Nvidia",
+ icon="BsNvidia",
inputs=[
OnnxModelInput("ONNX Model"),
EnumInput(
@@ -61,8 +61,8 @@ class ShapeMode(Enum):
default=Precision.FP16,
option_labels=PRECISION_LABELS,
).with_docs(
- "FP16 is faster on RTX GPUs and uses less VRAM.",
- "FP32 provides higher precision but is slower.",
+ "FP16: lower precision but faster and uses less memory, especially on RTX GPUs. FP16 also does not work with certain models.",
+ "FP32: higher precision but slower. Use especially if FP16 fails.",
),
EnumInput(
ShapeMode,
@@ -125,7 +125,8 @@ class ShapeMode(Enum):
precision=1,
step=0.5,
).with_docs(
- "Maximum GPU memory for building. Larger values may allow better optimizations."
+ "Maximum GPU memory for building. Larger values may allow better optimizations.",
+ hint=True,
),
],
outputs=[
@@ -134,7 +135,7 @@ class ShapeMode(Enum):
],
node_context=True,
)
- def build_engine_from_onnx_node(
+ def build_engine_node(
context: NodeContext,
onnx_model: OnnxModel,
precision: Precision,
@@ -157,7 +158,9 @@ def build_engine_from_onnx_node(
# Create a cache key based on the model
model_hash = hashlib.md5(onnx_model.bytes[:1024]).hexdigest()[:8]
- timing_cache_path = f"{settings.timing_cache_path}/timing_{model_hash}.cache"
+ timing_cache_path = (
+ f"{settings.timing_cache_path}/timing_{model_hash}.cache"
+ )
use_dynamic = shape_mode == ShapeMode.DYNAMIC
@@ -194,6 +197,8 @@ def build_engine_from_onnx_node(
f"Built {precision.value.upper()} engine for {engine.info.gpu_architecture}"
)
if use_dynamic:
- build_info += f" (dynamic: {min_height}x{min_width} to {max_height}x{max_width})"
+ build_info += (
+ f" (dynamic: {min_height}x{min_width} to {max_height}x{max_width})"
+ )
return engine, build_info
diff --git a/src/renderer/components/CustomIcons.tsx b/src/renderer/components/CustomIcons.tsx
index 088f03f4de..0db24f8865 100644
--- a/src/renderer/components/CustomIcons.tsx
+++ b/src/renderer/components/CustomIcons.tsx
@@ -68,17 +68,6 @@ export const NcnnIcon = createIcon({
),
});
-export const NvidiaIcon = createIcon({
- displayName: 'NvidiaIcon',
- viewBox: '0 0 24 24',
- path: (
-
- ),
-});
-
interface IconFactoryProps {
icon?: string | null;
accentColor?: string;
@@ -121,13 +110,6 @@ export const IconFactory = memo(({ icon, accentColor, boxSize = 4 }: IconFactory
transition="0.15s ease-in-out"
/>
);
- case 'Nvidia':
- return (
-
- );
default:
break;
}
From 77fd6f5b2414df2aa9256ea8602693fdd1f83014 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sat, 31 Jan 2026 16:58:30 -0600
Subject: [PATCH 05/20] Fix static shape
---
.../properties/outputs/tensorrt_outputs.py | 24 +++++++++------
.../tensorrt/utility/build_engine.py | 30 +++++++++++++++----
2 files changed, 40 insertions(+), 14 deletions(-)
diff --git a/backend/src/nodes/properties/outputs/tensorrt_outputs.py b/backend/src/nodes/properties/outputs/tensorrt_outputs.py
index a31b294d31..1228f95ece 100644
--- a/backend/src/nodes/properties/outputs/tensorrt_outputs.py
+++ b/backend/src/nodes/properties/outputs/tensorrt_outputs.py
@@ -34,15 +34,21 @@ def get_broadcast_data(self, value: TensorRTEngine):
tags.append(i.precision.upper())
# Add architecture
- tags.append(i.gpu_architecture)
-
- # Add dynamic shape info
- if i.min_shape is not None:
- tags.append("Min: " + str(i.min_shape[2]) + "x" + str(i.min_shape[3]))
- if i.opt_shape is not None:
- tags.append("Opt: " + str(i.opt_shape[2]) + "x" + str(i.opt_shape[3]))
- if i.max_shape is not None:
- tags.append("Max: " + str(i.max_shape[2]) + "x" + str(i.max_shape[3]))
+ # tags.append(i.gpu_architecture)
+
+ # Add shape info
+ if i.has_dynamic_shapes:
+ tags.append("Dynamic")
+ if i.min_shape is not None:
+ tags.append("Min: " + str(i.min_shape[2]) + "x" + str(i.min_shape[3]))
+ if i.opt_shape is not None:
+ tags.append("Opt: " + str(i.opt_shape[2]) + "x" + str(i.opt_shape[3]))
+ if i.max_shape is not None:
+ tags.append("Max: " + str(i.max_shape[2]) + "x" + str(i.max_shape[3]))
+ else:
+ tags.append("Fixed")
+ if i.opt_shape is not None:
+ tags.append(str(i.opt_shape[2]) + "x" + str(i.opt_shape[3]))
return {"tags": tags}
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py
index 1e8f83fdb0..dde82f38be 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py
@@ -117,6 +117,22 @@ class ShapeMode(Enum):
unit="px",
).with_docs("Maximum input width for dynamic shapes."),
),
+ if_enum_group(2, ShapeMode.FIXED)(
+ NumberInput(
+ "Height",
+ default=256,
+ min=16,
+ max=8192,
+ unit="px",
+ ).with_docs("Fixed input height."),
+ NumberInput(
+ "Width",
+ default=256,
+ min=16,
+ max=8192,
+ unit="px",
+ ).with_docs("Fixed input width."),
+ ),
NumberInput(
"Workspace (GB)",
default=4.0,
@@ -130,7 +146,7 @@ class ShapeMode(Enum):
),
],
outputs=[
- TensorRTEngineOutput(kind="tagged"),
+ TensorRTEngineOutput(),
TextOutput("Build Info"),
],
node_context=True,
@@ -146,6 +162,8 @@ def build_engine_node(
opt_width: int,
max_height: int,
max_width: int,
+ static_height: int,
+ static_width: int,
workspace: float,
) -> tuple[TensorRTEngine, str]:
settings = get_settings(context)
@@ -164,11 +182,13 @@ def build_engine_node(
use_dynamic = shape_mode == ShapeMode.DYNAMIC
- # For fixed mode, use reasonable defaults
if not use_dynamic:
- min_height = min_width = 64
- opt_height = opt_width = 256
- max_height = max_width = 256
+ min_height = static_height
+ min_width = static_width
+ opt_height = static_height
+ opt_width = static_width
+ max_height = static_height
+ max_width = static_width
config = BuildConfig(
precision=precision.value,
From 96253ffa040c326db5057b74072b4372396fd0e7 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sat, 31 Jan 2026 19:36:18 -0600
Subject: [PATCH 06/20] Type errors when image size unsupported by TRT engine
---
.../properties/outputs/tensorrt_outputs.py | 23 +++++++++++++
.../tensorrt/processing/upscale_image.py | 33 ++++++++++++++++++-
src/common/types/chainner-scope.ts | 6 ++++
3 files changed, 61 insertions(+), 1 deletion(-)
diff --git a/backend/src/nodes/properties/outputs/tensorrt_outputs.py b/backend/src/nodes/properties/outputs/tensorrt_outputs.py
index 1228f95ece..37270ca12d 100644
--- a/backend/src/nodes/properties/outputs/tensorrt_outputs.py
+++ b/backend/src/nodes/properties/outputs/tensorrt_outputs.py
@@ -65,4 +65,27 @@ def get_broadcast_type(self, value: TensorRTEngine):
if i.output_channels:
fields["outputChannels"] = i.output_channels
+ # Add shape constraints (min/max height/width)
+ # Shape tuples are NCHW format: (batch, channels, height, width)
+ if i.min_shape is not None:
+ fields["minHeight"] = i.min_shape[2]
+ fields["minWidth"] = i.min_shape[3]
+ else:
+ fields["minHeight"] = "null"
+ fields["minWidth"] = "null"
+
+ if i.max_shape is not None:
+ fields["maxHeight"] = i.max_shape[2]
+ fields["maxWidth"] = i.max_shape[3]
+ else:
+ fields["maxHeight"] = "null"
+ fields["maxWidth"] = "null"
+
+ if i.opt_shape is not None:
+ fields["optHeight"] = i.opt_shape[2]
+ fields["optWidth"] = i.opt_shape[3]
+ else:
+ fields["optHeight"] = "null"
+ fields["optWidth"] = "null"
+
return navi.named("TensorRTEngine", fields)
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
index 1f9a083a7a..0a8999a1d9 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
@@ -89,7 +89,38 @@ def estimate():
outputs=[
ImageOutput(
"Image",
- image_type="convenientUpscaleTrt(Input0, Input1)",
+ image_type="""
+ let engine = Input0;
+ let image = Input1;
+
+ // Check minimum size constraints
+ let minWidthOk = match engine.minWidth {
+ null => true,
+ _ as w => image.width >= w
+ };
+ let minHeightOk = match engine.minHeight {
+ null => true,
+ _ as h => image.height >= h
+ };
+
+ // Check maximum size constraints
+ let maxWidthOk = match engine.maxWidth {
+ null => true,
+ _ as w => image.width <= w
+ };
+ let maxHeightOk = match engine.maxHeight {
+ null => true,
+ _ as h => image.height <= h
+ };
+
+ if not minWidthOk or not minHeightOk {
+ error("Image is smaller than the minimum size supported by this TensorRT engine.")
+ } else if not maxWidthOk or not maxHeightOk {
+ error("Image is larger than the maximum size supported by this TensorRT engine.")
+ } else {
+ convenientUpscaleTrt(engine, image)
+ }
+ """,
)
],
name="Upscale Image",
diff --git a/src/common/types/chainner-scope.ts b/src/common/types/chainner-scope.ts
index dd1ef6548b..cd235611c4 100644
--- a/src/common/types/chainner-scope.ts
+++ b/src/common/types/chainner-scope.ts
@@ -97,6 +97,12 @@ struct TensorRTEngine {
inputChannels: int(1..),
outputChannels: int(1..),
precision: string,
+ minHeight: int(1..) | null,
+ minWidth: int(1..) | null,
+ optHeight: int(1..) | null,
+ optWidth: int(1..) | null,
+ maxHeight: int(1..) | null,
+ maxWidth: int(1..) | null,
}
enum TrtPrecision { fp32, fp16 }
From f6e5cb38ed1fa835c5acab250316ac7053412b95 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sat, 31 Jan 2026 22:20:07 -0600
Subject: [PATCH 07/20] comment out validation for now
---
.../tensorrt/processing/upscale_image.py | 59 ++++++++++---------
1 file changed, 31 insertions(+), 28 deletions(-)
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
index 0a8999a1d9..960582a312 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
@@ -90,36 +90,39 @@ def estimate():
ImageOutput(
"Image",
image_type="""
+ // Until we can take tile size into account, we don't want to error unnecessarily, so I've commented out the checks for now
+
let engine = Input0;
let image = Input1;
-
- // Check minimum size constraints
- let minWidthOk = match engine.minWidth {
- null => true,
- _ as w => image.width >= w
- };
- let minHeightOk = match engine.minHeight {
- null => true,
- _ as h => image.height >= h
- };
-
- // Check maximum size constraints
- let maxWidthOk = match engine.maxWidth {
- null => true,
- _ as w => image.width <= w
- };
- let maxHeightOk = match engine.maxHeight {
- null => true,
- _ as h => image.height <= h
- };
-
- if not minWidthOk or not minHeightOk {
- error("Image is smaller than the minimum size supported by this TensorRT engine.")
- } else if not maxWidthOk or not maxHeightOk {
- error("Image is larger than the maximum size supported by this TensorRT engine.")
- } else {
- convenientUpscaleTrt(engine, image)
- }
+ convenientUpscaleTrt(engine, image)
+
+ // // Check minimum size constraints
+ // let minWidthOk = match engine.minWidth {
+ // null => true,
+ // _ as w => image.width >= w
+ // };
+ // let minHeightOk = match engine.minHeight {
+ // null => true,
+ // _ as h => image.height >= h
+ // };
+
+ // // Check maximum size constraints
+ // let maxWidthOk = match engine.maxWidth {
+ // null => true,
+ // _ as w => image.width <= w
+ // };
+ // let maxHeightOk = match engine.maxHeight {
+ // null => true,
+ // _ as h => image.height <= h
+ // };
+
+ // if not minWidthOk or not minHeightOk {
+ // error("Image is smaller than the minimum size supported by this TensorRT engine.")
+ // } else if not maxWidthOk or not maxHeightOk {
+ // error("Image is larger than the maximum size supported by this TensorRT engine.")
+ // } else {
+ // convenientUpscaleTrt(engine, image)
+ // }
""",
)
],
From 6dcf53ed307e26a5cf60431f844c8d61ceaf680f Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sat, 31 Jan 2026 22:20:35 -0600
Subject: [PATCH 08/20] Remove some unneeded logging
---
.../tensorrt/io/load_engine.py | 29 -------------------
1 file changed, 29 deletions(-)
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py
index 05794591fd..4fcff48e99 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py
@@ -99,35 +99,6 @@ def load_engine_node(
has_dynamic = any(d == -1 for d in input_shape)
# Detect precision from the engine
- precision = "fp16"
-
- # log out all properties of engine
- # logger.info(dir(engine))
- # 'device_memory_size', 'device_memory_size_v2', 'engine_capability', 'error_recorder', 'get_aliased_input_tensor', 'get_device_memory_size_for_profile', 'get_device_memory_size_for_profile_v2', 'get_engine_stat', 'get_tensor_bytes_per_component', 'get_tensor_components_per_element', 'get_tensor_dtype', 'get_tensor_format', 'get_tensor_format_desc', 'get_tensor_location', 'get_tensor_mode', 'get_tensor_name', 'get_tensor_profile_shape', 'get_tensor_profile_values', 'get_tensor_shape', 'get_tensor_vectorized_dim', 'get_weight_streaming_automatic_budget'
- logger.info("device_memory_size: %s", engine.device_memory_size)
- logger.info("engine_capability: %s", engine.engine_capability)
- logger.info("get_engine_stat: %s", engine.get_engine_stat())
- logger.info(
- "get_tensor_dtype for input '%s': %s",
- input_name,
- engine.get_tensor_dtype(input_name),
- )
- logger.info(
- "get_tensor_dtype for output '%s': %s",
- output_name,
- engine.get_tensor_dtype(output_name),
- )
- logger.info(
- "get_tensor_format for input '%s': %s",
- input_name,
- engine.get_tensor_format(input_name),
- )
- logger.info(
- "get_tensor_format for output '%s': %s",
- output_name,
- engine.get_tensor_format(output_name),
- )
-
precision = (
"fp16"
if engine.get_tensor_dtype(input_name) == trt.DataType.HALF
From eff40e964c3377f3414e4d13ba21d3938241f59e Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sat, 31 Jan 2026 22:53:00 -0600
Subject: [PATCH 09/20] Improve get engine info
---
.../tensorrt/utility/get_engine_info.py | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py
index 531cb6ac99..d2db3238bd 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py
@@ -18,7 +18,7 @@
NumberOutput(
"Scale",
output_type="""
- if Input0.scale == null { 0 } else { Input0.scale }
+ if Input0.scale == null { -1 } else { Input0.scale }
""",
),
TextOutput(
@@ -41,20 +41,24 @@ def get_engine_info_node(
) -> tuple[int, str, str, str, str]:
info = engine.info
- scale = info.scale if info.scale is not None else 0
+ scale = info.scale if info.scale is not None else -1
precision = info.precision.upper()
gpu_arch = info.gpu_architecture
trt_version = info.tensorrt_version
if info.has_dynamic_shapes:
- if info.min_shape and info.max_shape:
+ if info.min_shape and info.max_shape and info.opt_shape:
shape_info = (
- f"Dynamic: {info.min_shape[0]}x{info.min_shape[1]} to "
- f"{info.max_shape[0]}x{info.max_shape[1]}"
+ f"Dynamic: "
+ f"Min: {info.min_shape[2]}x{info.min_shape[3]} "
+ f"Opt: {info.opt_shape[2]}x{info.opt_shape[3]} "
+ f"Max: {info.max_shape[2]}x{info.max_shape[3]}"
)
else:
- shape_info = "Dynamic shapes"
+ shape_info = "Dynamic"
+ elif info.opt_shape:
+ shape_info = f"Fixed: {info.opt_shape[2]}x{info.opt_shape[3]}"
else:
- shape_info = "Fixed shapes"
+ shape_info = "Fixed"
return scale, precision, gpu_arch, trt_version, shape_info
From af1e5cf40ee2fb6e9188ecbf8b256da289a628f9 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 18:57:14 -0600
Subject: [PATCH 10/20] Account for min/max engine support when doing tiling
---
.../nodes/impl/upscale/auto_split_tiles.py | 2 +-
backend/src/nodes/impl/upscale/tiler.py | 44 +++++++
.../tensorrt/processing/upscale_image.py | 113 +++++++++++-------
3 files changed, 117 insertions(+), 42 deletions(-)
diff --git a/backend/src/nodes/impl/upscale/auto_split_tiles.py b/backend/src/nodes/impl/upscale/auto_split_tiles.py
index d6e14ff7ee..f063aee602 100644
--- a/backend/src/nodes/impl/upscale/auto_split_tiles.py
+++ b/backend/src/nodes/impl/upscale/auto_split_tiles.py
@@ -6,7 +6,7 @@
from logger import logger
from ...utils.utils import get_h_w_c
-from .tiler import MaxTileSize, NoTiling, Tiler
+from .tiler import BoundedTileSize, MaxTileSize, NoTiling, Tiler
GB_AMT = 1024**3
diff --git a/backend/src/nodes/impl/upscale/tiler.py b/backend/src/nodes/impl/upscale/tiler.py
index 3741b5b1b7..c919249deb 100644
--- a/backend/src/nodes/impl/upscale/tiler.py
+++ b/backend/src/nodes/impl/upscale/tiler.py
@@ -70,3 +70,47 @@ def split(self, tile_size: Size) -> Size:
f"Splits are not supported for exact size ({self.exact_size[0]}x{self.exact_size[1]}px) splitting."
f" This typically means that your machine does not have enough VRAM to run the current model."
)
+
+
+class BoundedTileSize(Tiler):
+ """
+ A tiler that respects min/max dimension constraints and uses exact tile sizes.
+
+ This is useful for inference engines (like TensorRT or some ONNX models) that have
+ specific min/max bounds on input dimensions. The tiler ensures that tile sizes stay
+ within those bounds and uses exact tile sizes (with padding for smaller images) to
+ guarantee the engine receives properly sized inputs.
+ """
+
+ def __init__(
+ self,
+ tile_size: int,
+ min_size: Size | None = None,
+ max_size: Size | None = None,
+ ) -> None:
+ self.min_w, self.min_h = min_size if min_size else (1, 1)
+ self.max_w, self.max_h = max_size if max_size else (2**31, 2**31)
+
+ # Automatically clamp tile size to be within bounds
+ self.tile_size = max(self.min_w, self.min_h, min(tile_size, self.max_w, self.max_h))
+
+ def allow_smaller_tile_size(self) -> bool:
+ # Use exact tile sizes to ensure the engine gets properly sized inputs
+ return False
+
+ def starting_tile_size(self, width: int, height: int, channels: int) -> Size:
+ return self.tile_size, self.tile_size
+
+ def split(self, tile_size: Size) -> Size:
+ w, h = tile_size
+ new_w = max(self.min_w, w // 2)
+ new_h = max(self.min_h, h // 2)
+
+ # Check if we can actually split further
+ if new_w == w and new_h == h:
+ raise ValueError(
+ f"Cannot reduce tile size below the minimum size ({self.min_w}x{self.min_h}). "
+ f"This typically means your machine does not have enough VRAM."
+ )
+
+ return new_w, new_h
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
index 960582a312..9732f79fb1 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
@@ -10,12 +10,13 @@
from nodes.impl.tensorrt.model import TensorRTEngine
from nodes.impl.upscale.auto_split_tiles import (
CUSTOM,
+ MAX_TILE_SIZE,
+ NO_TILING,
TILE_SIZE_256,
TileSize,
- parse_tile_size_input,
)
from nodes.impl.upscale.convenient_upscale import convenient_upscale
-from nodes.impl.upscale.tiler import MaxTileSize
+from nodes.impl.upscale.tiler import BoundedTileSize, NoTiling, Tiler
from nodes.properties.inputs import (
BoolInput,
ImageInput,
@@ -33,18 +34,43 @@
def upscale(
img: np.ndarray,
engine: TensorRTEngine,
- tile_size: TileSize,
+ tiler: Tiler,
gpu_index: int,
) -> np.ndarray:
logger.debug("Upscaling image with TensorRT")
+ return tensorrt_auto_split(img, engine, tiler, gpu_index=gpu_index)
- def estimate():
- # Conservative estimate for TensorRT
- return MaxTileSize(TILE_SIZE_256)
-
- tiler = parse_tile_size_input(tile_size, estimate)
- return tensorrt_auto_split(img, engine, tiler, gpu_index=gpu_index)
+def create_tiler_for_engine(engine: TensorRTEngine, tile_size: TileSize, custom_tile_size: int) -> Tiler:
+ """
+ Create an appropriate tiler based on the engine's constraints and tile size setting.
+ """
+ # Extract min/max size constraints from engine info
+ # Shape tuples are NCHW format: (batch, channels, height, width)
+ info = engine.info
+ min_size = None
+ max_size = None
+
+ if info.min_shape is not None:
+ min_size = (info.min_shape[3], info.min_shape[2]) # (width, height)
+ if info.max_shape is not None:
+ max_size = (info.max_shape[3], info.max_shape[2]) # (width, height)
+
+ if tile_size == NO_TILING:
+ # No tiling - use the whole image (type validation ensures it fits within bounds)
+ return NoTiling()
+ elif tile_size == MAX_TILE_SIZE:
+ # Use the maximum size allowed by the engine
+ if max_size is not None:
+ size = min(max_size[0], max_size[1])
+ else:
+ size = 2**31 # Effectively unlimited
+ return BoundedTileSize(size, min_size=min_size, max_size=max_size)
+ elif tile_size == CUSTOM:
+ return BoundedTileSize(custom_tile_size, min_size=min_size, max_size=max_size)
+ else:
+ # Numeric tile size
+ return BoundedTileSize(int(tile_size), min_size=min_size, max_size=max_size)
if processing_group is not None:
@@ -90,39 +116,41 @@ def estimate():
ImageOutput(
"Image",
image_type="""
- // Until we can take tile size into account, we don't want to error unnecessarily, so i've commented out the checks for now
-
let engine = Input0;
let image = Input1;
- convenientUpscaleTrt(engine, image)
-
- // // Check minimum size constraints
- // let minWidthOk = match engine.minWidth {
- // null => true,
- // _ as w => image.width >= w
- // };
- // let minHeightOk = match engine.minHeight {
- // null => true,
- // _ as h => image.height >= h
- // };
-
- // // Check maximum size constraints
- // let maxWidthOk = match engine.maxWidth {
- // null => true,
- // _ as w => image.width >= w
- // };
- // let maxHeightOk = match engine.maxHeight {
- // null => true,
- // _ as h => image.height <= h
- // };
-
- // if not minWidthOk or not minHeightOk {
- // error("Image is smaller than the minimum size supported by this TensorRT engine.")
- // } else if not maxWidthOk or not maxHeightOk {
- // error("Image is larger than the maximum size supported by this TensorRT engine.")
- // } else {
- // convenientUpscaleTrt(engine, image)
- // }
+ let tileSize = Input2;
+
+ // Only validate image dimensions when "No Tiling" is selected (-1)
+ // Otherwise, the BoundedTileSize tiler handles constraints at runtime
+ let noTiling = tileSize == -1;
+
+ // Check minimum size constraints
+ let minWidthOk = match engine.minWidth {
+ null => true,
+ _ as w => image.width >= w
+ };
+ let minHeightOk = match engine.minHeight {
+ null => true,
+ _ as h => image.height >= h
+ };
+
+ // Check maximum size constraints
+ let maxWidthOk = match engine.maxWidth {
+ null => true,
+ _ as w => image.width <= w
+ };
+ let maxHeightOk = match engine.maxHeight {
+ null => true,
+ _ as h => image.height <= h
+ };
+
+ if noTiling and (not minWidthOk or not minHeightOk) {
+ error("Image is smaller than the minimum size supported by this TensorRT engine. Use tiling or resize the image.")
+ } else if noTiling and (not maxWidthOk or not maxHeightOk) {
+ error("Image is larger than the maximum size supported by this TensorRT engine. Use tiling or resize the image.")
+ } else {
+ convenientUpscaleTrt(engine, image)
+ }
""",
)
],
@@ -149,6 +177,9 @@ def upscale_image_node(
h, w, c = get_h_w_c(img)
logger.debug("Image is %dx%dx%d", h, w, c)
+ # Create the appropriate tiler based on engine constraints and tile size setting
+ tiler = create_tiler_for_engine(engine, tile_size, custom_tile_size)
+
return convenient_upscale(
img,
in_nc,
@@ -156,7 +187,7 @@ def upscale_image_node(
lambda i: upscale(
i,
engine,
- TileSize(custom_tile_size) if tile_size == CUSTOM else tile_size,
+ tiler,
gpu_index,
),
separate_alpha,
From d44fff7873133310a2f91dc9cd773390dc98563a Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 19:09:33 -0600
Subject: [PATCH 11/20] Add values to tilesize for checking against
---
.../nodes/properties/inputs/generic_inputs.py | 34 +++++++++++++++----
.../tensorrt/processing/upscale_image.py | 4 +--
src/common/types/chainner-scope.ts | 2 +-
3 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/backend/src/nodes/properties/inputs/generic_inputs.py b/backend/src/nodes/properties/inputs/generic_inputs.py
index ce68b4dc47..b41ae4d0af 100644
--- a/backend/src/nodes/properties/inputs/generic_inputs.py
+++ b/backend/src/nodes/properties/inputs/generic_inputs.py
@@ -581,15 +581,35 @@ def TileSizeDropdown(
) -> DropDownInput:
options = []
if estimate:
- options.append({"option": "Auto (estimate)", "value": ESTIMATE})
-
- options.append({"option": "Maximum", "value": MAX_TILE_SIZE})
- options.append({"option": "No Tiling", "value": NO_TILING})
+ options.append({
+ "option": "Auto (estimate)",
+ "value": ESTIMATE,
+ "type": f"TileSize {{ value: {int(ESTIMATE)} }}",
+ })
+
+ options.append({
+ "option": "Maximum",
+ "value": MAX_TILE_SIZE,
+ "type": f"TileSize {{ value: {int(MAX_TILE_SIZE)} }}",
+ })
+ options.append({
+ "option": "No Tiling",
+ "value": NO_TILING,
+ "type": f"TileSize {{ value: {int(NO_TILING)} }}",
+ })
for size in [128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096]:
- options.append({"option": str(size), "value": size})
-
- options.append({"option": "Custom", "value": CUSTOM})
+ options.append({
+ "option": str(size),
+ "value": size,
+ "type": f"TileSize {{ value: {size} }}",
+ })
+
+ options.append({
+ "option": "Custom",
+ "value": CUSTOM,
+ "type": f"TileSize {{ value: {int(CUSTOM)} }}",
+ })
return DropDownInput(
input_type="TileSize",
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
index 9732f79fb1..99c27372f6 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
@@ -120,9 +120,9 @@ def create_tiler_for_engine(engine: TensorRTEngine, tile_size: TileSize, custom_
let image = Input1;
let tileSize = Input2;
- // Only validate image dimensions when "No Tiling" is selected (-1)
+ // Only validate image dimensions when "No Tiling" is selected (value == -1)
// Otherwise, the BoundedTileSize tiler handles constraints at runtime
- let noTiling = tileSize == -1;
+ let noTiling = tileSize.value == -1;
// Check minimum size constraints
let minWidthOk = match engine.minWidth {
diff --git a/src/common/types/chainner-scope.ts b/src/common/types/chainner-scope.ts
index cd235611c4..8145175a79 100644
--- a/src/common/types/chainner-scope.ts
+++ b/src/common/types/chainner-scope.ts
@@ -130,7 +130,7 @@ struct ColorSpace { channels: 1 | 3 | 4, supportsAlpha: bool }
struct DdsFormat;
struct DdsMipMaps;
struct RotateInterpolationMode;
-struct TileSize;
+struct TileSize { value: int }
struct AudioStream;
enum FpMode { fp32, fp16 }
From 29803febd028670e9af05c21e49954c477a3f8e6 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 19:16:36 -0600
Subject: [PATCH 12/20] Lint + Format
---
backend/src/nodes/impl/tensorrt/auto_split.py | 2 +-
.../nodes/impl/upscale/auto_split_tiles.py | 2 +-
backend/src/nodes/impl/upscale/tiler.py | 4 +-
.../nodes/properties/inputs/generic_inputs.py | 64 +++++++++++--------
.../tensorrt/io/save_engine.py | 6 +-
.../tensorrt/processing/upscale_image.py | 4 +-
6 files changed, 50 insertions(+), 32 deletions(-)
diff --git a/backend/src/nodes/impl/tensorrt/auto_split.py b/backend/src/nodes/impl/tensorrt/auto_split.py
index 137d6530e0..3a1676f6f6 100644
--- a/backend/src/nodes/impl/tensorrt/auto_split.py
+++ b/backend/src/nodes/impl/tensorrt/auto_split.py
@@ -102,7 +102,7 @@ def upscale(img: np.ndarray, _: object):
# Check for CUDA OOM errors
if (
"out of memory" in error_str
- or "cuda" in error_str and "memory" in error_str
+ or ("cuda" in error_str and "memory" in error_str)
or "allocation" in error_str
):
raise RuntimeError( # noqa: B904
diff --git a/backend/src/nodes/impl/upscale/auto_split_tiles.py b/backend/src/nodes/impl/upscale/auto_split_tiles.py
index f063aee602..d6e14ff7ee 100644
--- a/backend/src/nodes/impl/upscale/auto_split_tiles.py
+++ b/backend/src/nodes/impl/upscale/auto_split_tiles.py
@@ -6,7 +6,7 @@
from logger import logger
from ...utils.utils import get_h_w_c
-from .tiler import BoundedTileSize, MaxTileSize, NoTiling, Tiler
+from .tiler import MaxTileSize, NoTiling, Tiler
GB_AMT = 1024**3
diff --git a/backend/src/nodes/impl/upscale/tiler.py b/backend/src/nodes/impl/upscale/tiler.py
index c919249deb..7f0f6d791c 100644
--- a/backend/src/nodes/impl/upscale/tiler.py
+++ b/backend/src/nodes/impl/upscale/tiler.py
@@ -92,7 +92,9 @@ def __init__(
self.max_w, self.max_h = max_size if max_size else (2**31, 2**31)
# Automatically clamp tile size to be within bounds
- self.tile_size = max(self.min_w, self.min_h, min(tile_size, self.max_w, self.max_h))
+ self.tile_size = max(
+ self.min_w, self.min_h, min(tile_size, self.max_w, self.max_h)
+ )
def allow_smaller_tile_size(self) -> bool:
# Use exact tile sizes to ensure the engine gets properly sized inputs
diff --git a/backend/src/nodes/properties/inputs/generic_inputs.py b/backend/src/nodes/properties/inputs/generic_inputs.py
index b41ae4d0af..7ccfb5550d 100644
--- a/backend/src/nodes/properties/inputs/generic_inputs.py
+++ b/backend/src/nodes/properties/inputs/generic_inputs.py
@@ -581,35 +581,45 @@ def TileSizeDropdown(
) -> DropDownInput:
options = []
if estimate:
- options.append({
- "option": "Auto (estimate)",
- "value": ESTIMATE,
- "type": f"TileSize {{ value: {int(ESTIMATE)} }}",
- })
-
- options.append({
- "option": "Maximum",
- "value": MAX_TILE_SIZE,
- "type": f"TileSize {{ value: {int(MAX_TILE_SIZE)} }}",
- })
- options.append({
- "option": "No Tiling",
- "value": NO_TILING,
- "type": f"TileSize {{ value: {int(NO_TILING)} }}",
- })
+ options.append(
+ {
+ "option": "Auto (estimate)",
+ "value": ESTIMATE,
+ "type": f"TileSize {{ value: {int(ESTIMATE)} }}",
+ }
+ )
+
+ options.append(
+ {
+ "option": "Maximum",
+ "value": MAX_TILE_SIZE,
+ "type": f"TileSize {{ value: {int(MAX_TILE_SIZE)} }}",
+ }
+ )
+ options.append(
+ {
+ "option": "No Tiling",
+ "value": NO_TILING,
+ "type": f"TileSize {{ value: {int(NO_TILING)} }}",
+ }
+ )
for size in [128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096]:
- options.append({
- "option": str(size),
- "value": size,
- "type": f"TileSize {{ value: {size} }}",
- })
-
- options.append({
- "option": "Custom",
- "value": CUSTOM,
- "type": f"TileSize {{ value: {int(CUSTOM)} }}",
- })
+ options.append(
+ {
+ "option": str(size),
+ "value": size,
+ "type": f"TileSize {{ value: {size} }}",
+ }
+ )
+
+ options.append(
+ {
+ "option": "Custom",
+ "value": CUSTOM,
+ "type": f"TileSize {{ value: {int(CUSTOM)} }}",
+ }
+ )
return DropDownInput(
input_type="TileSize",
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py
index fee3600295..60ba1b22e8 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py
@@ -4,7 +4,11 @@
from logger import logger
from nodes.impl.tensorrt.model import TensorRTEngine
-from nodes.properties.inputs import DirectoryInput, RelativePathInput, TensorRTEngineInput
+from nodes.properties.inputs import (
+ DirectoryInput,
+ RelativePathInput,
+ TensorRTEngineInput,
+)
from .. import io_group
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
index 99c27372f6..bcef15d58d 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py
@@ -41,7 +41,9 @@ def upscale(
return tensorrt_auto_split(img, engine, tiler, gpu_index=gpu_index)
-def create_tiler_for_engine(engine: TensorRTEngine, tile_size: TileSize, custom_tile_size: int) -> Tiler:
+def create_tiler_for_engine(
+ engine: TensorRTEngine, tile_size: TileSize, custom_tile_size: int
+) -> Tiler:
"""
Create an appropriate tiler based on the engine's constraints and tile size setting.
"""
From 51c76456ce7177a2e33826a21652be75ce376c21 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 19:30:03 -0600
Subject: [PATCH 13/20] Fixes & remove unneeded settings
---
.../src/nodes/impl/tensorrt/engine_builder.py | 14 ++---
backend/src/nodes/impl/tensorrt/inference.py | 16 ++++--
backend/src/nodes/impl/tensorrt/memory.py | 5 ++
.../packages/chaiNNer_tensorrt/settings.py | 56 +------------------
4 files changed, 23 insertions(+), 68 deletions(-)
diff --git a/backend/src/nodes/impl/tensorrt/engine_builder.py b/backend/src/nodes/impl/tensorrt/engine_builder.py
index 37441ac7a7..c46bac3185 100644
--- a/backend/src/nodes/impl/tensorrt/engine_builder.py
+++ b/backend/src/nodes/impl/tensorrt/engine_builder.py
@@ -3,7 +3,7 @@
from __future__ import annotations
from dataclasses import dataclass
-from typing import Literal
+from typing import Any, Literal
from logger import logger
@@ -37,7 +37,7 @@ def __init__(self):
trt.ILogger.Severity.VERBOSE: logger.debug,
}
- def log(self, severity, msg):
+ def log(self, severity: Any, msg: str) -> None:
log_fn = self.severity_map.get(severity, logger.debug)
log_fn("[TensorRT] %s", msg)
@@ -48,7 +48,7 @@ def get_trt_logger():
# Use a simple logger class
class SimpleLogger(trt.ILogger):
- def log(self, severity, msg):
+ def log(self, severity: Any, msg: str) -> None:
if severity <= trt.ILogger.Severity.WARNING:
logger.warning("[TensorRT] %s", msg)
elif severity <= trt.ILogger.Severity.INFO:
@@ -61,8 +61,8 @@ def log(self, severity, msg):
def parse_onnx_model(
onnx_bytes: bytes,
- network,
- parser,
+ network: Any,
+ parser: Any,
) -> None:
"""Parse an ONNX model into a TensorRT network."""
success = parser.parse(onnx_bytes)
@@ -76,9 +76,9 @@ def parse_onnx_model(
def configure_builder_config(
- builder,
+ builder: Any,
config: BuildConfig,
-) -> None:
+) -> Any:
"""Configure the TensorRT builder with the given settings."""
import tensorrt as trt
diff --git a/backend/src/nodes/impl/tensorrt/inference.py b/backend/src/nodes/impl/tensorrt/inference.py
index 9884c1e17a..25df584683 100644
--- a/backend/src/nodes/impl/tensorrt/inference.py
+++ b/backend/src/nodes/impl/tensorrt/inference.py
@@ -2,6 +2,8 @@
from __future__ import annotations
+import types
+
import numpy as np
from .memory import CudaMemoryManager
@@ -73,7 +75,12 @@ def __enter__(self):
self.load()
return self
- def __exit__(self, exc_type, exc_val, exc_tb):
+ def __exit__(
+ self,
+ exc_type: type[BaseException] | None,
+ exc_val: BaseException | None,
+ exc_tb: types.TracebackType | None,
+ ) -> bool:
self.unload()
return False
@@ -155,10 +162,8 @@ def infer(self, input_array: np.ndarray) -> np.ndarray:
input_buffer.free()
output_buffer.free()
# Remove from tracked buffers
- if input_buffer in self._memory_manager._buffers:
- self._memory_manager._buffers.remove(input_buffer)
- if output_buffer in self._memory_manager._buffers:
- self._memory_manager._buffers.remove(output_buffer)
+ self._memory_manager.remove_buffer(input_buffer)
+ self._memory_manager.remove_buffer(output_buffer)
return output_array.astype(np.float32)
@@ -188,7 +193,6 @@ def get_tensorrt_session(
def clear_session_cache() -> None:
"""Clear the session cache and unload all engines."""
- global _session_cache
for session in _session_cache.values():
session.unload()
_session_cache.clear()
diff --git a/backend/src/nodes/impl/tensorrt/memory.py b/backend/src/nodes/impl/tensorrt/memory.py
index 9c7644dec5..6dbdb8901a 100644
--- a/backend/src/nodes/impl/tensorrt/memory.py
+++ b/backend/src/nodes/impl/tensorrt/memory.py
@@ -119,6 +119,11 @@ def synchronize_stream(self) -> None:
if self._stream is not None:
self._check_cuda_error(cudart.cudaStreamSynchronize(self._stream))
+ def remove_buffer(self, buffer: CudaBuffer) -> None:
+ """Remove a buffer from tracked buffers without freeing it."""
+ if buffer in self._buffers:
+ self._buffers.remove(buffer)
+
def cleanup(self) -> None:
"""Free all allocated resources."""
from cuda.bindings import runtime as cudart
diff --git a/backend/src/packages/chaiNNer_tensorrt/settings.py b/backend/src/packages/chaiNNer_tensorrt/settings.py
index 7495c2ff18..500deaacfd 100644
--- a/backend/src/packages/chaiNNer_tensorrt/settings.py
+++ b/backend/src/packages/chaiNNer_tensorrt/settings.py
@@ -3,7 +3,7 @@
import os
from dataclasses import dataclass
-from api import CacheSetting, DropdownSetting, NodeContext, NumberSetting
+from api import CacheSetting, DropdownSetting, NodeContext
from gpu import nvidia
from logger import logger
@@ -20,41 +20,6 @@
)
)
- should_fp16 = nvidia.is_available and nvidia.all_support_fp16
-
- package.add_setting(
- DropdownSetting(
- label="Default Precision",
- key="default_precision",
- description="Default precision for building new TensorRT engines. FP16 is faster on RTX GPUs.",
- options=[
- {"label": "FP32 (Higher Precision)", "value": "fp32"},
- {"label": "FP16 (Faster on RTX GPUs)", "value": "fp16"},
- ],
- default="fp16" if should_fp16 else "fp32",
- )
- )
-
- package.add_setting(
- NumberSetting(
- label="Workspace Size (GB)",
- key="workspace_size",
- description="Maximum GPU memory to use during engine building. Larger values may allow better optimizations.",
- default=4.0,
- min=1.0,
- max=32.0,
- )
- )
-
- package.add_setting(
- CacheSetting(
- label="Engine Cache",
- key="engine_cache",
- description="Directory to cache built TensorRT engines. Engines are specific to your GPU and TensorRT version.",
- directory="tensorrt_engine_cache",
- )
- )
-
package.add_setting(
CacheSetting(
label="Timing Cache",
@@ -68,28 +33,12 @@
@dataclass(frozen=True)
class TensorRTSettings:
gpu_index: int
- default_precision: str
- workspace_size: float
- engine_cache_path: str | None
timing_cache_path: str | None
-def _get_float(settings, key: str, default: float) -> float:
- """Helper to get float value from settings."""
- raw = settings._SettingsParser__settings.get(key, default)
- if isinstance(raw, (int, float)):
- return float(raw)
- return default
-
-
def get_settings(context: NodeContext) -> TensorRTSettings:
settings = context.settings
- engine_cache_path = settings.get_cache_location("engine_cache")
- if engine_cache_path and not os.path.exists(engine_cache_path):
- os.makedirs(engine_cache_path)
- logger.info("Created TensorRT engine cache at: %s", engine_cache_path)
-
timing_cache_path = settings.get_cache_location("timing_cache")
if timing_cache_path and not os.path.exists(timing_cache_path):
os.makedirs(timing_cache_path)
@@ -97,8 +46,5 @@ def get_settings(context: NodeContext) -> TensorRTSettings:
return TensorRTSettings(
gpu_index=settings.get_int("gpu_index", 0, parse_str=True),
- default_precision=settings.get_str("default_precision", "fp32"),
- workspace_size=_get_float(settings, "workspace_size", 4.0),
- engine_cache_path=engine_cache_path,
timing_cache_path=timing_cache_path,
)
From acd0be6f0805332d35b342ac032e328b5f8567b3 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 20:33:55 -0600
Subject: [PATCH 14/20] ignore certain pyright errors for tensorrt
---
.../src/nodes/impl/tensorrt/engine_builder.py | 14 ++-------
backend/src/nodes/impl/tensorrt/inference.py | 5 ++-
backend/src/nodes/impl/tensorrt/memory.py | 31 +------------------
pyrightconfig.json | 22 ++++++++++++-
4 files changed, 27 insertions(+), 45 deletions(-)
diff --git a/backend/src/nodes/impl/tensorrt/engine_builder.py b/backend/src/nodes/impl/tensorrt/engine_builder.py
index c46bac3185..10b52ae294 100644
--- a/backend/src/nodes/impl/tensorrt/engine_builder.py
+++ b/backend/src/nodes/impl/tensorrt/engine_builder.py
@@ -5,6 +5,9 @@
from dataclasses import dataclass
from typing import Any, Literal
+import tensorrt as trt
+from cuda.bindings import runtime as cudart
+
from logger import logger
from .memory import get_cuda_compute_capability, get_cuda_device_name
@@ -27,8 +30,6 @@ class TrtLogger:
"""Custom TensorRT logger that integrates with chaiNNer's logging."""
def __init__(self):
- import tensorrt as trt
-
self.severity_map = {
trt.ILogger.Severity.INTERNAL_ERROR: logger.error,
trt.ILogger.Severity.ERROR: logger.error,
@@ -44,7 +45,6 @@ def log(self, severity: Any, msg: str) -> None:
def get_trt_logger():
"""Get a TensorRT logger instance."""
- import tensorrt as trt
# Use a simple logger class
class SimpleLogger(trt.ILogger):
@@ -80,8 +80,6 @@ def configure_builder_config(
config: BuildConfig,
) -> Any:
"""Configure the TensorRT builder with the given settings."""
- import tensorrt as trt
-
builder_config = builder.create_builder_config()
# Set workspace size
@@ -117,9 +115,6 @@ def build_engine_from_onnx(
Returns:
A TensorRTEngine instance
"""
- import tensorrt as trt
- from cuda.bindings import runtime as cudart
-
# Set the CUDA device
cudart.cudaSetDevice(gpu_index)
@@ -259,9 +254,6 @@ def load_engine_from_bytes(
Returns the runtime and deserialized engine.
"""
- import tensorrt as trt
- from cuda.bindings import runtime as cudart
-
cudart.cudaSetDevice(gpu_index)
trt_logger = trt.Logger(trt.Logger.WARNING)
diff --git a/backend/src/nodes/impl/tensorrt/inference.py b/backend/src/nodes/impl/tensorrt/inference.py
index 25df584683..5dbfce4380 100644
--- a/backend/src/nodes/impl/tensorrt/inference.py
+++ b/backend/src/nodes/impl/tensorrt/inference.py
@@ -5,6 +5,8 @@
import types
import numpy as np
+import tensorrt as trt
+from cuda.bindings import runtime as cudart
from .memory import CudaMemoryManager
from .model import TensorRTEngine
@@ -36,9 +38,6 @@ def load(self) -> None:
if self._is_loaded:
return
- import tensorrt as trt
- from cuda.bindings import runtime as cudart
-
cudart.cudaSetDevice(self.gpu_index)
trt_logger = trt.Logger(trt.Logger.WARNING)
diff --git a/backend/src/nodes/impl/tensorrt/memory.py b/backend/src/nodes/impl/tensorrt/memory.py
index 6dbdb8901a..4ef082e791 100644
--- a/backend/src/nodes/impl/tensorrt/memory.py
+++ b/backend/src/nodes/impl/tensorrt/memory.py
@@ -4,12 +4,9 @@
from contextlib import contextmanager
from dataclasses import dataclass
-from typing import TYPE_CHECKING
import numpy as np
-
-if TYPE_CHECKING:
- pass
+from cuda.bindings import runtime as cudart
@dataclass
@@ -22,8 +19,6 @@ class CudaBuffer:
def free(self) -> None:
"""Free the device memory."""
- from cuda.bindings import runtime as cudart
-
if self.device_ptr != 0:
cudart.cudaFree(self.device_ptr)
self.device_ptr = 0
@@ -45,16 +40,12 @@ def _check_cuda_error(self, result: tuple) -> None:
"""Check CUDA runtime API result for errors."""
err = result[0]
if err.value != 0:
- from cuda.bindings import runtime as cudart
-
err_name = cudart.cudaGetErrorName(err)[1]
err_string = cudart.cudaGetErrorString(err)[1]
raise RuntimeError(f"CUDA Error {err_name}: {err_string}")
def allocate(self, size: int, dtype: np.dtype = np.float32) -> CudaBuffer:
"""Allocate device memory."""
- from cuda.bindings import runtime as cudart
-
result = cudart.cudaMalloc(size)
self._check_cuda_error(result)
device_ptr = result[1]
@@ -68,8 +59,6 @@ def allocate_like(self, array: np.ndarray) -> CudaBuffer:
def copy_to_device(self, host_array: np.ndarray, device_buffer: CudaBuffer) -> None:
"""Copy data from host to device."""
- from cuda.bindings import runtime as cudart
-
host_ptr = host_array.ctypes.data
self._check_cuda_error(
cudart.cudaMemcpy(
@@ -84,8 +73,6 @@ def copy_to_host(
self, device_buffer: CudaBuffer, host_array: np.ndarray
) -> np.ndarray:
"""Copy data from device to host."""
- from cuda.bindings import runtime as cudart
-
host_ptr = host_array.ctypes.data
self._check_cuda_error(
cudart.cudaMemcpy(
@@ -99,8 +86,6 @@ def copy_to_host(
def create_stream(self) -> int:
"""Create a CUDA stream."""
- from cuda.bindings import runtime as cudart
-
result = cudart.cudaStreamCreate()
self._check_cuda_error(result)
self._stream = result[1]
@@ -108,14 +93,10 @@ def create_stream(self) -> int:
def synchronize(self) -> None:
"""Synchronize the CUDA device."""
- from cuda.bindings import runtime as cudart
-
self._check_cuda_error(cudart.cudaDeviceSynchronize())
def synchronize_stream(self) -> None:
"""Synchronize the CUDA stream."""
- from cuda.bindings import runtime as cudart
-
if self._stream is not None:
self._check_cuda_error(cudart.cudaStreamSynchronize(self._stream))
@@ -126,8 +107,6 @@ def remove_buffer(self, buffer: CudaBuffer) -> None:
def cleanup(self) -> None:
"""Free all allocated resources."""
- from cuda.bindings import runtime as cudart
-
for buffer in self._buffers:
buffer.free()
self._buffers.clear()
@@ -154,12 +133,8 @@ def cuda_memory_context(device_id: int = 0):
def check_cuda_available() -> bool:
"""Check if CUDA is available."""
try:
- from cuda.bindings import runtime as cudart
-
result = cudart.cudaGetDeviceCount()
return result[0].value == 0 and result[1] > 0
- except ImportError:
- return False
except Exception:
return False
@@ -167,8 +142,6 @@ def check_cuda_available() -> bool:
def get_cuda_device_name(device_id: int = 0) -> str:
"""Get the name of a CUDA device."""
try:
- from cuda.bindings import runtime as cudart
-
result = cudart.cudaGetDeviceProperties(device_id)
if result[0].value == 0:
return (
@@ -184,8 +157,6 @@ def get_cuda_device_name(device_id: int = 0) -> str:
def get_cuda_compute_capability(device_id: int = 0) -> tuple[int, int]:
"""Get the compute capability of a CUDA device."""
try:
- from cuda.bindings import runtime as cudart
-
result = cudart.cudaGetDeviceProperties(device_id)
if result[0].value == 0:
return result[1].major, result[1].minor
diff --git a/pyrightconfig.json b/pyrightconfig.json
index 3022853f83..828e6aa7f1 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -15,7 +15,6 @@
"strictDictionaryInference": true,
"strictSetInference": true,
"reportDuplicateImport": "warning",
- "reportImportCycles": "error",
"reportIncompatibleVariableOverride": "error",
"reportIncompatibleMethodOverride": "error",
"reportOverlappingOverload": "error",
@@ -30,4 +29,25 @@
"reportUnusedFunction": "warning",
"reportUnusedImport": "warning",
"reportUnusedVariable": "warning",
+ "enableTypeIgnoreComments": true,
+ "executionEnvironments": [
+ {
+ "root": "backend/src/nodes/impl/tensorrt",
+ "extraPaths": ["backend/src"],
+ "reportMissingImports": "none",
+ "reportImportCycles": "none",
+ "reportGeneralTypeIssues": "none",
+ "reportOptionalMemberAccess": "none",
+ "reportAttributeAccessIssue": "none"
+ },
+ {
+ "root": "backend/src/packages/chaiNNer_tensorrt",
+ "extraPaths": ["backend/src"],
+ "reportMissingImports": "none",
+ "reportImportCycles": "none",
+ "reportGeneralTypeIssues": "none",
+ "reportOptionalMemberAccess": "none",
+ "reportAttributeAccessIssue": "none"
+ }
+ ]
}
From 4efb6738116a58d4e2f489b09c2364a305d221a1 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 20:54:15 -0600
Subject: [PATCH 15/20] Load package even when Nvidia not available, but don't
 create category
---
.../packages/chaiNNer_tensorrt/__init__.py | 58 +++++++++----------
1 file changed, 28 insertions(+), 30 deletions(-)
diff --git a/backend/src/packages/chaiNNer_tensorrt/__init__.py b/backend/src/packages/chaiNNer_tensorrt/__init__.py
index 8aa3f0cd99..23cc408b42 100644
--- a/backend/src/packages/chaiNNer_tensorrt/__init__.py
+++ b/backend/src/packages/chaiNNer_tensorrt/__init__.py
@@ -15,32 +15,33 @@
"Requires an NVIDIA GPU with CUDA support."
)
-# Only define the package if NVIDIA GPUs are available
-if nvidia.is_available and not is_arm_mac:
- package = add_package(
- __file__,
- id="chaiNNer_tensorrt",
- name="TensorRT",
- description=package_description,
- dependencies=[
- Dependency(
- display_name="TensorRT",
- pypi_name="tensorrt",
- version="10.15.1.29",
- size_estimate=int(1.2 * GB),
- auto_update=False,
- ),
- Dependency(
- display_name="CUDA Python",
- pypi_name="cuda-python",
- version="13.1.1",
- size_estimate=20 * MB,
- ),
- ],
- icon="BsNvidia",
- color="#76B900",
- )
+# Always register the package so dependencies are installed in CI
+package = add_package(
+ __file__,
+ id="chaiNNer_tensorrt",
+ name="TensorRT",
+ description=package_description,
+ dependencies=[
+ Dependency(
+ display_name="TensorRT",
+ pypi_name="tensorrt",
+ version="10.15.1.29",
+ size_estimate=int(1.2 * GB),
+ auto_update=False,
+ ),
+ Dependency(
+ display_name="CUDA Python",
+ pypi_name="cuda-python",
+ version="13.1.1",
+ size_estimate=20 * MB,
+ ),
+ ],
+ icon="BsNvidia",
+ color="#76B900",
+)
+# Only add category/nodes if NVIDIA GPU is available and not on ARM Mac
+if nvidia.is_available and not is_arm_mac:
tensorrt_category = package.add_category(
name="TensorRT",
description="Nodes for using NVIDIA TensorRT for optimized GPU inference.",
@@ -48,13 +49,10 @@
color="#76B900",
install_hint=install_hint,
)
-
logger.debug("Loaded package %s", package.name)
else:
- # Create a dummy for imports to not fail
- package = None # type: ignore
tensorrt_category = None # type: ignore
if is_arm_mac:
- logger.debug("TensorRT package not available on ARM Mac")
+ logger.debug("TensorRT package registered but not available on ARM Mac")
else:
- logger.debug("TensorRT package not available (no NVIDIA GPU detected)")
+ logger.debug("TensorRT package registered but no NVIDIA GPU detected")
From d1085754581445c042c8f554b3a078456d21152c Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 21:03:54 -0600
Subject: [PATCH 16/20] specify config
---
.github/workflows/lint-backend.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/lint-backend.yml b/.github/workflows/lint-backend.yml
index a10d7c3b61..0bd542dcdf 100644
--- a/.github/workflows/lint-backend.yml
+++ b/.github/workflows/lint-backend.yml
@@ -61,4 +61,4 @@ jobs:
- name: Start backend to install dependencies
run: python ./backend/src/run.py --close-after-start --install-builtin-packages
- name: pyright
- run: pyright backend/
+ run: pyright backend/ --config ./pyrightconfig.json
From c09867ab9d925f65e855ef332075e595eecb5bb1 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 21:08:22 -0600
Subject: [PATCH 17/20] Revert "specify config"
This reverts commit d1085754581445c042c8f554b3a078456d21152c.
---
.github/workflows/lint-backend.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/lint-backend.yml b/.github/workflows/lint-backend.yml
index 0bd542dcdf..a10d7c3b61 100644
--- a/.github/workflows/lint-backend.yml
+++ b/.github/workflows/lint-backend.yml
@@ -61,4 +61,4 @@ jobs:
- name: Start backend to install dependencies
run: python ./backend/src/run.py --close-after-start --install-builtin-packages
- name: pyright
- run: pyright backend/ --config ./pyrightconfig.json
+ run: pyright backend/
From dd51195f5d4d0d069278dd1a56ac095e49bdcbc5 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 22:02:21 -0600
Subject: [PATCH 18/20] add to ignore instead
---
pyrightconfig.json | 29 ++++++-----------------------
1 file changed, 6 insertions(+), 23 deletions(-)
diff --git a/pyrightconfig.json b/pyrightconfig.json
index 828e6aa7f1..52e72464b5 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -8,13 +8,17 @@
"exclude": [
"**/__pycache__"
],
- "ignore": [],
+ "ignore": [
+ "backend/src/nodes/impl/tensorrt",
+ "backend/src/packages/chaiNNer_tensorrt"
+ ],
"typeCheckingMode": "basic",
"useLibraryCodeForTypes": true,
"strictListInference": true,
"strictDictionaryInference": true,
"strictSetInference": true,
"reportDuplicateImport": "warning",
+ "reportImportCycles": "error",
"reportIncompatibleVariableOverride": "error",
"reportIncompatibleMethodOverride": "error",
"reportOverlappingOverload": "error",
@@ -28,26 +32,5 @@
"reportUnusedClass": "warning",
"reportUnusedFunction": "warning",
"reportUnusedImport": "warning",
- "reportUnusedVariable": "warning",
- "enableTypeIgnoreComments": true,
- "executionEnvironments": [
- {
- "root": "backend/src/nodes/impl/tensorrt",
- "extraPaths": ["backend/src"],
- "reportMissingImports": "none",
- "reportImportCycles": "none",
- "reportGeneralTypeIssues": "none",
- "reportOptionalMemberAccess": "none",
- "reportAttributeAccessIssue": "none"
- },
- {
- "root": "backend/src/packages/chaiNNer_tensorrt",
- "extraPaths": ["backend/src"],
- "reportMissingImports": "none",
- "reportImportCycles": "none",
- "reportGeneralTypeIssues": "none",
- "reportOptionalMemberAccess": "none",
- "reportAttributeAccessIssue": "none"
- }
- ]
+ "reportUnusedVariable": "warning"
}
From c3175a918e8fff606b1cb1e8a8f70f91d2287c13 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 22:10:39 -0600
Subject: [PATCH 19/20] Update README
---
README.md | 28 ++++++++++++++++++----------
1 file changed, 18 insertions(+), 10 deletions(-)
diff --git a/README.md b/README.md
index 01712a2c86..0207ea1fa6 100644
--- a/README.md
+++ b/README.md
@@ -21,9 +21,7 @@ ChaiNNer is also cross-platform, meaning you can run it on Windows, MacOS, and L
For help, suggestions, or just to hang out, you can join the [chaiNNer Discord server](https://discord.gg/pzvAKPKyHM)
-Remember: chaiNNer is still a work in progress and in alpha. While it is slowly getting more to where we want it, it is going to take quite some time to have every possible feature we want to add. If you're knowledgeable in TypeScript, React, or Python, feel free to contribute to this project and help us get closer to that goal.
-
-Note: As of right now, chaiNNer is not under active development. This may resume in the future, but at the moment there is no active dev work aside from community contributions via PRs.
+ChaiNNer is under active development. If you're knowledgeable in TypeScript, React, or Python, feel free to contribute to this project and help us continue to improve it.
## Installation
@@ -31,7 +29,7 @@ Download the latest release from the [Github releases page](https://github.com/c
You don't even need to have Python installed, as chaiNNer will download an isolated integrated Python build on startup. From there, you can install all the other dependencies via the Dependency Manager.
-If you do wish to use your system Python installation still, you can turn the system Python setting on. However, it is much more recommended to use integrated Python. If you do wish to use your system Python, we recommend using Python 3.11, but we try to support 3.10 as well.
+If you do wish to use your system Python installation still, you can turn the system Python setting on. However, it is much more recommended to use integrated Python. If you do wish to use your system Python, Python 3.10 or later is required (3.11+ recommended).
If you'd like to test the latest changes and tweaks, try out our [nightly builds](https://github.com/chaiNNer-org/chaiNNer-nightly)
@@ -45,7 +43,7 @@ While it might seem intimidating at first due to all the possible options, chaiN
-Before you get to this point though, you'll need to install one of the neural network frameworks from the dependency manager. You can access this via the button in the upper-right-hand corner. ChaiNNer offers support for PyTorch (with select model architectures), NCNN, and ONNX. For Nvidia users, PyTorch will be the preferred way to upscale. For AMD users, NCNN will be the preferred way to upscale.
+Before you get to this point though, you'll need to install one of the neural network frameworks from the dependency manager. You can access this via the button in the upper-right-hand corner. ChaiNNer offers support for PyTorch (with select model architectures), NCNN, ONNX, and TensorRT. For Nvidia users, PyTorch or TensorRT will be the preferred way to upscale. For AMD users, NCNN will be the preferred way to upscale (or PyTorch with ROCm on Linux).
All the other Python dependencies are automatically installed, and chaiNNer even carries its own integrated Python support so that you do not have to modify your existing Python configuration.
@@ -82,7 +80,7 @@ You can right-click in the editor viewport to show an inline nodes list to selec
- Windows versions 8.1 and below are also not supported.
-- Apple Silicon Macs should support almost everything. Although, ONNX only supports the CPU Execution Provider, and NCNN sometimes does not work properly.
+- Apple Silicon Macs are supported with PyTorch MPS acceleration. ONNX only supports the CPU Execution Provider, and NCNN may not work properly on some configurations.
- Some NCNN users with non-Nvidia GPUs might get all-black outputs. I am not sure what to do to fix this as it appears to be due to the graphics driver crashing as a result of going out of memory. If this happens to you, try manually setting a tiling amount.
@@ -90,13 +88,19 @@ You can right-click in the editor viewport to show an inline nodes list to selec
## GPU Support
-For PyTorch inference, only Nvidia GPUs are officially supported. If you do not have an Nvidia GPU, you will have to use PyTorch in CPU mode. This is because PyTorch only supports Nvidia's CUDA. MacOS users on Apple Silicon Macs can also take advantage of PyTorch's MPS mode, which should work with chaiNNer.
+**Nvidia GPUs:** Full support via PyTorch (CUDA), ONNX, and TensorRT. TensorRT offers the best performance for supported models.
-If you have an AMD or Intel GPU that supports NCNN however, chaiNNer now supports NCNN inference. You can use any existing NCNN .bin/.param model files (only ESRGAN-related SR models have been tested), or use chaiNNer to convert a PyTorch or ONNX model to NCNN.
+**AMD GPUs:**
+- On Linux, AMD GPUs can use PyTorch via ROCm
+- NCNN is available on all platforms for AMD GPUs
-For NCNN, make sure to select which GPU you want to use in the settings. It might be defaulting to your integrated graphics!
+**Apple Silicon (M1/M2/M3):** PyTorch MPS acceleration is supported.
-For Nvidia GPUs, ONNX is also an option to be used. ONNX will use CPU mode on non-Nvidia GPUs, similar to PyTorch.
+**Intel GPUs:** NCNN inference is supported for Intel GPUs.
+
+**CPU:** All frameworks support CPU-only mode as a fallback.
+
+For NCNN, make sure to select which GPU you want to use in the settings. It might be defaulting to your integrated graphics!
## Model Architecture Support
@@ -123,6 +127,10 @@ As of v0.21.0, chaiNNer uses our new package called [Spandrel](https://github.co
- [u2net](https://github.com/danielgatis/rembg) | [u2net](https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net.onnx), [u2netp](https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2netp.onnx), [u2net_cloth_seg](https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net_cloth_seg.onnx), [u2net_human_seg](https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net_human_seg.onnx), [silueta](https://github.com/danielgatis/rembg/releases/download/v0.0.0/silueta.onnx)
- [isnet](https://github.com/xuebinqin/DIS) | [isnet](https://github.com/danielgatis/rembg/releases/download/v0.0.0/isnet-general-use.onnx)
+### TensorRT
+
+TensorRT provides optimized inference for Nvidia GPUs. Models must be converted to TensorRT engine format for use. This offers the best performance on supported hardware.
+
## Troubleshooting
For troubleshooting information, view the [troubleshooting document](https://github.com/chaiNNer-org/chaiNNer/wiki/06--Troubleshooting).
From 975766966ce2e1f01d9c7ace0b35fbcaf2f92d26 Mon Sep 17 00:00:00 2001
From: Joey Ballentine
Date: Sun, 1 Feb 2026 22:11:58 -0600
Subject: [PATCH 20/20] Remove useless "build info" from Build Engine node
---
.../tensorrt/utility/build_engine.py | 15 +++------------
1 file changed, 3 insertions(+), 12 deletions(-)
diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py
index dde82f38be..391b2f7f33 100644
--- a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py
+++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py
@@ -13,7 +13,7 @@
NumberInput,
OnnxModelInput,
)
-from nodes.properties.outputs import TensorRTEngineOutput, TextOutput
+from nodes.properties.outputs import TensorRTEngineOutput
from ...settings import get_settings
from .. import utility_group
@@ -147,7 +147,6 @@ class ShapeMode(Enum):
],
outputs=[
TensorRTEngineOutput(),
- TextOutput("Build Info"),
],
node_context=True,
)
@@ -165,7 +164,7 @@ def build_engine_node(
static_height: int,
static_width: int,
workspace: float,
- ) -> tuple[TensorRTEngine, str]:
+ ) -> TensorRTEngine:
settings = get_settings(context)
gpu_index = settings.gpu_index
@@ -213,12 +212,4 @@ def build_engine_node(
timing_cache_path=timing_cache_path,
)
- build_info = (
- f"Built {precision.value.upper()} engine for {engine.info.gpu_architecture}"
- )
- if use_dynamic:
- build_info += (
- f" (dynamic: {min_height}x{min_width} to {max_height}x{max_width})"
- )
-
- return engine, build_info
+ return engine