diff --git a/README.md b/README.md index 01712a2c86..0207ea1fa6 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,7 @@ ChaiNNer is also cross-platform, meaning you can run it on Windows, MacOS, and L For help, suggestions, or just to hang out, you can join the [chaiNNer Discord server](https://discord.gg/pzvAKPKyHM) -Remember: chaiNNer is still a work in progress and in alpha. While it is slowly getting more to where we want it, it is going to take quite some time to have every possible feature we want to add. If you're knowledgeable in TypeScript, React, or Python, feel free to contribute to this project and help us get closer to that goal. - -Note: As of right now, chaiNNer is not under active development. This may resume in the future, but at the moment there is no active dev work aside from community contributions via PRs. +ChaiNNer is under active development. If you're knowledgeable in TypeScript, React, or Python, feel free to contribute to this project and help us continue to improve it. ## Installation @@ -31,7 +29,7 @@ Download the latest release from the [Github releases page](https://github.com/c You don't even need to have Python installed, as chaiNNer will download an isolated integrated Python build on startup. From there, you can install all the other dependencies via the Dependency Manager. -If you do wish to use your system Python installation still, you can turn the system Python setting on. However, it is much more recommended to use integrated Python. If you do wish to use your system Python, we recommend using Python 3.11, but we try to support 3.10 as well. +If you still wish to use your system Python installation, you can turn the system Python setting on. However, using integrated Python is strongly recommended. If you do use your system Python, Python 3.10 or later is required (3.11+ recommended). 
If you'd like to test the latest changes and tweaks, try out our [nightly builds](https://github.com/chaiNNer-org/chaiNNer-nightly) @@ -45,7 +43,7 @@ While it might seem intimidating at first due to all the possible options, chaiN

-Before you get to this point though, you'll need to install one of the neural network frameworks from the dependency manager. You can access this via the button in the upper-right-hand corner. ChaiNNer offers support for PyTorch (with select model architectures), NCNN, and ONNX. For Nvidia users, PyTorch will be the preferred way to upscale. For AMD users, NCNN will be the preferred way to upscale. +Before you get to this point though, you'll need to install one of the neural network frameworks from the dependency manager. You can access this via the button in the upper-right-hand corner. ChaiNNer offers support for PyTorch (with select model architectures), NCNN, ONNX, and TensorRT. For Nvidia users, PyTorch or TensorRT will be the preferred way to upscale. For AMD users, NCNN will be the preferred way to upscale (or PyTorch with ROCm on Linux). All the other Python dependencies are automatically installed, and chaiNNer even carries its own integrated Python support so that you do not have to modify your existing Python configuration. @@ -82,7 +80,7 @@ You can right-click in the editor viewport to show an inline nodes list to selec - Windows versions 8.1 and below are also not supported. -- Apple Silicon Macs should support almost everything. Although, ONNX only supports the CPU Execution Provider, and NCNN sometimes does not work properly. +- Apple Silicon Macs are supported with PyTorch MPS acceleration. ONNX only supports the CPU Execution Provider, and NCNN may not work properly on some configurations. - Some NCNN users with non-Nvidia GPUs might get all-black outputs. I am not sure what to do to fix this as it appears to be due to the graphics driver crashing as a result of going out of memory. If this happens to you, try manually setting a tiling amount. @@ -90,13 +88,19 @@ You can right-click in the editor viewport to show an inline nodes list to selec ## GPU Support -For PyTorch inference, only Nvidia GPUs are officially supported. 
If you do not have an Nvidia GPU, you will have to use PyTorch in CPU mode. This is because PyTorch only supports Nvidia's CUDA. MacOS users on Apple Silicon Macs can also take advantage of PyTorch's MPS mode, which should work with chaiNNer. +**Nvidia GPUs:** Full support via PyTorch (CUDA), ONNX, and TensorRT. TensorRT offers the best performance for supported models. -If you have an AMD or Intel GPU that supports NCNN however, chaiNNer now supports NCNN inference. You can use any existing NCNN .bin/.param model files (only ESRGAN-related SR models have been tested), or use chaiNNer to convert a PyTorch or ONNX model to NCNN. +**AMD GPUs:** +- On Linux, AMD GPUs can use PyTorch via ROCm +- NCNN is available on all platforms for AMD GPUs -For NCNN, make sure to select which GPU you want to use in the settings. It might be defaulting to your integrated graphics! +**Apple Silicon (M1/M2/M3):** PyTorch MPS acceleration is supported. -For Nvidia GPUs, ONNX is also an option to be used. ONNX will use CPU mode on non-Nvidia GPUs, similar to PyTorch. +**Intel GPUs:** NCNN inference is supported for Intel GPUs. + +**CPU:** All frameworks support CPU-only mode as a fallback. + +For NCNN, make sure to select which GPU you want to use in the settings. It might be defaulting to your integrated graphics! 
## Model Architecture Support @@ -123,6 +127,10 @@ As of v0.21.0, chaiNNer uses our new package called [Spandrel](https://github.co - [u2net](https://github.com/danielgatis/rembg) | [u2net](https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net.onnx), [u2netp](https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2netp.onnx), [u2net_cloth_seg](https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net_cloth_seg.onnx), [u2net_human_seg](https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net_human_seg.onnx), [silueta](https://github.com/danielgatis/rembg/releases/download/v0.0.0/silueta.onnx) - [isnet](https://github.com/xuebinqin/DIS) | [isnet](https://github.com/danielgatis/rembg/releases/download/v0.0.0/isnet-general-use.onnx) +### TensorRT + +TensorRT provides optimized inference for Nvidia GPUs. Models must be converted to the TensorRT engine format before use; engines are built for a specific GPU architecture and may not work on a different one. TensorRT offers the best performance on supported hardware. + ## Troubleshooting For troubleshooting information, view the [troubleshooting document](https://github.com/chaiNNer-org/chaiNNer/wiki/06--Troubleshooting). 
diff --git a/backend/src/nodes/impl/tensorrt/__init__.py b/backend/src/nodes/impl/tensorrt/__init__.py new file mode 100644 index 0000000000..a957ac846a --- /dev/null +++ b/backend/src/nodes/impl/tensorrt/__init__.py @@ -0,0 +1,37 @@ +"""TensorRT implementation utilities.""" + +from .auto_split import tensorrt_auto_split +from .engine_builder import BuildConfig, build_engine_from_onnx +from .inference import ( + TensorRTSession, + clear_session_cache, + get_tensorrt_session, + run_inference, +) +from .memory import ( + CudaBuffer, + CudaMemoryManager, + check_cuda_available, + cuda_memory_context, + get_cuda_compute_capability, + get_cuda_device_name, +) +from .model import TensorRTEngine, TensorRTEngineInfo + +__all__ = [ + "BuildConfig", + "CudaBuffer", + "CudaMemoryManager", + "TensorRTEngine", + "TensorRTEngineInfo", + "TensorRTSession", + "build_engine_from_onnx", + "check_cuda_available", + "clear_session_cache", + "cuda_memory_context", + "get_cuda_compute_capability", + "get_cuda_device_name", + "get_tensorrt_session", + "run_inference", + "tensorrt_auto_split", +] diff --git a/backend/src/nodes/impl/tensorrt/auto_split.py b/backend/src/nodes/impl/tensorrt/auto_split.py new file mode 100644 index 0000000000..3a1676f6f6 --- /dev/null +++ b/backend/src/nodes/impl/tensorrt/auto_split.py @@ -0,0 +1,118 @@ +"""Auto-tiling support for TensorRT inference.""" + +from __future__ import annotations + +import gc + +import numpy as np + +from ..upscale.auto_split import Tiler, auto_split +from .inference import get_tensorrt_session +from .model import TensorRTEngine + + +def _into_batched_form(img: np.ndarray) -> np.ndarray: + """Convert image to NCHW batched format.""" + shape_size = len(img.shape) + if shape_size == 3: + # (H, W, C) -> (1, C, H, W) + return img.transpose((2, 0, 1))[np.newaxis, :] + elif shape_size == 2: + # (H, W) -> (1, 1, H, W) + return img[np.newaxis, np.newaxis, :, :] + else: + raise ValueError("Unsupported input tensor shape") + + +def 
_into_standard_image_form(img: np.ndarray) -> np.ndarray: + """Convert NCHW output back to HWC format.""" + shape_size = len(img.shape) + if shape_size == 4: + # (1, C, H, W) -> (H, W, C) + return img.squeeze(0).transpose(1, 2, 0) + elif shape_size == 3: + # (C, H, W) -> (H, W, C) + return img.transpose(1, 2, 0) + elif shape_size == 2: + # (H, W) + return img + else: + raise ValueError("Unsupported output tensor shape") + + +def _flip_r_b_channels(img: np.ndarray) -> np.ndarray: + """Flip R and B channels (RGB <-> BGR conversion).""" + shape_size = len(img.shape) + if shape_size != 3: + return img + if img.shape[2] == 3: + # (H, W, C) RGB -> BGR - use ascontiguousarray to avoid stride issues + return np.ascontiguousarray(np.flip(img, 2)) + elif img.shape[2] == 4: + # (H, W, C) RGBA -> BGRA + return np.dstack((img[:, :, 2], img[:, :, 1], img[:, :, 0], img[:, :, 3])) + return img + + +def tensorrt_auto_split( + img: np.ndarray, + engine: TensorRTEngine, + tiler: Tiler, + gpu_index: int = 0, +) -> np.ndarray: + """ + Run TensorRT inference with automatic tiling for large images. 
+ + Args: + img: Input image in HWC format (float32, 0-1 range) + engine: TensorRT engine + tiler: Tiler configuration for splitting + gpu_index: GPU device index + + Returns: + Upscaled image in HWC format + """ + session = get_tensorrt_session(engine, gpu_index) + is_fp16 = engine.precision == "fp16" + + def upscale(img: np.ndarray, _: object): + try: + # Convert to appropriate precision + lr_img = img.astype(np.float16) if is_fp16 else img.astype(np.float32) + + # Convert RGB to BGR (most models expect BGR) + lr_img = _flip_r_b_channels(lr_img) + + # Convert to NCHW batched format + lr_img = _into_batched_form(lr_img) + + # Run inference + output = session.infer(lr_img) + + # Convert back to HWC format + output = _into_standard_image_form(output) + + # Convert BGR back to RGB + output = _flip_r_b_channels(output) + + return output.astype(np.float32) + + except Exception as e: + error_str = str(e).lower() + # Check for CUDA OOM errors + if ( + "out of memory" in error_str + or ("cuda" in error_str and "memory" in error_str) + or "allocation" in error_str + ): + raise RuntimeError( # noqa: B904 + "A VRAM out-of-memory error has occurred. Please try using a smaller tile size." 
+ ) + else: + # Re-raise the exception if not an OOM error + raise + + try: + return auto_split(img, upscale, tiler) + finally: + gc.collect() diff --git a/backend/src/nodes/impl/tensorrt/engine_builder.py b/backend/src/nodes/impl/tensorrt/engine_builder.py new file mode 100644 index 0000000000..10b52ae294 --- /dev/null +++ b/backend/src/nodes/impl/tensorrt/engine_builder.py @@ -0,0 +1,266 @@ +"""TensorRT engine building utilities.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Literal + +import tensorrt as trt +from cuda.bindings import runtime as cudart + +from logger import logger + +from .memory import get_cuda_compute_capability, get_cuda_device_name +from .model import TensorRTEngine, TensorRTEngineInfo + + +@dataclass +class BuildConfig: + """Configuration for TensorRT engine building.""" + + precision: Literal["fp32", "fp16"] + workspace_size_gb: float + min_shape: tuple[int, int] # (height, width) + opt_shape: tuple[int, int] # (height, width) + max_shape: tuple[int, int] # (height, width) + use_dynamic_shapes: bool + + +class TrtLogger: + """Custom TensorRT logger that integrates with chaiNNer's logging.""" + + def __init__(self): + self.severity_map = { + trt.ILogger.Severity.INTERNAL_ERROR: logger.error, + trt.ILogger.Severity.ERROR: logger.error, + trt.ILogger.Severity.WARNING: logger.warning, + trt.ILogger.Severity.INFO: logger.info, + trt.ILogger.Severity.VERBOSE: logger.debug, + } + + def log(self, severity: Any, msg: str) -> None: + log_fn = self.severity_map.get(severity, logger.debug) + log_fn("[TensorRT] %s", msg) + + +def get_trt_logger(): + """Get a TensorRT logger instance.""" + + # Use a simple logger class + class SimpleLogger(trt.ILogger): + def log(self, severity: Any, msg: str) -> None: + if severity <= trt.ILogger.Severity.WARNING: + logger.warning("[TensorRT] %s", msg) + elif severity <= trt.ILogger.Severity.INFO: + logger.info("[TensorRT] %s", msg) + else: + 
logger.debug("[TensorRT] %s", msg) + + return SimpleLogger() + + +def parse_onnx_model( + onnx_bytes: bytes, + network: Any, + parser: Any, +) -> None: + """Parse an ONNX model into a TensorRT network.""" + success = parser.parse(onnx_bytes) + if not success: + error_msgs = [] + for i in range(parser.num_errors): + error = parser.get_error(i) + error_msgs.append(f" {error.desc()}") + errors_str = "\n".join(error_msgs) + raise RuntimeError(f"Failed to parse ONNX model:\n{errors_str}") + + +def configure_builder_config( + builder: Any, + config: BuildConfig, +) -> Any: + """Configure the TensorRT builder with the given settings.""" + builder_config = builder.create_builder_config() + + # Set workspace size + workspace_bytes = int(config.workspace_size_gb * (1024**3)) + builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes) + + # Set precision + if config.precision == "fp16": + if builder.platform_has_fast_fp16: + builder_config.set_flag(trt.BuilderFlag.FP16) + logger.info("FP16 mode enabled") + else: + logger.warning("FP16 not supported on this platform, falling back to FP32") + + return builder_config + + +def build_engine_from_onnx( + onnx_bytes: bytes, + config: BuildConfig, + gpu_index: int = 0, + timing_cache_path: str | None = None, +) -> TensorRTEngine: + """ + Build a TensorRT engine from an ONNX model. 
+ + Args: + onnx_bytes: The ONNX model as bytes + config: Build configuration + gpu_index: GPU device index to use + timing_cache_path: Optional path to timing cache for faster rebuilds + + Returns: + A TensorRTEngine instance + """ + # Set the CUDA device + cudart.cudaSetDevice(gpu_index) + + trt_logger = trt.Logger(trt.Logger.WARNING) + + # Create builder and network + builder = trt.Builder(trt_logger) + network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(network_flags) + parser = trt.OnnxParser(network, trt_logger) + + # Parse ONNX model + logger.info("Parsing ONNX model...") + parse_onnx_model(onnx_bytes, network, parser) + + # Get input/output info + input_tensor = network.get_input(0) + output_tensor = network.get_output(0) + + input_shape = input_tensor.shape + output_shape = output_tensor.shape + + # Detect channels (assuming NCHW format) + input_channels = input_shape[1] if len(input_shape) >= 4 else 3 + output_channels = output_shape[1] if len(output_shape) >= 4 else 3 + + # Calculate scale from input/output dimensions + scale = None + if len(input_shape) >= 4 and len(output_shape) >= 4: + in_h, in_w = input_shape[2], input_shape[3] + out_h, out_w = output_shape[2], output_shape[3] + + if in_h > 0 and in_w > 0 and out_h > 0 and out_w > 0: + scale_h = out_h // in_h if out_h % in_h == 0 else None + scale_w = out_w // in_w if out_w % in_w == 0 else None + if scale_h == scale_w and scale_h is not None: + scale = scale_h + + # Configure builder + builder_config = builder.create_builder_config() + workspace_bytes = int(config.workspace_size_gb * (1024**3)) + builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes) + + # Set precision + if config.precision == "fp16": + if builder.platform_has_fast_fp16: + builder_config.set_flag(trt.BuilderFlag.FP16) + logger.info("FP16 mode enabled") + else: + logger.warning("FP16 not supported on this platform, using FP32") + + # Configure 
dynamic shapes if needed + has_dynamic = any(d == -1 for d in input_shape) + if has_dynamic or config.use_dynamic_shapes: + logger.info("Configuring dynamic shapes...") + profile = builder.create_optimization_profile() + + min_h, min_w = config.min_shape + opt_h, opt_w = config.opt_shape + max_h, max_w = config.max_shape + + # Set the optimization profile for the input tensor + min_dims = (1, input_channels, min_h, min_w) + opt_dims = (1, input_channels, opt_h, opt_w) + max_dims = (1, input_channels, max_h, max_w) + + profile.set_shape(input_tensor.name, min_dims, opt_dims, max_dims) + builder_config.add_optimization_profile(profile) + + # Load timing cache if available + if timing_cache_path: + try: + with open(timing_cache_path, "rb") as f: + cache_data = f.read() + timing_cache = builder_config.create_timing_cache(cache_data) + builder_config.set_timing_cache(timing_cache, ignore_mismatch=False) + logger.info("Loaded timing cache from %s", timing_cache_path) + except FileNotFoundError: + logger.debug("No timing cache found at %s", timing_cache_path) + except Exception as e: + logger.warning("Failed to load timing cache: %s", e) + + # Build the engine + logger.info("Building TensorRT engine (this may take a while)...") + serialized_engine = builder.build_serialized_network(network, builder_config) + + if serialized_engine is None: + raise RuntimeError("Failed to build TensorRT engine") + + # Save timing cache + if timing_cache_path: + try: + timing_cache = builder_config.get_timing_cache() + if timing_cache: + cache_data = timing_cache.serialize() + with open(timing_cache_path, "wb") as f: + f.write(cache_data) + logger.info("Saved timing cache to %s", timing_cache_path) + except Exception as e: + logger.warning("Failed to save timing cache: %s", e) + + # Get GPU info + major, minor = get_cuda_compute_capability(gpu_index) + gpu_arch = f"sm_{major}{minor}" + gpu_name = get_cuda_device_name(gpu_index) + logger.info("Built engine for %s (%s)", gpu_name, gpu_arch) + 
+ # Create engine info + info = TensorRTEngineInfo( + precision=config.precision, + input_channels=input_channels, + output_channels=output_channels, + scale=scale, + gpu_architecture=gpu_arch, + tensorrt_version=trt.__version__, + has_dynamic_shapes=has_dynamic or config.use_dynamic_shapes, + min_shape=(config.min_shape[1], config.min_shape[0]) + if config.use_dynamic_shapes + else None, + opt_shape=(config.opt_shape[1], config.opt_shape[0]) + if config.use_dynamic_shapes + else None, + max_shape=(config.max_shape[1], config.max_shape[0]) + if config.use_dynamic_shapes + else None, + ) + + return TensorRTEngine(bytes(serialized_engine), info) + + +def load_engine_from_bytes( + engine_bytes: bytes, + gpu_index: int = 0, +) -> tuple[object, object]: + """ + Load a TensorRT engine from bytes. + + Returns the runtime and deserialized engine. + """ + cudart.cudaSetDevice(gpu_index) + + trt_logger = trt.Logger(trt.Logger.WARNING) + runtime = trt.Runtime(trt_logger) + + engine = runtime.deserialize_cuda_engine(engine_bytes) + if engine is None: + raise RuntimeError("Failed to deserialize TensorRT engine") + + return runtime, engine diff --git a/backend/src/nodes/impl/tensorrt/inference.py b/backend/src/nodes/impl/tensorrt/inference.py new file mode 100644 index 0000000000..5dbfce4380 --- /dev/null +++ b/backend/src/nodes/impl/tensorrt/inference.py @@ -0,0 +1,237 @@ +"""TensorRT inference execution utilities.""" + +from __future__ import annotations + +import types + +import numpy as np +import tensorrt as trt +from cuda.bindings import runtime as cudart + +from .memory import CudaMemoryManager +from .model import TensorRTEngine + + +class TensorRTSession: + """ + A session for running TensorRT inference. + + Manages the execution context and memory buffers. 
+ """ + + def __init__( + self, + engine: TensorRTEngine, + gpu_index: int = 0, + ): + self.engine = engine + self.gpu_index = gpu_index + self._trt_engine = None + self._runtime = None + self._context = None + self._memory_manager: CudaMemoryManager | None = None + self._stream = None + self._is_loaded = False + + def load(self) -> None: + """Load the engine and create execution context.""" + if self._is_loaded: + return + + cudart.cudaSetDevice(self.gpu_index) + + trt_logger = trt.Logger(trt.Logger.WARNING) + self._runtime = trt.Runtime(trt_logger) + self._trt_engine = self._runtime.deserialize_cuda_engine(self.engine.bytes) + + if self._trt_engine is None: + raise RuntimeError("Failed to deserialize TensorRT engine") + + self._context = self._trt_engine.create_execution_context() + if self._context is None: + raise RuntimeError("Failed to create execution context") + + self._memory_manager = CudaMemoryManager(self.gpu_index) + self._stream = self._memory_manager.create_stream() + self._is_loaded = True + + def unload(self) -> None: + """Unload the engine and free resources.""" + if not self._is_loaded: + return + + if self._memory_manager: + self._memory_manager.cleanup() + self._memory_manager = None + + self._context = None + self._trt_engine = None + self._runtime = None + self._stream = None + self._is_loaded = False + + def __enter__(self): + self.load() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: types.TracebackType | None, + ) -> bool: + self.unload() + return False + + @property + def is_loaded(self) -> bool: + return self._is_loaded + + def _get_binding_info(self) -> tuple[str, str, tuple, tuple]: + """Get input and output binding information.""" + input_name = self._trt_engine.get_tensor_name(0) + output_name = self._trt_engine.get_tensor_name(1) + + input_shape = self._trt_engine.get_tensor_shape(input_name) + output_shape = self._trt_engine.get_tensor_shape(output_name) + 
+ return input_name, output_name, tuple(input_shape), tuple(output_shape) + + def infer(self, input_array: np.ndarray) -> np.ndarray: + """ + Run inference on an input array. + + Args: + input_array: Input image in NCHW format (batch, channels, height, width) + + Returns: + Output array in NCHW format + """ + if not self._is_loaded: + raise RuntimeError("Session not loaded. Call load() first.") + + input_name, output_name, _, _ = self._get_binding_info() + + # Get actual input shape + batch, channels, height, width = input_array.shape + + # Set input shape for dynamic inputs + if self.engine.has_dynamic_shapes: + self._context.set_input_shape(input_name, (batch, channels, height, width)) + + # Get output shape (may depend on input shape for dynamic models) + output_shape = self._context.get_tensor_shape(output_name) + + # Determine precision + is_fp16 = self.engine.precision == "fp16" + dtype = np.float16 if is_fp16 else np.float32 + + # Ensure input is contiguous and correct dtype + input_array = np.ascontiguousarray(input_array.astype(dtype)) + + # Allocate output array + output_array = np.zeros(output_shape, dtype=dtype) + + # Allocate device memory + assert self._memory_manager is not None + input_buffer = self._memory_manager.allocate_like(input_array) + output_buffer = self._memory_manager.allocate_like(output_array) + + try: + # Copy input to device + self._memory_manager.copy_to_device(input_array, input_buffer) + + # Set tensor addresses + self._context.set_tensor_address(input_name, input_buffer.device_ptr) + self._context.set_tensor_address(output_name, output_buffer.device_ptr) + + # Execute inference + success = self._context.execute_async_v3(self._stream) + if not success: + raise RuntimeError("TensorRT inference execution failed") + + # Synchronize stream + self._memory_manager.synchronize_stream() + + # Copy output to host + self._memory_manager.copy_to_host(output_buffer, output_array) + + finally: + # Free temporary buffers + input_buffer.free() + 
output_buffer.free() + # Remove from tracked buffers + self._memory_manager.remove_buffer(input_buffer) + self._memory_manager.remove_buffer(output_buffer) + + return output_array.astype(np.float32) + + +# Session cache to avoid repeatedly loading engines +_session_cache: dict[int, TensorRTSession] = {} + + +def get_tensorrt_session( + engine: TensorRTEngine, + gpu_index: int = 0, +) -> TensorRTSession: + """ + Get a TensorRT session, using caching to avoid reloading. + + The session is cached based on the engine bytes identity. + """ + cache_key = id(engine.bytes) + + if cache_key not in _session_cache: + session = TensorRTSession(engine, gpu_index) + session.load() + _session_cache[cache_key] = session + + return _session_cache[cache_key] + + +def clear_session_cache() -> None: + """Clear the session cache and unload all engines.""" + for session in _session_cache.values(): + session.unload() + _session_cache.clear() + + +def run_inference( + img: np.ndarray, + engine: TensorRTEngine, + gpu_index: int = 0, +) -> np.ndarray: + """ + Run TensorRT inference on an image. 
+ + Args: + img: Input image in HWC format (height, width, channels), float32, 0-1 range + engine: TensorRT engine + gpu_index: GPU device index + + Returns: + Output image in HWC format + """ + session = get_tensorrt_session(engine, gpu_index) + + # Convert HWC to NCHW + if img.ndim == 2: + # Grayscale + img = img[np.newaxis, np.newaxis, :, :] + elif img.ndim == 3: + # HWC -> NCHW + img = img.transpose(2, 0, 1)[np.newaxis, :, :, :] + else: + raise ValueError(f"Unexpected image dimensions: {img.ndim}") + + # Run inference + output = session.infer(img) + + # Convert NCHW back to HWC + if output.ndim == 4: + output = output.squeeze(0).transpose(1, 2, 0) + elif output.ndim == 3: + output = output.transpose(1, 2, 0) + + return output diff --git a/backend/src/nodes/impl/tensorrt/memory.py b/backend/src/nodes/impl/tensorrt/memory.py new file mode 100644 index 0000000000..4ef082e791 --- /dev/null +++ b/backend/src/nodes/impl/tensorrt/memory.py @@ -0,0 +1,165 @@ +"""CUDA memory management utilities for TensorRT inference.""" + +from __future__ import annotations + +from contextlib import contextmanager +from dataclasses import dataclass + +import numpy as np +from cuda.bindings import runtime as cudart + + +@dataclass +class CudaBuffer: + """Represents a CUDA device memory buffer.""" + + device_ptr: int + size: int + dtype: np.dtype + + def free(self) -> None: + """Free the device memory.""" + if self.device_ptr != 0: + cudart.cudaFree(self.device_ptr) + self.device_ptr = 0 + + +class CudaMemoryManager: + """ + Manages CUDA memory allocation for TensorRT inference. + + Uses RAII pattern with context managers to ensure proper cleanup. 
+ """ + + def __init__(self, device_id: int = 0): + self.device_id = device_id + self._buffers: list[CudaBuffer] = [] + self._stream: int | None = None + + def _check_cuda_error(self, result: tuple) -> None: + """Check CUDA runtime API result for errors.""" + err = result[0] + if err.value != 0: + err_name = cudart.cudaGetErrorName(err)[1] + err_string = cudart.cudaGetErrorString(err)[1] + raise RuntimeError(f"CUDA Error {err_name}: {err_string}") + + def allocate(self, size: int, dtype: np.dtype = np.float32) -> CudaBuffer: + """Allocate device memory.""" + result = cudart.cudaMalloc(size) + self._check_cuda_error(result) + device_ptr = result[1] + buffer = CudaBuffer(device_ptr, size, dtype) + self._buffers.append(buffer) + return buffer + + def allocate_like(self, array: np.ndarray) -> CudaBuffer: + """Allocate device memory matching the size and dtype of an array.""" + return self.allocate(array.nbytes, array.dtype) + + def copy_to_device(self, host_array: np.ndarray, device_buffer: CudaBuffer) -> None: + """Copy data from host to device.""" + host_ptr = host_array.ctypes.data + self._check_cuda_error( + cudart.cudaMemcpy( + device_buffer.device_ptr, + host_ptr, + host_array.nbytes, + cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + + def copy_to_host( + self, device_buffer: CudaBuffer, host_array: np.ndarray + ) -> np.ndarray: + """Copy data from device to host.""" + host_ptr = host_array.ctypes.data + self._check_cuda_error( + cudart.cudaMemcpy( + host_ptr, + device_buffer.device_ptr, + host_array.nbytes, + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + ) + ) + return host_array + + def create_stream(self) -> int: + """Create a CUDA stream.""" + result = cudart.cudaStreamCreate() + self._check_cuda_error(result) + self._stream = result[1] + return self._stream + + def synchronize(self) -> None: + """Synchronize the CUDA device.""" + self._check_cuda_error(cudart.cudaDeviceSynchronize()) + + def synchronize_stream(self) -> None: + """Synchronize the 
CUDA stream.""" + if self._stream is not None: + self._check_cuda_error(cudart.cudaStreamSynchronize(self._stream)) + + def remove_buffer(self, buffer: CudaBuffer) -> None: + """Remove a buffer from tracked buffers without freeing it.""" + if buffer in self._buffers: + self._buffers.remove(buffer) + + def cleanup(self) -> None: + """Free all allocated resources.""" + for buffer in self._buffers: + buffer.free() + self._buffers.clear() + + if self._stream is not None: + cudart.cudaStreamDestroy(self._stream) + self._stream = None + + +@contextmanager +def cuda_memory_context(device_id: int = 0): + """ + Context manager for CUDA memory operations. + + Ensures all allocated memory is freed when the context exits. + """ + manager = CudaMemoryManager(device_id) + try: + yield manager + finally: + manager.cleanup() + + +def check_cuda_available() -> bool: + """Check if CUDA is available.""" + try: + result = cudart.cudaGetDeviceCount() + return result[0].value == 0 and result[1] > 0 + except Exception: + return False + + +def get_cuda_device_name(device_id: int = 0) -> str: + """Get the name of a CUDA device.""" + try: + result = cudart.cudaGetDeviceProperties(device_id) + if result[0].value == 0: + return ( + result[1].name.decode("utf-8") + if isinstance(result[1].name, bytes) + else result[1].name + ) + return "Unknown" + except Exception: + return "Unknown" + + +def get_cuda_compute_capability(device_id: int = 0) -> tuple[int, int]: + """Get the compute capability of a CUDA device.""" + try: + result = cudart.cudaGetDeviceProperties(device_id) + if result[0].value == 0: + return result[1].major, result[1].minor + return (0, 0) + except Exception: + return (0, 0) diff --git a/backend/src/nodes/impl/tensorrt/model.py b/backend/src/nodes/impl/tensorrt/model.py new file mode 100644 index 0000000000..1a9b5ca3dd --- /dev/null +++ b/backend/src/nodes/impl/tensorrt/model.py @@ -0,0 +1,56 @@ +# This class defines an interface. 
+# It is important that it does not contain types that depend on TensorRT. +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal + + +@dataclass +class TensorRTEngineInfo: + """Metadata about a TensorRT engine.""" + + precision: Literal["fp32", "fp16", "int8"] + input_channels: int + output_channels: int + scale: int | None + gpu_architecture: str + tensorrt_version: str + has_dynamic_shapes: bool + min_shape: tuple[int, int, int, int] | None # (-, channels, width, height) + opt_shape: tuple[int, int, int, int] | None # (-, channels, width, height) + max_shape: tuple[int, int, int, int] | None # (-, channels, width, height) + + +class TensorRTEngine: + """ + A wrapper class for TensorRT engine data. + + This class holds the serialized engine bytes and metadata without + requiring TensorRT to be imported. The actual engine loading and + inference are handled by separate utility functions. + """ + + def __init__(self, engine_bytes: bytes, info: TensorRTEngineInfo): + self.bytes: bytes = engine_bytes + self.info: TensorRTEngineInfo = info + + @property + def precision(self) -> Literal["fp32", "fp16", "int8"]: + return self.info.precision + + @property + def scale(self) -> int | None: + return self.info.scale + + @property + def input_channels(self) -> int: + return self.info.input_channels + + @property + def output_channels(self) -> int: + return self.info.output_channels + + @property + def has_dynamic_shapes(self) -> bool: + return self.info.has_dynamic_shapes diff --git a/backend/src/nodes/impl/upscale/tiler.py b/backend/src/nodes/impl/upscale/tiler.py index 3741b5b1b7..7f0f6d791c 100644 --- a/backend/src/nodes/impl/upscale/tiler.py +++ b/backend/src/nodes/impl/upscale/tiler.py @@ -70,3 +70,49 @@ def split(self, tile_size: Size) -> Size: f"Splits are not supported for exact size ({self.exact_size[0]}x{self.exact_size[1]}px) splitting." 
f" This typically means that your machine does not have enough VRAM." ) + + +class BoundedTileSize(Tiler): + """ + A tiler that respects min/max dimension constraints and uses exact tile sizes. + + This is useful for inference engines (like TensorRT or some ONNX models) that have + specific min/max bounds on input dimensions. The tiler ensures that tile sizes stay + within those bounds and uses exact tile sizes (with padding for smaller images) to + guarantee the engine receives properly sized inputs. + """ + + def __init__( + self, + tile_size: int, + min_size: Size | None = None, + max_size: Size | None = None, + ) -> None: + self.min_w, self.min_h = min_size if min_size else (1, 1) + self.max_w, self.max_h = max_size if max_size else (2**31, 2**31) + + # Clamp each axis on its own so non-square engine bounds are respected + self.tile_w = max(self.min_w, min(tile_size, self.max_w)) + self.tile_h = max(self.min_h, min(tile_size, self.max_h)) + self.tile_size = max(self.tile_w, self.tile_h) # legacy square size + + def allow_smaller_tile_size(self) -> bool: + # Use exact tile sizes to ensure the engine gets properly sized inputs + return False + + def starting_tile_size(self, width: int, height: int, channels: int) -> Size: + return self.tile_w, self.tile_h + + def split(self, tile_size: Size) -> Size: + w, h = tile_size + new_w = max(self.min_w, w // 2) + new_h = max(self.min_h, h // 2) + + # Check if we can actually split further + if new_w == w and new_h == h: + raise ValueError( + f"Cannot reduce tile size below the minimum size ({self.min_w}x{self.min_h}). " + f"This typically means your machine does not have enough VRAM." 
+ ) + + return new_w, new_h diff --git a/backend/src/nodes/properties/inputs/__init__.py b/backend/src/nodes/properties/inputs/__init__.py index ee136edcbd..ac63e30d46 100644 --- a/backend/src/nodes/properties/inputs/__init__.py +++ b/backend/src/nodes/properties/inputs/__init__.py @@ -16,3 +16,7 @@ from .pytorch_inputs import * except Exception: pass +try: + from .tensorrt_inputs import * +except Exception: + pass diff --git a/backend/src/nodes/properties/inputs/file_inputs.py b/backend/src/nodes/properties/inputs/file_inputs.py index e5df528a3b..4cd77d50c9 100644 --- a/backend/src/nodes/properties/inputs/file_inputs.py +++ b/backend/src/nodes/properties/inputs/file_inputs.py @@ -19,6 +19,7 @@ | Literal["param"] | Literal["pt"] | Literal["pth"] + | Literal["tensorrt"] | Literal["video"] ) @@ -188,6 +189,19 @@ def OnnxFileInput(primary_input: bool = False) -> FileInput: ) +def TensorRTFileInput(primary_input: bool = False) -> FileInput: + """Input for submitting a local TensorRT engine file""" + return FileInput( + label="TensorRT Engine File", + file_kind="tensorrt", + filetypes=[".engine", ".trt", ".plan"], + primary_input=primary_input, + ).with_docs( + "TensorRT engine files (.engine, .trt, .plan) are pre-built for a specific GPU architecture.", + "Engines built on one GPU may not work on a different GPU architecture.", + ) + + _INVALID_PATH_CHARS = re.compile(r'[<>:"|?*\x00-\x1F]') diff --git a/backend/src/nodes/properties/inputs/generic_inputs.py b/backend/src/nodes/properties/inputs/generic_inputs.py index ce68b4dc47..7ccfb5550d 100644 --- a/backend/src/nodes/properties/inputs/generic_inputs.py +++ b/backend/src/nodes/properties/inputs/generic_inputs.py @@ -581,15 +581,45 @@ def TileSizeDropdown( ) -> DropDownInput: options = [] if estimate: - options.append({"option": "Auto (estimate)", "value": ESTIMATE}) + options.append( + { + "option": "Auto (estimate)", + "value": ESTIMATE, + "type": f"TileSize {{ value: {int(ESTIMATE)} }}", + } + ) - 
options.append({"option": "Maximum", "value": MAX_TILE_SIZE}) - options.append({"option": "No Tiling", "value": NO_TILING}) + options.append( + { + "option": "Maximum", + "value": MAX_TILE_SIZE, + "type": f"TileSize {{ value: {int(MAX_TILE_SIZE)} }}", + } + ) + options.append( + { + "option": "No Tiling", + "value": NO_TILING, + "type": f"TileSize {{ value: {int(NO_TILING)} }}", + } + ) for size in [128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096]: - options.append({"option": str(size), "value": size}) + options.append( + { + "option": str(size), + "value": size, + "type": f"TileSize {{ value: {size} }}", + } + ) - options.append({"option": "Custom", "value": CUSTOM}) + options.append( + { + "option": "Custom", + "value": CUSTOM, + "type": f"TileSize {{ value: {int(CUSTOM)} }}", + } + ) return DropDownInput( input_type="TileSize", diff --git a/backend/src/nodes/properties/inputs/tensorrt_inputs.py b/backend/src/nodes/properties/inputs/tensorrt_inputs.py new file mode 100644 index 0000000000..11cc440614 --- /dev/null +++ b/backend/src/nodes/properties/inputs/tensorrt_inputs.py @@ -0,0 +1,57 @@ +import navi +from api import BaseInput + +from ...impl.tensorrt.model import TensorRTEngine +from .generic_inputs import DropDownInput + + +class TensorRTEngineInput(BaseInput): + """Input for TensorRT engine.""" + + def __init__( + self, + label: str = "TensorRT Engine", + input_type: navi.ExpressionJson = "TensorRTEngine", + ): + super().__init__(input_type, label) + self.associated_type = TensorRTEngine + + +def TensorRTPrecisionDropdown() -> DropDownInput: + """Dropdown for selecting TensorRT precision mode.""" + return DropDownInput( + input_type="TrtPrecision", + label="Precision", + options=[ + { + "option": "FP32", + "value": "fp32", + "type": "TrtPrecision::fp32", + }, + { + "option": "FP16", + "value": "fp16", + "type": "TrtPrecision::fp16", + }, + ], + ) + + +def TensorRTShapeModeDropdown() -> DropDownInput: + """Dropdown for selecting TensorRT shape 
mode.""" + return DropDownInput( + input_type="TrtShapeMode", + label="Shape Mode", + options=[ + { + "option": "Fixed", + "value": "fixed", + "type": "TrtShapeMode::fixed", + }, + { + "option": "Dynamic", + "value": "dynamic", + "type": "TrtShapeMode::dynamic", + }, + ], + ) diff --git a/backend/src/nodes/properties/outputs/__init__.py b/backend/src/nodes/properties/outputs/__init__.py index ea124b26db..006d46f166 100644 --- a/backend/src/nodes/properties/outputs/__init__.py +++ b/backend/src/nodes/properties/outputs/__init__.py @@ -14,3 +14,7 @@ from .pytorch_outputs import * except Exception: pass +try: + from .tensorrt_outputs import * +except Exception: + pass diff --git a/backend/src/nodes/properties/outputs/tensorrt_outputs.py b/backend/src/nodes/properties/outputs/tensorrt_outputs.py new file mode 100644 index 0000000000..37270ca12d --- /dev/null +++ b/backend/src/nodes/properties/outputs/tensorrt_outputs.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import navi +from api import BaseOutput, OutputKind +from nodes.utils.format import format_channel_numbers + +from ...impl.tensorrt.model import TensorRTEngine + + +class TensorRTEngineOutput(BaseOutput): + """Output for TensorRT engine.""" + + def __init__( + self, + model_type: navi.ExpressionJson = "TensorRTEngine", + label: str = "TensorRT Engine", + kind: OutputKind = "generic", + ): + super().__init__(model_type, label, kind=kind, associated_type=TensorRTEngine) + + def get_broadcast_data(self, value: TensorRTEngine): + i = value.info + + tags: list[str] = [] + + # Add channel info + tags.append(format_channel_numbers(i.input_channels, i.output_channels)) + + # Add scale info + if i.scale is not None: + tags.append(f"{i.scale}x") + + # Add precision + tags.append(i.precision.upper()) + + # Add architecture + # tags.append(i.gpu_architecture) + + # Add shape info + if i.has_dynamic_shapes: + tags.append("Dynamic") + if i.min_shape is not None: + tags.append("Min: " + str(i.min_shape[2]) + "x" 
+ str(i.min_shape[3])) + if i.opt_shape is not None: + tags.append("Opt: " + str(i.opt_shape[2]) + "x" + str(i.opt_shape[3])) + if i.max_shape is not None: + tags.append("Max: " + str(i.max_shape[2]) + "x" + str(i.max_shape[3])) + else: + tags.append("Fixed") + if i.opt_shape is not None: + tags.append(str(i.opt_shape[2]) + "x" + str(i.opt_shape[3])) + + return {"tags": tags} + + def get_broadcast_type(self, value: TensorRTEngine): + fields: dict[str, navi.ExpressionJson] = { + "precision": navi.literal(value.info.precision), + } + + i = value.info + if i.scale is not None: + fields["scale"] = i.scale + if i.input_channels: + fields["inputChannels"] = i.input_channels + if i.output_channels: + fields["outputChannels"] = i.output_channels + + # Add shape constraints (min/max height/width) + # Shape tuples are NCHW format: (batch, channels, height, width) + if i.min_shape is not None: + fields["minHeight"] = i.min_shape[2] + fields["minWidth"] = i.min_shape[3] + else: + fields["minHeight"] = "null" + fields["minWidth"] = "null" + + if i.max_shape is not None: + fields["maxHeight"] = i.max_shape[2] + fields["maxWidth"] = i.max_shape[3] + else: + fields["maxHeight"] = "null" + fields["maxWidth"] = "null" + + if i.opt_shape is not None: + fields["optHeight"] = i.opt_shape[2] + fields["optWidth"] = i.opt_shape[3] + else: + fields["optHeight"] = "null" + fields["optWidth"] = "null" + + return navi.named("TensorRTEngine", fields) diff --git a/backend/src/packages/chaiNNer_tensorrt/__init__.py b/backend/src/packages/chaiNNer_tensorrt/__init__.py new file mode 100644 index 0000000000..23cc408b42 --- /dev/null +++ b/backend/src/packages/chaiNNer_tensorrt/__init__.py @@ -0,0 +1,58 @@ +from api import GB, MB, Dependency, add_package +from gpu import nvidia +from logger import logger +from system import is_arm_mac + +package_description = ( + "TensorRT provides native NVIDIA TensorRT support for optimized GPU inference. 
" + "It offers significant performance improvements over ONNX Runtime by using " + "NVIDIA's proprietary optimization engine. Engines are built specifically for " + "your GPU architecture for maximum performance." +) + +install_hint = ( + "TensorRT uses NVIDIA's TensorRT engine for GPU-accelerated inference. " + "Requires an NVIDIA GPU with CUDA support." +) + +# Always register the package so dependencies are installed in CI +package = add_package( + __file__, + id="chaiNNer_tensorrt", + name="TensorRT", + description=package_description, + dependencies=[ + Dependency( + display_name="TensorRT", + pypi_name="tensorrt", + version="10.15.1.29", + size_estimate=int(1.2 * GB), + auto_update=False, + ), + Dependency( + display_name="CUDA Python", + pypi_name="cuda-python", + version="13.1.1", + size_estimate=20 * MB, + ), + ], + icon="BsNvidia", + color="#76B900", +) + +# Only add category/nodes if NVIDIA GPU is available and not on ARM Mac +if nvidia.is_available and not is_arm_mac: + tensorrt_category = package.add_category( + name="TensorRT", + description="Nodes for using NVIDIA TensorRT for optimized GPU inference.", + icon="BsNvidia", + color="#76B900", + install_hint=install_hint, + ) + logger.debug("Loaded package %s", package.name) +else: + tensorrt_category = None # type: ignore + if is_arm_mac: + logger.debug("TensorRT package registered but not available on ARM Mac") + else: + logger.debug("TensorRT package registered but no NVIDIA GPU detected") diff --git a/backend/src/packages/chaiNNer_tensorrt/settings.py b/backend/src/packages/chaiNNer_tensorrt/settings.py new file mode 100644 index 0000000000..500deaacfd --- /dev/null +++ b/backend/src/packages/chaiNNer_tensorrt/settings.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import os +from dataclasses import dataclass + +from api import CacheSetting, DropdownSetting, NodeContext +from gpu import nvidia +from logger import logger + +from . 
import package + +if package is not None: + package.add_setting( + DropdownSetting( + label="GPU", + key="gpu_index", + description="Which GPU to use for TensorRT. This is only relevant if you have multiple GPUs.", + options=[{"label": d.name, "value": str(d.index)} for d in nvidia.devices], + default="0", + ) + ) + + package.add_setting( + CacheSetting( + label="Timing Cache", + key="timing_cache", + description="Directory for TensorRT timing cache. Speeds up engine building for similar models.", + directory="tensorrt_timing_cache", + ) + ) + + +@dataclass(frozen=True) +class TensorRTSettings: + gpu_index: int + timing_cache_path: str | None + + +def get_settings(context: NodeContext) -> TensorRTSettings: + settings = context.settings + + timing_cache_path = settings.get_cache_location("timing_cache") + if timing_cache_path and not os.path.exists(timing_cache_path): + os.makedirs(timing_cache_path, exist_ok=True) # tolerate a concurrent creator + logger.info("Created TensorRT timing cache at: %s", timing_cache_path) + + return TensorRTSettings( + gpu_index=settings.get_int("gpu_index", 0, parse_str=True), + timing_cache_path=timing_cache_path, + ) diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/__init__.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/__init__.py new file mode 100644 index 0000000000..17d4021625 --- /dev/null +++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/__init__.py @@ -0,0 +1,10 @@ +from .. 
import tensorrt_category + +if tensorrt_category is not None: + io_group = tensorrt_category.add_node_group("Input & Output") + processing_group = tensorrt_category.add_node_group("Processing") + utility_group = tensorrt_category.add_node_group("Utility") +else: + io_group = None # type: ignore + processing_group = None # type: ignore + utility_group = None # type: ignore diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py new file mode 100644 index 0000000000..4fcff48e99 --- /dev/null +++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/load_engine.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import os +from pathlib import Path + +import tensorrt as trt + +from api import NodeContext +from logger import logger +from nodes.impl.tensorrt.memory import get_cuda_compute_capability +from nodes.impl.tensorrt.model import TensorRTEngine, TensorRTEngineInfo +from nodes.properties.inputs import TensorRTFileInput +from nodes.properties.outputs import ( + DirectoryOutput, + FileNameOutput, + TensorRTEngineOutput, +) +from nodes.utils.utils import split_file_path + +from ...settings import get_settings +from .. import io_group + +if io_group is not None: + + @io_group.register( + schema_id="chainner:tensorrt:load_engine", + name="Load Engine", + description=( + "Load a TensorRT engine file (.engine, .trt, .plan). " + "TensorRT engines are built for a specific GPU architecture and may not work " + "on different GPUs. The node will warn you if there's a potential compatibility issue." 
+ ), + icon="BsNvidia", + inputs=[TensorRTFileInput(primary_input=True)], + outputs=[ + TensorRTEngineOutput(kind="tagged").suggest(), + DirectoryOutput("Directory", of_input=0).with_id(2), + FileNameOutput("Name", of_input=0).with_id(1), + ], + side_effects=True, + node_context=True, + ) + def load_engine_node( + context: NodeContext, path: Path + ) -> tuple[TensorRTEngine, Path, str]: + assert os.path.exists(path), f"Engine file at location {path} does not exist" + assert os.path.isfile(path), f"Path {path} is not a file" + + logger.debug("Reading TensorRT engine from path: %s", path) + + settings = get_settings(context) + gpu_index = settings.gpu_index + + # Check GPU compatibility + current_major, current_minor = get_cuda_compute_capability(gpu_index) + current_arch = f"sm_{current_major}{current_minor}" + + # Load engine bytes + with open(path, "rb") as f: + engine_bytes = f.read() + + # Deserialize to get engine info + trt_logger = trt.Logger(trt.Logger.WARNING) # type: ignore + runtime = trt.Runtime(trt_logger) # type: ignore + engine = runtime.deserialize_cuda_engine(engine_bytes) + + if engine is None: + raise RuntimeError( + f"Failed to deserialize TensorRT engine from {path}. " + "This may happen if the engine was built with an incompatible TensorRT version " + "or for a different GPU architecture." 
+ ) + + # Extract engine information + input_name = engine.get_tensor_name(0) + output_name = engine.get_tensor_name(1) + + input_shape = engine.get_tensor_shape(input_name) + logger.info("Input shape: %s", input_shape) + output_shape = engine.get_tensor_shape(output_name) + logger.info("Output shape: %s", output_shape) + + # Detect channels + input_channels = input_shape[1] if len(input_shape) >= 4 else 3 + output_channels = output_shape[1] if len(output_shape) >= 4 else 3 + + # Calculate scale + scale = None + if len(input_shape) >= 4 and len(output_shape) >= 4: + in_h, in_w = input_shape[2], input_shape[3] + out_h, out_w = output_shape[2], output_shape[3] + if in_h > 0 and in_w > 0 and out_h > 0 and out_w > 0: + scale_h = out_h // in_h if out_h % in_h == 0 else None + scale_w = out_w // in_w if out_w % in_w == 0 else None + if scale_h == scale_w and scale_h is not None: + scale = scale_h + + # Check for dynamic shapes + has_dynamic = any(d == -1 for d in input_shape) + + # Detect precision from the engine + precision = ( + "fp16" + if engine.get_tensor_dtype(input_name) == trt.DataType.HALF + else "fp32" + ) + + tensor_profile_name = input_name + min_shape, opt_shape, max_shape = engine.get_tensor_profile_shape( + tensor_profile_name, 0 + ) + + # Create info + info = TensorRTEngineInfo( + precision=precision, + input_channels=input_channels, + output_channels=output_channels, + scale=scale, + gpu_architecture=current_arch, + tensorrt_version=trt.__version__, + has_dynamic_shapes=has_dynamic, + min_shape=min_shape, + opt_shape=opt_shape, + max_shape=max_shape, + ) + + dirname, basename, _ = split_file_path(path) + return TensorRTEngine(engine_bytes, info), dirname, basename diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py new file mode 100644 index 0000000000..60ba1b22e8 --- /dev/null +++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/io/save_engine.py @@ -0,0 
+1,40 @@ +from __future__ import annotations + +from pathlib import Path + +from logger import logger +from nodes.impl.tensorrt.model import TensorRTEngine +from nodes.properties.inputs import ( + DirectoryInput, + RelativePathInput, + TensorRTEngineInput, +) + +from .. import io_group + +if io_group is not None: + + @io_group.register( + schema_id="chainner:tensorrt:save_engine", + name="Save Engine", + description=( + "Save a TensorRT engine to a file (.engine). " + "The saved engine is specific to your GPU architecture and TensorRT version." + ), + icon="MdSave", + inputs=[ + TensorRTEngineInput(), + DirectoryInput(must_exist=False), + RelativePathInput("Engine Name"), + ], + outputs=[], + side_effects=True, + ) + def save_engine_node( + engine: TensorRTEngine, directory: Path, engine_name: str + ) -> None: + full_path = (directory / f"{engine_name}.engine").resolve() + logger.debug("Writing TensorRT engine to path: %s", full_path) + full_path.parent.mkdir(parents=True, exist_ok=True) + with open(full_path, "wb") as f: + f.write(engine.bytes) diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py new file mode 100644 index 0000000000..bcef15d58d --- /dev/null +++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/processing/upscale_image.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import numpy as np + +from api import NodeContext +from logger import logger +from nodes.groups import Condition, if_enum_group, if_group +from nodes.impl.tensorrt.auto_split import tensorrt_auto_split +from nodes.impl.tensorrt.inference import clear_session_cache +from nodes.impl.tensorrt.model import TensorRTEngine +from nodes.impl.upscale.auto_split_tiles import ( + CUSTOM, + MAX_TILE_SIZE, + NO_TILING, + TILE_SIZE_256, + TileSize, +) +from nodes.impl.upscale.convenient_upscale import convenient_upscale +from nodes.impl.upscale.tiler import BoundedTileSize, 
NoTiling, Tiler +from nodes.properties.inputs import ( + BoolInput, + ImageInput, + NumberInput, + TensorRTEngineInput, + TileSizeDropdown, +) +from nodes.properties.outputs import ImageOutput +from nodes.utils.utils import get_h_w_c + +from ...settings import get_settings +from .. import processing_group + + +def upscale( + img: np.ndarray, + engine: TensorRTEngine, + tiler: Tiler, + gpu_index: int, +) -> np.ndarray: + logger.debug("Upscaling image with TensorRT") + return tensorrt_auto_split(img, engine, tiler, gpu_index=gpu_index) + + +def create_tiler_for_engine( + engine: TensorRTEngine, tile_size: TileSize, custom_tile_size: int +) -> Tiler: + """ + Create an appropriate tiler based on the engine's constraints and tile size setting. + """ + # Extract min/max size constraints from engine info + # Shape tuples are NCHW format: (batch, channels, height, width) + info = engine.info + min_size = None + max_size = None + + if info.min_shape is not None: + min_size = (info.min_shape[3], info.min_shape[2]) # (width, height) + if info.max_shape is not None: + max_size = (info.max_shape[3], info.max_shape[2]) # (width, height) + + if tile_size == NO_TILING: + # No tiling - use the whole image (type validation ensures it fits within bounds) + return NoTiling() + elif tile_size == MAX_TILE_SIZE: + # Use the maximum size allowed by the engine + if max_size is not None: + size = min(max_size[0], max_size[1]) + else: + size = 2**31 # Effectively unlimited + return BoundedTileSize(size, min_size=min_size, max_size=max_size) + elif tile_size == CUSTOM: + return BoundedTileSize(custom_tile_size, min_size=min_size, max_size=max_size) + else: + # Numeric tile size + return BoundedTileSize(int(tile_size), min_size=min_size, max_size=max_size) + + +if processing_group is not None: + + @processing_group.register( + schema_id="chainner:tensorrt:upscale_image", + description=( + "Upscales an image using a TensorRT engine. 
TensorRT provides optimized GPU inference " + "for maximum performance. If you encounter out-of-memory errors, try using a smaller tile size." + ), + inputs=[ + ImageInput().with_id(1), + TensorRTEngineInput().with_id(0), + TileSizeDropdown(estimate=False, default=TILE_SIZE_256) + .with_id(2) + .with_docs( + "Tiled upscaling is used to allow large images to be upscaled without hitting memory limits.", + "This works by splitting the image into tiles (with overlap), upscaling each tile individually, and seamlessly recombining them.", + "Generally it's recommended to use the largest tile size possible for best performance, but depending on the model and image size, this may not be possible.", + ), + if_enum_group(2, CUSTOM)( + NumberInput( + "Custom Tile Size", + min=1, + max=None, + default=TILE_SIZE_256, + unit="px", + ) + ), + if_group(Condition.type(1, "Image { channels: 4 } "))( + BoolInput("Separate Alpha", default=False).with_docs( + "Upscale alpha separately from color. Enabling this option will cause the alpha of" + " the upscaled image to be less noisy and more accurate to the alpha of the original" + " image, but the image may suffer from dark borders near transparency edges" + " (transition from fully transparent to fully opaque).", + "Whether enabling this option will improve the upscaled image depends on the original" + " image. 
We generally recommend this option for images with smooth transitions between" + " transparent and opaque regions.", + ) + ), + ], + outputs=[ + ImageOutput( + "Image", + image_type=""" + let engine = Input0; + let image = Input1; + let tileSize = Input2; + + // Only validate image dimensions when "No Tiling" is selected (value == -1) + // Otherwise, the BoundedTileSize tiler handles constraints at runtime + let noTiling = tileSize.value == -1; + + // Check minimum size constraints + let minWidthOk = match engine.minWidth { + null => true, + _ as w => image.width >= w + }; + let minHeightOk = match engine.minHeight { + null => true, + _ as h => image.height >= h + }; + + // Check maximum size constraints + let maxWidthOk = match engine.maxWidth { + null => true, + _ as w => image.width <= w + }; + let maxHeightOk = match engine.maxHeight { + null => true, + _ as h => image.height <= h + }; + + if noTiling and (not minWidthOk or not minHeightOk) { + error("Image is smaller than the minimum size supported by this TensorRT engine. Use tiling or resize the image.") + } else if noTiling and (not maxWidthOk or not maxHeightOk) { + error("Image is larger than the maximum size supported by this TensorRT engine. 
Use tiling or resize the image.") + } else { + convenientUpscaleTrt(engine, image) + } + """, + ) + ], + name="Upscale Image", + icon="BsNvidia", + node_context=True, + ) + def upscale_image_node( + context: NodeContext, + img: np.ndarray, + engine: TensorRTEngine, + tile_size: TileSize, + custom_tile_size: int, + separate_alpha: bool, + ) -> np.ndarray: + settings = get_settings(context) + gpu_index = settings.gpu_index + + context.add_cleanup(clear_session_cache, after="chain") + + in_nc = engine.input_channels + out_nc = engine.output_channels + + h, w, c = get_h_w_c(img) + logger.debug("Image is %dx%dx%d", h, w, c) + + # Create the appropriate tiler based on engine constraints and tile size setting + tiler = create_tiler_for_engine(engine, tile_size, custom_tile_size) + + return convenient_upscale( + img, + in_nc, + out_nc, + lambda i: upscale( + i, + engine, + tiler, + gpu_index, + ), + separate_alpha, + ) diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py new file mode 100644 index 0000000000..391b2f7f33 --- /dev/null +++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/build_engine.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +from enum import Enum + +from api import NodeContext +from logger import logger +from nodes.groups import if_enum_group +from nodes.impl.onnx.model import OnnxModel +from nodes.impl.tensorrt.engine_builder import BuildConfig, build_engine_from_onnx +from nodes.impl.tensorrt.model import TensorRTEngine +from nodes.properties.inputs import ( + EnumInput, + NumberInput, + OnnxModelInput, +) +from nodes.properties.outputs import TensorRTEngineOutput + +from ...settings import get_settings +from .. 
import utility_group + + +class Precision(Enum): + FP32 = "fp32" + FP16 = "fp16" + + +PRECISION_LABELS = { + Precision.FP32: "FP32 (Higher Precision)", + Precision.FP16: "FP16 (Faster)", +} + + +class ShapeMode(Enum): + FIXED = "fixed" + DYNAMIC = "dynamic" + + +SHAPE_MODE_LABELS = { + ShapeMode.FIXED: "Fixed (Single Size)", + ShapeMode.DYNAMIC: "Dynamic (Variable Sizes)", +} + + +if utility_group is not None: + + @utility_group.register( + schema_id="chainner:tensorrt:build_engine", + name="Build Engine", + description=[ + "Convert an ONNX model to a TensorRT engine.", + "Building an engine can take several minutes depending on the model size and optimization settings.", + "The built engine is optimized specifically for your GPU and TensorRT version.", + "It is recommended to save the built engine for reuse, as building is slow.", + ], + icon="BsNvidia", + inputs=[ + OnnxModelInput("ONNX Model"), + EnumInput( + Precision, + label="Precision", + default=Precision.FP16, + option_labels=PRECISION_LABELS, + ).with_docs( + "FP16: lower precision but faster and uses less memory, especially on RTX GPUs. FP16 also does not work with certain models.", + "FP32: higher precision but slower. Use especially if FP16 fails.", + ), + EnumInput( + ShapeMode, + label="Shape Mode", + default=ShapeMode.FIXED, + option_labels=SHAPE_MODE_LABELS, + ).with_docs( + "Fixed: Build engine for a single input size. Fastest inference.", + "Dynamic: Build engine for variable input sizes. 
More flexible but slightly slower.", + ), + if_enum_group(2, ShapeMode.DYNAMIC)( + NumberInput( + "Min Height", + default=64, + min=16, + max=4096, + unit="px", + ).with_docs("Minimum input height for dynamic shapes."), + NumberInput( + "Min Width", + default=64, + min=16, + max=4096, + unit="px", + ).with_docs("Minimum input width for dynamic shapes."), + NumberInput( + "Optimal Height", + default=512, + min=16, + max=4096, + unit="px", + ).with_docs("Optimal input height (used for optimization)."), + NumberInput( + "Optimal Width", + default=512, + min=16, + max=4096, + unit="px", + ).with_docs("Optimal input width (used for optimization)."), + NumberInput( + "Max Height", + default=2048, + min=16, + max=8192, + unit="px", + ).with_docs("Maximum input height for dynamic shapes."), + NumberInput( + "Max Width", + default=2048, + min=16, + max=8192, + unit="px", + ).with_docs("Maximum input width for dynamic shapes."), + ), + if_enum_group(2, ShapeMode.FIXED)( + NumberInput( + "Height", + default=256, + min=16, + max=8192, + unit="px", + ).with_docs("Fixed input height."), + NumberInput( + "Width", + default=256, + min=16, + max=8192, + unit="px", + ).with_docs("Fixed input width."), + ), + NumberInput( + "Workspace (GB)", + default=4.0, + min=1.0, + max=32.0, + precision=1, + step=0.5, + ).with_docs( + "Maximum GPU memory for building. 
Larger values may allow better optimizations.", + hint=True, + ), + ], + outputs=[ + TensorRTEngineOutput(), + ], + node_context=True, + ) + def build_engine_node( + context: NodeContext, + onnx_model: OnnxModel, + precision: Precision, + shape_mode: ShapeMode, + min_height: int, + min_width: int, + opt_height: int, + opt_width: int, + max_height: int, + max_width: int, + static_height: int, + static_width: int, + workspace: float, + ) -> TensorRTEngine: + settings = get_settings(context) + gpu_index = settings.gpu_index + + # Determine timing cache path + timing_cache_path = None + if settings.timing_cache_path: + import hashlib + + # Create a cache key based on the model + model_hash = hashlib.md5(onnx_model.bytes[:1024]).hexdigest()[:8] + timing_cache_path = ( + f"{settings.timing_cache_path}/timing_{model_hash}.cache" + ) + + use_dynamic = shape_mode == ShapeMode.DYNAMIC + + if not use_dynamic: + min_height = static_height + min_width = static_width + opt_height = static_height + opt_width = static_width + max_height = static_height + max_width = static_width + + config = BuildConfig( + precision=precision.value, + workspace_size_gb=workspace, + min_shape=(min_height, min_width), + opt_shape=(opt_height, opt_width), + max_shape=(max_height, max_width), + use_dynamic_shapes=use_dynamic, + ) + + logger.info( + "Building TensorRT engine: precision=%s, dynamic=%s, workspace=%.1fGB", + precision.value, + use_dynamic, + workspace, + ) + + engine = build_engine_from_onnx( + onnx_model.bytes, + config, + gpu_index=gpu_index, + timing_cache_path=timing_cache_path, + ) + + return engine diff --git a/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py new file mode 100644 index 0000000000..d2db3238bd --- /dev/null +++ b/backend/src/packages/chaiNNer_tensorrt/tensorrt/utility/get_engine_info.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +from 
nodes.impl.tensorrt.model import TensorRTEngine +from nodes.properties.inputs import TensorRTEngineInput +from nodes.properties.outputs import NumberOutput, TextOutput + +from .. import utility_group + +if utility_group is not None: + + @utility_group.register( + schema_id="chainner:tensorrt:engine_info", + name="Get Engine Info", + description="Returns information about a TensorRT engine.", + icon="ImInfo", + inputs=[TensorRTEngineInput("TensorRT Engine")], + outputs=[ + NumberOutput( + "Scale", + output_type=""" + if Input0.scale == null { -1 } else { Input0.scale } + """, + ), + TextOutput( + "Precision", + output_type="Input0.precision", + ), + TextOutput( + "GPU Architecture", + ), + TextOutput( + "TensorRT Version", + ), + TextOutput( + "Shape Info", + ), + ], + ) + def get_engine_info_node( + engine: TensorRTEngine, + ) -> tuple[int, str, str, str, str]: + info = engine.info + + scale = info.scale if info.scale is not None else -1 + precision = info.precision # must match the declared Input0.precision type + gpu_arch = info.gpu_architecture + trt_version = info.tensorrt_version + + if info.has_dynamic_shapes: + if info.min_shape and info.max_shape and info.opt_shape: + shape_info = ( + f"Dynamic: " + f"Min: {info.min_shape[2]}x{info.min_shape[3]} " + f"Opt: {info.opt_shape[2]}x{info.opt_shape[3]} " + f"Max: {info.max_shape[2]}x{info.max_shape[3]}" + ) + else: + shape_info = "Dynamic" + elif info.opt_shape: + shape_info = f"Fixed: {info.opt_shape[2]}x{info.opt_shape[3]}" + else: + shape_info = "Fixed" + + return scale, precision, gpu_arch, trt_version, shape_info diff --git a/backend/src/server.py b/backend/src/server.py index 6c6fbd944c..d90489768f 100644 --- a/backend/src/server.py +++ b/backend/src/server.py @@ -634,6 +634,7 @@ async def import_packages( importlib.import_module("packages.chaiNNer_pytorch") importlib.import_module("packages.chaiNNer_ncnn") importlib.import_module("packages.chaiNNer_onnx") + importlib.import_module("packages.chaiNNer_tensorrt") 
importlib.import_module("packages.chaiNNer_external") logger.info("Loading Nodes...") diff --git a/pyrightconfig.json b/pyrightconfig.json index 3022853f83..52e72464b5 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -8,7 +8,10 @@ "exclude": [ "**/__pycache__" ], - "ignore": [], + "ignore": [ + "backend/src/nodes/impl/tensorrt", + "backend/src/packages/chaiNNer_tensorrt" + ], "typeCheckingMode": "basic", "useLibraryCodeForTypes": true, "strictListInference": true, @@ -29,5 +32,5 @@ "reportUnusedClass": "warning", "reportUnusedFunction": "warning", "reportUnusedImport": "warning", - "reportUnusedVariable": "warning", + "reportUnusedVariable": "warning" } diff --git a/src/common/types/chainner-scope.ts b/src/common/types/chainner-scope.ts index acee884469..8145175a79 100644 --- a/src/common/types/chainner-scope.ts +++ b/src/common/types/chainner-scope.ts @@ -92,6 +92,22 @@ let OnnxGenericModel = OnnxModel { subType: "Generic", }; +struct TensorRTEngine { + scale: int(1..), + inputChannels: int(1..), + outputChannels: int(1..), + precision: string, + minHeight: int(1..) | null, + minWidth: int(1..) | null, + optHeight: int(1..) | null, + optWidth: int(1..) | null, + maxHeight: int(1..) | null, + maxWidth: int(1..) 
| null, +} + +enum TrtPrecision { fp32, fp16 } +enum TrtShapeMode { fixed, dynamic } + def pytorchToOnnx(model: PyTorchModel): OnnxModel { OnnxModel { scaleHeight: model.scale, @@ -114,7 +130,7 @@ struct ColorSpace { channels: 1 | 3 | 4, supportsAlpha: bool } struct DdsFormat; struct DdsMipMaps; struct RotateInterpolationMode; -struct TileSize; +struct TileSize { value: int } struct AudioStream; enum FpMode { fp32, fp16 } @@ -148,6 +164,17 @@ def convenientUpscaleOnnx(model: OnnxModel, image: Image) { } } } +def convenientUpscaleTrt(engine: TensorRTEngine, image: Image) { + Image { + width: engine.scale * image.width, + height: engine.scale * image.height, + channels: if engine.inputChannels == engine.outputChannels { + image.channels + } else { + engine.outputChannels + } + } +} struct SplitFilePath { dir: Directory, diff --git a/src/renderer/colors.scss b/src/renderer/colors.scss index f99b79ee40..41f1b98dbe 100644 --- a/src/renderer/colors.scss +++ b/src/renderer/colors.scss @@ -35,6 +35,7 @@ --type-color-torch: #dd6b20; --type-color-onnx: #63b3ed; --type-color-ncnn: #ed64a6; + --type-color-tensorrt: #76b900; } :root[data-theme='dark'] { @@ -373,4 +374,5 @@ --type-color-torch: #fbae2e; --type-color-onnx: #ac9fdc; --type-color-ncnn: #ae5354; + --type-color-tensorrt: #90c41c; } diff --git a/src/renderer/helpers/accentColors.ts b/src/renderer/helpers/accentColors.ts index 38dc236113..337d7d5d73 100644 --- a/src/renderer/helpers/accentColors.ts +++ b/src/renderer/helpers/accentColors.ts @@ -30,6 +30,7 @@ const colorList = () => { { type: resolveName('PyTorchModel'), color: getComputedColor('--type-color-torch') }, { type: resolveName('OnnxModel'), color: getComputedColor('--type-color-onnx') }, { type: resolveName('NcnnNetwork'), color: getComputedColor('--type-color-ncnn') }, + { type: resolveName('TensorRTEngine'), color: getComputedColor('--type-color-tensorrt') }, ]; }; // eslint-disable-next-line @typescript-eslint/no-unused-vars