diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..16a4f68
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,32 @@
+# Curated, pragmatic check set for a modern C++20 library. The goal is to catch real defects
+# (bugprone-*, the static analyzer, performance traps) without the noise that makes a tidy run
+# something people learn to ignore. Not run as -warnings-as-errors in CI; it reports on changed
+# files (see .github/workflows/format.yml).
+Checks: >
+ bugprone-*,
+ performance-*,
+ portability-*,
+ clang-analyzer-*,
+ misc-*,
+ modernize-use-nullptr,
+ modernize-use-override,
+ modernize-use-using,
+ modernize-use-emplace,
+ modernize-make-unique,
+ modernize-loop-convert,
+ readability-braces-around-statements,
+ readability-redundant-member-init,
+ readability-misleading-indentation,
+ -bugprone-easily-swappable-parameters,
+ -bugprone-exception-escape,
+ -bugprone-narrowing-conversions,
+ -misc-no-recursion,
+ -misc-non-private-member-variables-in-classes,
+ -misc-include-cleaner,
+ -performance-avoid-endl,
+ -clang-analyzer-optin.core.EnumCastOutOfRange
+
+# Library headers + sources only; never third-party (stb, gtest, pybind11) or generated headers.
+# `[^/]*` limits the match to files sitting directly in a listed directory (so FetchContent-style
+# `<dep>-src/...` trees do not match), which means src/detail/ has to be named explicitly.
+HeaderFilterRegex: '(include/tensorrt_cpp_api|src(/detail)?)/[^/]*\.(h|hpp)$'
+WarningsAsErrors: ''
+FormatStyle: file
diff --git a/.cmake-format.yaml b/.cmake-format.yaml
new file mode 100644
index 0000000..e95c6ef
--- /dev/null
+++ b/.cmake-format.yaml
@@ -0,0 +1,15 @@
+# cmake-format config (used by the pre-commit cmake-format hook). Tuned to the repo's existing
+# CMake style: a generous line width so the hand-wrapped install()/target_* calls are not
+# reflowed, lowercase commands, and dangling close-parens for multi-line calls.
+format:
+ line_width: 130
+ tab_size: 4
+ separate_ctrl_name_with_space: false
+ separate_fn_name_with_space: false
+ dangle_parens: true
+ command_case: lower
+ keyword_case: upper
+ max_subgroups_hwrap: 4
+ max_pargs_hwrap: 6
+markup:
+ enable_markup: false
diff --git a/.github/actions/setup-trt/action.yml b/.github/actions/setup-trt/action.yml
new file mode 100644
index 0000000..58924cf
--- /dev/null
+++ b/.github/actions/setup-trt/action.yml
@@ -0,0 +1,21 @@
+name: Setup CUDA + TensorRT
+description: >
+ Install a consistent CUDA 12.6 + TensorRT 10.7 (cuda12.6) toolchain from NVIDIA's apt repo.
+ The TensorRT version is PINNED: the unversioned libnvinfer-dev now resolves to TensorRT 11 built
+ for CUDA 13, which is incompatible with the CUDA 12.6 toolkit. No GPU is needed to compile.
+runs:
+ using: composite
+ steps:
+    - shell: bash
+      run: |
+        set -eux
+        # Download outside the checkout so the .deb never appears as an untracked file in the
+        # workspace that later steps configure, build, and package from.
+        keyring="${RUNNER_TEMP:-/tmp}/cuda-keyring.deb"
+        wget -qO "$keyring" \
+          "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb"
+        sudo dpkg -i "$keyring"
+        sudo apt-get update
+        # Every TensorRT package is pinned to the same build (see the action description).
+        TRT=10.7.0.23-1+cuda12.6
+        sudo apt-get install -y --no-install-recommends \
+          cuda-cudart-dev-12-6 cuda-nvcc-12-6 \
+          libnvinfer-dev="$TRT" libnvinfer-headers-dev="$TRT" libnvinfer10="$TRT" \
+          libnvonnxparsers-dev="$TRT" libnvonnxparsers10="$TRT"
+        echo "/usr/local/cuda-12.6/bin" >> "$GITHUB_PATH"
diff --git a/.github/consumer/CMakeLists.txt b/.github/consumer/CMakeLists.txt
new file mode 100644
index 0000000..daf9df2
--- /dev/null
+++ b/.github/consumer/CMakeLists.txt
@@ -0,0 +1,9 @@
+cmake_minimum_required(VERSION 3.22)
+project(trtcpp_consumer LANGUAGES CXX)
+
+# Smoke test that an installed tensorrt_cpp_api is consumable via find_package: configure with
+# -DCMAKE_PREFIX_PATH=<install-prefix>.
+find_package(tensorrt_cpp_api REQUIRED)
+
+add_executable(consumer main.cpp)
+target_link_libraries(consumer PRIVATE tensorrt_cpp_api::tensorrt_cpp_api)
diff --git a/.github/consumer/main.cpp b/.github/consumer/main.cpp
new file mode 100644
index 0000000..55ded44
--- /dev/null
+++ b/.github/consumer/main.cpp
@@ -0,0 +1,8 @@
+#include <tensorrt_cpp_api/all.h>
+
+#include <cstdio>
+
+// Install smoke test: prints the linked library's version string and exits 0 only when the
+// reported major version is 7, so a stale or mismatched install fails the CI step.
+int main() {
+    std::printf("consumer linked trtcpp %s\n", trtcpp::versionString().c_str());
+    return trtcpp::libraryVersion().major == 7 ? 0 : 1;
+}
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..9fc91c1
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,76 @@
+name: CI
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ workflow_dispatch:
+
+concurrency:
+ group: ci-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ # Build + the non-GPU test suite. GitHub-hosted runners have no NVIDIA GPU, so we compile against
+ # the CUDA/TensorRT headers and run only `ctest -LE gpu`. The CUDA + TensorRT toolchain comes from
+ # the pinned composite action (.github/actions/setup-trt). The excluded GPU-labeled integration
+ # tests run via the manual-dispatch `gpu tests` workflow (.github/workflows/gpu-tests.yml).
+ build-cpu:
+ name: build + cpu tests (ubuntu-24.04)
+ runs-on: ubuntu-24.04
+ steps:
+ - uses: actions/checkout@v4
+ - uses: ./.github/actions/setup-trt
+ - name: Configure
+ run: |
+ cmake -S . -B build \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DTRT_CPP_API_BUILD_TESTS=ON \
+ -DTRT_CPP_API_BUILD_PREPROC=ON \
+ -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.6/bin/nvcc
+ - name: Build
+ run: cmake --build build -j2
+ - name: CPU tests (exclude gpu-labeled)
+ run: ctest --test-dir build -LE gpu --output-on-failure
+ - name: Install smoke (find_package consumer)
+ run: |
+ cmake --install build --prefix "$PWD/_install"
+ cmake -S .github/consumer -B build/consumer -DCMAKE_PREFIX_PATH="$PWD/_install"
+ cmake --build build/consumer
+ ./build/consumer/consumer
+
+ # AddressSanitizer + UBSan on the CPU test suite (preproc/CUDA off; the CPU tests don't need it).
+ sanitizers:
+ name: asan + ubsan (cpu tests)
+ runs-on: ubuntu-24.04
+ steps:
+ - uses: actions/checkout@v4
+ - uses: ./.github/actions/setup-trt
+ - name: Configure (sanitized)
+ run: |
+ cmake -S . -B build-san \
+ -DCMAKE_BUILD_TYPE=Debug \
+ -DTRT_CPP_API_BUILD_TESTS=ON \
+ -DTRT_CPP_API_BUILD_PREPROC=OFF \
+ -DCMAKE_CXX_FLAGS="-fsanitize=address,undefined -fno-sanitize-recover=all -fno-omit-frame-pointer"
+ - name: Build CPU tests
+ run: cmake --build build-san -j2 --target trtcpp_core_tests
+ - name: Run CPU tests under sanitizers
+ run: ./build-san/tests/trtcpp_core_tests
+
+ # Python wheel build + import sanity (no GPU needed to import).
+ python-wheel:
+ name: python wheel + import
+ runs-on: ubuntu-24.04
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - uses: ./.github/actions/setup-trt
+ - name: Build + install the wheel
+ run: |
+ python -m pip install --upgrade pip
+ pip install . -v --config-settings=cmake.define.CMAKE_CUDA_COMPILER=/usr/local/cuda-12.6/bin/nvcc
+ - name: Import sanity
+ run: python -c "import trtcpp; print(trtcpp.version_string()); assert trtcpp.library_version()[0] == 7"
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
new file mode 100644
index 0000000..4055103
--- /dev/null
+++ b/.github/workflows/format.yml
@@ -0,0 +1,56 @@
+name: format
+
+on:
+ pull_request:
+ workflow_dispatch:
+
+jobs:
+ clang-format:
+ name: clang-format
+ runs-on: ubuntu-24.04
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - name: Install clang-format (pinned to match .pre-commit-config.yaml)
+ run: pip install "clang-format==18.1.8"
+      - name: Check formatting (library + examples; excludes vendored stb)
+        run: |
+          # Collect the paths into an array so they are never word-split or glob-expanded, and
+          # report an empty list explicitly instead of invoking clang-format with no inputs.
+          mapfile -t files < <(git ls-files '*.h' '*.hpp' '*.cpp' '*.cu' \
+            | grep -vE '(^|/)stb_image(_write)?\.h$')
+          if [ "${#files[@]}" -eq 0 ]; then echo "no C++/CUDA sources to check"; exit 0; fi
+          printf '%s\n' "${files[@]}"
+          clang-format --dry-run --Werror "${files[@]}"
+
+ clang-tidy:
+ name: clang-tidy (changed files)
+ runs-on: ubuntu-24.04
+ if: ${{ github.event_name == 'pull_request' }}
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ - uses: ./.github/actions/setup-trt
+ - name: Install clang-tidy
+ run: sudo apt-get install -y clang-tidy-18
+ - name: Configure (export compile_commands.json)
+ run: |
+ cmake -S . -B build \
+ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+ -DTRT_CPP_API_BUILD_TESTS=ON \
+ -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.6/bin/nvcc
+      - name: clang-tidy on changed library sources
+        env:
+          # Handed over through the environment instead of being interpolated into the script text.
+          BASE_SHA: ${{ github.event.pull_request.base.sha }}
+        run: |
+          # Lint changed C++ translation units under src/ (the shipping core). clang-tidy is run
+          # on source files, never on headers or .cu: a header has no compile command of its own,
+          # so clang-tidy infers flags from the nearest unit -- which for our headers is the CUDA
+          # kernel (preproc.cu), making it parse them in CUDA mode and fail to find CUDA toolkit
+          # headers. Headers are still checked via HeaderFilterRegex when a linted .cpp includes
+          # them. --diff-filter=d drops files the PR deletes. The extension filter already
+          # excludes .cu, so no separate exclusion is needed.
+          mapfile -t changed < <(git diff --name-only --diff-filter=d "$BASE_SHA"...HEAD \
+            | grep -E '^src/.*\.(c|cc|cpp)$')
+          if [ "${#changed[@]}" -eq 0 ]; then echo "no library source changes"; exit 0; fi
+          printf '%s\n' "${changed[@]}"
+          clang-tidy-18 -p build "${changed[@]}"
diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
new file mode 100644
index 0000000..b9f7fd1
--- /dev/null
+++ b/.github/workflows/gpu-tests.yml
@@ -0,0 +1,21 @@
+name: gpu tests
+
+# Manual-dispatch only. GitHub-hosted runners have no NVIDIA GPU, so the GPU-labeled CUDA /
+# TensorRT / OpenCV / preproc integration tests are excluded from PR CI (ctest -LE gpu). Run this
+# workflow from the Actions tab on a self-hosted runner labeled [self-hosted, gpu] to exercise the
+# full suite. It is intentionally not attached to pull_request, so it adds no per-PR check.
+on:
+ workflow_dispatch:
+
+jobs:
+ gpu-tests:
+ name: gpu tests (self-hosted)
+ runs-on: [self-hosted, gpu]
+ steps:
+ - uses: actions/checkout@v4
+ - name: Configure + build
+ run: |
+ cmake -S . -B build -DTRT_CPP_API_BUILD_TESTS=ON
+ cmake --build build -j4
+ - name: Full test suite
+ run: ctest --test-dir build --output-on-failure
diff --git a/.gitignore b/.gitignore
index 8a328d5..8e8026a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,4 +38,22 @@ cmake-build-*
.vscode/
*.onnx
+*.pt
+
+# Keep the tiny committed test fixtures (override the global *.onnx ignore)
+!tests/models/*.onnx
+
+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+.pytest_cache/
+dist/
+wheelhouse/
+
+# Generated API docs (doxygen Doxyfile)
+docs/api/
+
+# CTest working dir (created when running ctest from the repo root)
+Testing/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 273d14b..f956278 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,23 @@
+# Run `pre-commit install` once; hooks then run on `git commit`.
+# Vendored third-party single-header libs (stb) are excluded from formatting.
+exclude: '(^|/)stb_image(_write)?\.h$'
+
repos:
-- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: 'v17.0.3' # Use the sha / tag you want to point at
- hooks:
- - id: clang-format
- types_or: [c++, c, cuda]
+ - repo: https://github.com/pre-commit/mirrors-clang-format
+ rev: 'v18.1.8' # must match the pip clang-format pinned in .github/workflows/format.yml
+ hooks:
+ - id: clang-format
+ types_or: [c++, c, cuda]
+
+  - repo: https://github.com/cheshirekow/cmake-format-precommit
+    rev: v0.6.13
+    hooks:
+      - id: cmake-format
+        # The repo config is YAML (.cmake-format.yaml); the hook's isolated environment needs
+        # PyYAML installed to parse it.
+        additional_dependencies: [pyyaml]
+
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.6.0
+ hooks:
+ - id: trailing-whitespace
+ - id: end-of-file-fixer
+ - id: check-yaml
+ - id: mixed-line-ending
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..b9b915f
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,52 @@
+# AGENTS.md
+
+Orientation for AI coding agents (Claude Code, Codex, Cursor) working in this repo. Humans should
+read [`README.md`](README.md); if the two ever disagree, README wins for humans and this file wins
+for agents — keep them in lockstep.
+
+## What this is
+
+`tensorrt_cpp_api` is a C++ library that wraps NVIDIA TensorRT for CNN-class vision models: build a
+TensorRT engine from ONNX, cache it on disk, and run inference. It has optional fused GPU
+preprocessing, optional OpenCV interop, and optional zero-copy Python bindings. **Linux + NVIDIA
+GPU only.** It is **not** an LLM/transformer serving framework, and Windows is out of scope.
+
+Targets TensorRT >= 10 (written to the TensorRT 11 surface, version-gated), CUDA 12, C++20.
+
+## Build & test
+
+```bash
+cmake -S . -B build -DTRT_CPP_API_BUILD_TESTS=ON   # add -DTensorRT_DIR=<path> for a tarball TensorRT
+cmake --build build -j
+ctest --test-dir build -LE gpu # CPU-only tests; drop -LE gpu to run the full suite (needs a GPU)
+```
+
+`scripts/install_deps.sh` (one-time, needs sudo) and `scripts/verify_deps.sh` help set up a host.
+If `nvcc` is not on `PATH`, pass `-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc`.
+
+## Layout
+
+- `include/tensorrt_cpp_api/` — the public headers (one umbrella `all.h`). No `nvinfer1`, OpenCV, or
+ spdlog types appear here; the optional `preproc.h` / `opencv_interop.h` are separate.
+- `src/` — implementation; `src/detail/` is internal (TensorRT glue, cache, buffers, execution).
+- `python/` — the pybind11 `trtcpp` extension. `examples/` — reference programs. `tests/` — GoogleTest.
+
+## Conventions
+
+- **Formatting:** `.clang-format` (LLVM-based, 4-space, 140 col) and `.cmake-format.yaml`. Run
+ `pre-commit run --all-files` (or `clang-format -i`) before committing. CI enforces clang-format.
+- **Comments:** default to none; comment only the non-obvious *why*, never narrate code.
+- **No emojis** anywhere — source, comments, or commit messages.
+- **Public API:** no-throw — every fallible call returns `Status` or `Result<T>` (no exceptions);
+  name-keyed tensor IO (an `unordered_map` keyed by tensor name); caller-provided CUDA streams; no
+  third-party types leaked through public headers (PImpl + version-gating).
+- **Git:** never force-push, never amend a pushed commit, never skip hooks.
+
+## More
+
+- Usage & concepts: [`docs/quickstart.md`](docs/quickstart.md)
+- Install options: [`docs/install.md`](docs/install.md)
+- Migrating from v6: [`docs/upgrading_from_v6.md`](docs/upgrading_from_v6.md)
+
+Reference downstream consumers: [`YOLOv8-TensorRT-CPP`](https://github.com/cyrusbehr/YOLOv8-TensorRT-CPP)
+and [`YOLOv9-TensorRT-CPP`](https://github.com/cyrusbehr/YOLOv9-TensorRT-CPP).
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..0a332ff
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,79 @@
+# Changelog
+
+## v7.0.0-rc1 (2026-05-29)
+
+First release candidate of the v7 rewrite — a ground-up reimplementation into a reusable,
+installable C++ library. **This is a clean break from v6 — the API is not source-compatible.**
+See [docs/upgrading_from_v6.md](docs/upgrading_from_v6.md).
+
+### Added
+- No-throw API: every fallible call returns `Status` or `Result<T>` (no exceptions); `TRTCPP_TRY`
+  sugar for propagation.
+- Name-keyed IO (an `unordered_map` keyed by tensor name) at the inference boundary; non-templated
+  `Tensor`/`TensorView` with a runtime `DType` (replaces v6's `Engine<T>` and nested vectors).
+- Caller-owned CUDA streams (`Stream`, including `Stream::wrap` for an external handle); explicit
+ host/device transfers (`Tensor::toHost`/`to`/`copyFrom`) — never an implicit D2H copy.
+- Safe engine cache: `EngineBuilder::buildOrLoad`/`buildAndLoad` keyed by ONNX content hash +
+ build options + TensorRT version + GPU UUID, JSON sidecar, atomic write, stale-cache detection.
+- Dynamic shapes via per-input min/opt/max optimization profiles and a `-1`-aware `Shape`.
+- `EnginePool` for concurrent multi-stream inference (one optimization profile per context).
+- Quantization that is version-aware and never a silent no-op (`Precision::kFp16`/`kInt8Qdq`/
+ `kFp8`/…); legacy INT8 calibration is available only when built against TensorRT < 11.
+- Optional fused preprocessing sublibrary (`tensorrt_cpp_api::preproc`): a single CUDA kernel for
+ letterbox-resize → BGR↔RGB → per-channel normalize → HWC→NCHW → cast.
+- Optional OpenCV interop header (zero-copy views over `cv::Mat` / `cv::cuda::GpuMat`), strictly
+ opt-in behind `-DTRT_CPP_API_WITH_OPENCV`.
+- Optional Python bindings (`trtcpp`, pybind11 + scikit-build-core) with zero-copy
+ `__cuda_array_interface__`/DLPack interop, caller streams, and GIL release during inference.
+- CMake install/export: `find_package(tensorrt_cpp_api)` with a relocatable package config and a
+ bundled `FindTensorRT` module (apt or tarball).
+- Reference examples (classification, detection, segmentation, zero-copy Python) and GitHub Actions
+ CI (build + CPU tests, sanitizers, Python wheel, lint).
+
+### Changed
+- Minimum TensorRT is 10.0; the code is written to the TensorRT 11 surface and version-gates the
+ removed-in-11 features (legacy calibrators, weak typing, `IPluginV2`).
+- C++20 (was C++17); namespace `trtcpp`; include root `tensorrt_cpp_api/`; CMake target
+ `tensorrt_cpp_api::tensorrt_cpp_api`.
+
+### Removed
+- The v6 API in its entirety — no source-compatibility shim (see the upgrade guide):
+  - the monolithic templated `Engine<T>` and its `.inl`-in-header implementation;
+ - the single `Options` struct, replaced by `BuildOptions` + `EngineOptions`;
+ - OpenCV `cv::cuda::GpuMat` in the inference signatures and the triply-nested `std::vector`
+ inputs/outputs, replaced by name-keyed `TensorView`/`Tensor`;
+  - `bool`/exception error handling, replaced by `Status`/`Result<T>`;
+ - per-call stream creation, replaced by caller-owned `Stream`;
+ - the `run_inference_benchmark` CLI executable and the v6 `include/` header layout.
+
+---
+
+## v6.0
+- Implementation now requires TensorRT >= 10.0.
+
+## v5.0
+- `Engine` became a class template parameterized on the model's output data type (`float`,
+ `__half`, `int8_t`, `int32_t`, `bool`, `uint8_t`).
+- Added loading a TensorRT engine file directly (without compiling from ONNX).
+- Added a command-line parser.
+
+## v4.1
+- Support for fixed batch size > 1.
+
+## v4.0
+- Added INT8 precision support.
+
+## v3.0
+- Updated to the TensorRT 8.6 API (`IExecutionContext::enqueueV3()`).
+- Benchmark executable renamed to `run_inference_benchmark`; takes the ONNX path as an argument.
+- Auto-detect supported batch sizes; stop limiting workspace memory.
+
+## v2.2
+- Serialize the model name as part of the engine file.
+
+## v2.1
+- Support for models with multiple inputs.
+
+## v2.0
+- Requires OpenCV with CUDA.
+- Support for models with more than one output and for non-batchable models; more error checking.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3377253..35d24bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,34 +1,198 @@
-cmake_minimum_required(VERSION 3.18)
-project(tensorrt_cpp_api)
+cmake_minimum_required(VERSION 3.22)
+project(tensorrt_cpp_api VERSION 7.0.0 LANGUAGES CXX)
-# Use ccache to speed up rebuilds
-include(cmake/ccache.cmake)
+# The engine core depends only on CUDA + TensorRT. The preprocessing sublibrary, OpenCV interop,
+# Python bindings, tests, and examples are independent, opt-in components; all default OFF except
+# the preprocessing sublibrary.
-# Set C++ version and optimization level
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations")
+option(TRT_CPP_API_BUILD_TESTS "Build unit/integration tests" OFF)
+option(TRT_CPP_API_BUILD_EXAMPLES "Build reference examples" OFF)
+option(TRT_CPP_API_BUILD_PREPROC "Build the fused preprocessing sublibrary" ON)
+option(TRT_CPP_API_BUILD_PYTHON "Build the pybind11 Python bindings" OFF)
+option(TRT_CPP_API_WITH_OPENCV "Build the optional OpenCV interop" OFF)
+option(TRT_CPP_API_WITH_SPDLOG "Build the optional spdlog logger adapter" OFF)
-# For finding FindTensorRT.cmake
-set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-# TODO: Specify the path to TensorRT root dir
-if (NOT TensorRT_DIR)
- set(TensorRT_DIR /home/cyrus/work/libs/TensorRT-10.0.0.6/)
+# Enable CTest at the top level so tests registered in subdirectories (tests/, python/) are
+# discoverable. Called early -- before add_subdirectory(python) registers the binding test.
+if(TRT_CPP_API_BUILD_TESTS)
+ enable_testing()
endif()
-# Use the correct version of CUDA
-set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)
-# We require CUDA, OpenCV, and TensorRT
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+ set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+endif()
+
+# Modern ccache hook (replaces v6's generator-specific RULE_LAUNCH_COMPILE). It only fills in a
+# launcher the user has not already chosen (e.g. -DCMAKE_CXX_COMPILER_LAUNCHER=sccache), and it
+# covers CUDA too so the preprocessing kernels are cached once that language is enabled below.
+find_program(CCACHE_PROGRAM ccache)
+if(CCACHE_PROGRAM)
+    foreach(_trtcpp_lang IN ITEMS CXX CUDA)
+        if(NOT DEFINED CMAKE_${_trtcpp_lang}_COMPILER_LAUNCHER)
+            set(CMAKE_${_trtcpp_lang}_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+        endif()
+    endforeach()
+endif()
+
+# CUDA runtime: a hard dependency of the engine core (streams, allocation, Tensor cross
+# the public API). Plain C++ against the CUDA runtime API -- nvcc is only needed later for
+# the preprocessing kernels (.cu), at which point CUDA is enabled as a language.
+find_package(CUDAToolkit REQUIRED)
+
+# TensorRT (apt or tarball). Linked PRIVATE: nvinfer1 types never appear in public headers
+# (PImpl/forward-decl), so consumers need TRT at runtime but not at compile time. Pass
+# -DTensorRT_DIR=<path> for a tarball install.
find_package(TensorRT REQUIRED)
-find_package(CUDA REQUIRED)
-find_package(OpenCV REQUIRED)
-find_package(fmt REQUIRED)
-add_library(tensorrt_cpp_api SHARED
- src/engine.cpp)
+# Threads: the logger and EnginePool use std::mutex/condition_variable. Linked PRIVATE
+# (PImpl, so no thread types in public headers); the package config re-resolves it.
+find_package(Threads REQUIRED)
+
+# Generate build_config.h recording the library + TensorRT versions, so public headers can
+# version-gate (e.g. the legacy calibrator) without including a TensorRT header.
+configure_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/cmake/build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/generated/tensorrt_cpp_api/build_config.h
+ @ONLY
+)
+
+add_library(tensorrt_cpp_api)
+add_library(tensorrt_cpp_api::tensorrt_cpp_api ALIAS tensorrt_cpp_api)
+target_sources(
+ tensorrt_cpp_api
+ PRIVATE src/core.cpp
+ src/logger.cpp
+ src/cuda.cpp
+ src/allocator.cpp
+ src/device_tensor.cpp
+ src/engine_builder.cpp
+ src/engine.cpp
+ src/engine_pool.cpp
+ src/calibrator.cpp
+ src/detail/buffers.cpp
+ src/detail/calibrator_bridge.cpp
+ src/detail/engine_cache.cpp
+ src/detail/execution.cpp
+ src/detail/sha256.cpp
+ src/detail/trt_common.cpp
+)
+target_include_directories(tensorrt_cpp_api PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+target_link_libraries(tensorrt_cpp_api PUBLIC CUDA::cudart PRIVATE TensorRT::TensorRT Threads::Threads)
+# Build-tree include roots only: the source-tree public headers and the directory holding the
+# configured build_config.h. The install-tree include dir is supplied once by the
+# INCLUDES DESTINATION of install(TARGETS) below (both the public headers and the generated
+# build_config.h land under the same <prefix>/include tree), so no $<INSTALL_INTERFACE:...> here.
+target_include_directories(
+    tensorrt_cpp_api PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+                            $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/generated>
+)
+target_compile_features(tensorrt_cpp_api PUBLIC cxx_std_20)
+# POSITION_INDEPENDENT_CODE: the static core is routinely linked into shared objects -- the Python
+# extension and downstream shared libraries such as the YOLO sibling repos' libYoloV8_TRT.so -- so
+# it must be PIC. (Negligible cost; standard for an embeddable static library.)
+set_target_properties(
+ tensorrt_cpp_api PROPERTIES CXX_EXTENSIONS OFF POSITION_INDEPENDENT_CODE ON VERSION ${PROJECT_VERSION}
+ SOVERSION ${PROJECT_VERSION_MAJOR}
+)
+if(NOT MSVC)
+ target_compile_options(tensorrt_cpp_api PRIVATE -Wall -Wextra)
+endif()
+
+# Optional spdlog logger adapter. PUBLIC so makeSpdlogLogger() is declared and
+# spdlog is linked for consumers of a spdlog-enabled install.
+if(TRT_CPP_API_WITH_SPDLOG)
+ find_package(spdlog REQUIRED)
+ target_link_libraries(tensorrt_cpp_api PUBLIC spdlog::spdlog)
+ target_compile_definitions(tensorrt_cpp_api PUBLIC TRT_CPP_API_WITH_SPDLOG)
+endif()
+
+# Optional OpenCV interop. Requests only the `core` module (which carries cv::Mat
+# and cv::cuda::GpuMat) to avoid pulling opencv_dnn and its cuDNN dependency. Strictly
+# opt-in; the engine core has no OpenCV in any non-gated header.
+if(TRT_CPP_API_WITH_OPENCV)
+ find_package(OpenCV REQUIRED COMPONENTS core)
+ target_sources(tensorrt_cpp_api PRIVATE src/opencv_interop.cpp)
+ target_link_libraries(tensorrt_cpp_api PUBLIC ${OpenCV_LIBS})
+ target_include_directories(tensorrt_cpp_api PUBLIC ${OpenCV_INCLUDE_DIRS})
+ target_compile_definitions(tensorrt_cpp_api PUBLIC TRT_CPP_API_WITH_OPENCV)
+endif()
+
+# Optional preprocessing sublibrary: one fused CUDA (.cu) kernel. Separate target --
+# the engine core does NOT depend on it. Enabling it turns on the CUDA language (needs nvcc).
+if(TRT_CPP_API_BUILD_PREPROC)
+ # Architectures must be set BEFORE enable_language(CUDA): CMake validates the variable
+ # during language init, and an unset value on hosts where nvcc can't self-detect a default
+ # aborts configuration.
+ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+ set(CMAKE_CUDA_ARCHITECTURES 75-real 80-real 86-real 89-real 90-virtual)
+ endif()
+ enable_language(CUDA)
+ add_library(tensorrt_cpp_api_preproc src/preproc.cu)
+ add_library(tensorrt_cpp_api::preproc ALIAS tensorrt_cpp_api_preproc)
+ target_link_libraries(tensorrt_cpp_api_preproc PUBLIC tensorrt_cpp_api::tensorrt_cpp_api CUDA::cudart)
+ target_compile_features(tensorrt_cpp_api_preproc PUBLIC cxx_std_20)
+ set_target_properties(
+ tensorrt_cpp_api_preproc
+ PROPERTIES CUDA_STANDARD 20
+ CUDA_STANDARD_REQUIRED ON
+ CXX_EXTENSIONS OFF
+ POSITION_INDEPENDENT_CODE ON # embeddable into shared libs (Python ext, sibling .so)
+ EXPORT_NAME preproc # consumers link tensorrt_cpp_api::preproc (matches the in-tree alias)
+ VERSION ${PROJECT_VERSION}
+ SOVERSION ${PROJECT_VERSION_MAJOR}
+ )
+endif()
+
+# Python bindings. Added after the core/preproc targets exist so the extension can link
+# them. The subdirectory builds trtcpp._core.
+if(TRT_CPP_API_BUILD_PYTHON)
+ add_subdirectory(python)
+endif()
+
+# Reference examples. Normally built standalone against the installed package; this in-tree
+# path is a convenience that reuses the just-built targets.
+if(TRT_CPP_API_BUILD_EXAMPLES)
+ add_subdirectory(examples)
+endif()
+
+# ---- install / export (find_package(tensorrt_cpp_api) consumable) ----
+# Skipped under scikit-build-core (SKBUILD): a Python wheel must contain only the Python package
+# and the compiled extension, not the C++ dev headers / static libs / CMake package config.
+if(NOT SKBUILD)
+ include(GNUInstallDirs)
+ include(CMakePackageConfigHelpers)
+ set(TRTCPP_INSTALL_CMAKEDIR ${CMAKE_INSTALL_LIBDIR}/cmake/tensorrt_cpp_api)
+
+ set(TRTCPP_INSTALL_TARGETS tensorrt_cpp_api)
+ if(TARGET tensorrt_cpp_api_preproc)
+ list(APPEND TRTCPP_INSTALL_TARGETS tensorrt_cpp_api_preproc)
+ endif()
+
+ install(
+ TARGETS ${TRTCPP_INSTALL_TARGETS}
+ EXPORT tensorrt_cpp_apiTargets
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ INCLUDES
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+ )
-target_include_directories(tensorrt_cpp_api PUBLIC ${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS} ${TensorRT_INCLUDE_DIRS} include include/interfaces)
-target_link_libraries(tensorrt_cpp_api PUBLIC ${OpenCV_LIBS} ${CUDA_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${TensorRT_LIBRARIES} fmt::fmt)
+ install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/tensorrt_cpp_api/build_config.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/tensorrt_cpp_api
+ )
+ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindTensorRT.cmake DESTINATION ${TRTCPP_INSTALL_CMAKEDIR})
-add_executable(run_inference_benchmark src/main.cpp)
-target_link_libraries(run_inference_benchmark tensorrt_cpp_api fmt::fmt)
\ No newline at end of file
+ install(EXPORT tensorrt_cpp_apiTargets NAMESPACE tensorrt_cpp_api:: DESTINATION ${TRTCPP_INSTALL_CMAKEDIR})
+
+ configure_package_config_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/cmake/tensorrt_cpp_api-config.cmake.in
+ ${CMAKE_CURRENT_BINARY_DIR}/tensorrt_cpp_api-config.cmake INSTALL_DESTINATION ${TRTCPP_INSTALL_CMAKEDIR}
+ )
+ write_basic_package_version_file(
+ ${CMAKE_CURRENT_BINARY_DIR}/tensorrt_cpp_api-config-version.cmake VERSION ${PROJECT_VERSION}
+ COMPATIBILITY SameMajorVersion
+ )
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tensorrt_cpp_api-config.cmake
+ ${CMAKE_CURRENT_BINARY_DIR}/tensorrt_cpp_api-config-version.cmake DESTINATION ${TRTCPP_INSTALL_CMAKEDIR}
+ )
+endif() # NOT SKBUILD
+
+if(TRT_CPP_API_BUILD_TESTS)
+ add_subdirectory(tests)
+endif()
diff --git a/Doxyfile b/Doxyfile
new file mode 100644
index 0000000..836bbe9
--- /dev/null
+++ b/Doxyfile
@@ -0,0 +1,36 @@
+# Doxygen config for the tensorrt_cpp_api public API reference.
+# doxygen Doxyfile # -> HTML in docs/api/html/index.html
+# Only the public headers are documented; src/ (impl + detail) and third-party are excluded.
+
+PROJECT_NAME = "tensorrt_cpp_api"
+PROJECT_NUMBER = 7.0.0
+PROJECT_BRIEF = "A no-throw C++ TensorRT inference library for CNN models"
+OUTPUT_DIRECTORY = docs/api
+
+INPUT = include README.md docs/quickstart.md
+USE_MDFILE_AS_MAINPAGE = README.md
+RECURSIVE = YES
+FILE_PATTERNS = *.h *.hpp
+EXCLUDE_PATTERNS = */build_config.h # generated at build time
+
+# The headers use /// and /** */ comments; treat the first sentence as the brief.
+JAVADOC_AUTOBRIEF = YES
+QT_AUTOBRIEF = YES
+EXTRACT_ALL = YES
+EXTRACT_STATIC = YES
+HIDE_UNDOC_MEMBERS = NO
+SHOW_NAMESPACES = YES
+
+# C++20; let Doxygen see the project's own macros without a full build.
+ENABLE_PREPROCESSING = YES
+MACRO_EXPANSION = YES
+EXPAND_ONLY_PREDEF = NO
+PREDEFINED = TRT_CPP_API_WITH_OPENCV=1 \
+ TRT_CPP_API_WITH_SPDLOG=1
+
+GENERATE_HTML = YES
+GENERATE_LATEX = NO
+GENERATE_TREEVIEW = YES
+SORT_MEMBER_DOCS = NO
+QUIET = YES
+WARN_IF_UNDOCUMENTED = NO
diff --git a/README.md b/README.md
index 741190e..83583f6 100644
--- a/README.md
+++ b/README.md
@@ -1,178 +1,146 @@
[![Stargazers][stars-shield]][stars-url]
-
-[](#contributors-)
-
[![Issues][issues-shield]][issues-url]
[![LinkedIn][linkedin-shield]][linkedin-url]
-
-
TensorRT C++ API Tutorial
+ TensorRT C++ API
-
- How to use TensorRT C++ API for high performance GPU machine-learning inference.
-
-
- Supports models with single / multiple inputs and single / multiple outputs with batching.
-
-
- Project Overview Video
- .
- Code Deep-Dive Video
+ A modern, no-throw C++ library for high-performance GPU inference of CNN models with NVIDIA TensorRT — with optional zero-copy Python bindings.
-## Looking for Maintainers 🚀
-
-This project is actively seeking maintainers to help guide its growth and improvement. If you're passionate about this project and interested in contributing, I’d love to hear from you!
-
-Please feel free to reach out via [LinkedIn](https://www.linkedin.com/in/cyrus-behroozi/) to discuss how you can get involved.
-
-
-# TensorRT C++ Tutorial
-*I read all the NVIDIA TensorRT docs so that you don't have to!*
-
-This project demonstrates how to use the TensorRT C++ API for high performance GPU inference on image data. It covers how to do the following:
-- How to install TensorRT 10 on Ubuntu 20.04 / 22.04.
-- How to generate a TensorRT engine file optimized for your GPU.
-- How to specify a simple optimization profile.
-- How to run FP32, FP16, or INT8 precision inference.
-- How to read / write data from / into GPU memory and work with GPU images.
-- How to use cuda stream to run async inference and later synchronize.
-- How to work with models with static and dynamic batch sizes.
-- How to work with models with single or multiple output tensors.
-- How to work with models with multiple inputs.
-- Includes a [Video walkthrough](https://youtu.be/Z0n5aLmcRHQ) where I explain every line of code.
-- The code can be used as a base for any model which takes a fixed size image / images as input, including [Insightface](https://github.com/deepinsight/insightface) [ArcFace](https://github.com/onnx/models/tree/main/vision/body_analysis/arcface), [YoloV8](https://github.com/ultralytics/ultralytics), [SCRFD](https://insightface.ai/scrfd) face detection.
- - You will just need to implement the appropriate post-processing code.
-- TODO: Add support for models with dynamic input shapes.
-- TODO: Add support for Windows
-
-## Getting Started
-The following instructions assume you are using Ubuntu 20.04 or 22.04.
-You will need to supply your own onnx model for this sample code or you can download the sample model (see Sanity Check section below).
-
-### Prerequisites
-- Tested and working on Ubuntu 20.04 and 22.04 (Windows is **not** supported at this time)
-- Install CUDA 11 or 12, instructions [here](https://developer.nvidia.com/cuda-downloads).
- - Recommended >= 12.0
- - Required >= 11.0
-- Install cudnn, instructions [here](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#download).
- - Required >= 8
- - Required < 9 (OpenCV GPU does not yet support)
-- `sudo apt install build-essential`
-- `sudo snap install cmake --classic`
-- `sudo apt install libspdlog-dev libfmt-dev` (for logging)
-- Install OpenCV with cuda support. To compile OpenCV from source, run the `build_opencv.sh` script provided in `./scripts/`.
- - If you use the provided script and you have installed cuDNN to a non-standard location, you must modify the `CUDNN_INCLUDE_DIR` and `CUDNN_LIBRARY` variables in the script.
- - Recommended >= 4.8
-- Download TensorRT 10 from [here](https://developer.nvidia.com/tensorrt/download/10x).
- - Required >= 10.0
-- Navigate to the `CMakeLists.txt` file and replace the `TODO` with the path to your TensorRT installation.
-
-### Building the Library
-- `mkdir build`
-- `cd build`
-- `cmake ..`
-- `make -j$(nproc)`
-
-### Running the Executable
-- Navigate to the build directory
-- Run the executable and provide the path to your onnx model.
-- ex. `./run_inference_benchmark --onnx_model ../models/yolov8n.onnx`
- - Note: See sanity check section below for instructions on how to obtain the yolov8n model.
-- The first time you run the executable for a given model and options, a TensorRT engine file will be built from your onnx model. This process is fairly slow and can take 5+ minutes for some models (ex. yolo models).
-- Alternatively, you can choose to supply your own TensorRT engine file directly:
-- ex. `./run_inference_benchmark --trt_model ../models/yolov8n.engine.NVIDIAGeForceRTX3080LaptopGPU.fp16.1.1`
- - Note: See V5.0 changelog below for warnings when supply your own TensorRT engine file.
-
-### Sanity Check
-- To perform a sanity check, download the `YOLOv8n` model from [here](https://github.com/ultralytics/ultralytics#models).
-- Next, convert it from pytorch to onnx using the following script:
- - You will need to run `pip3 install ultralytics` first.
-
-```python
-from ultralytics import YOLO
-model = YOLO("./yolov8n.pt")
-model.fuse()
-model.info(verbose=False) # Print model information
-model.export(format="onnx", opset=12) # Export the model to onnx using opset 12
+---
+
+`tensorrt_cpp_api` turns an ONNX model into a cached, optimized TensorRT engine and runs it with
+a small, leak-free API: name-keyed tensors at the boundary, caller-owned CUDA streams, explicit
+host/device transfers, and a `Status`/`Result` error model — no exceptions, no OpenCV or
+TensorRT types in the public headers. It targets **TensorRT ≥ 10** (built to the TensorRT 11
+surface), CUDA 12, C++20, Linux.
+
+```cpp
+#include
+using namespace trtcpp;
+
+int main() {
+ // Build an FP16 engine from ONNX, or load it from the on-disk cache if one is already current.
+ BuildOptions opt;
+ opt.precision = Precision::kFp16;
+ opt.engineCacheDir = "engines";
+ auto engine = EngineBuilder{}.buildAndLoad("model.onnx", opt);
+ if (!engine) {
+ std::fprintf(stderr, "%s\n", engine.status().message().c_str());
+ return 1;
+ }
+
+ Stream stream; // owns a CUDA stream — or Stream::wrap(existingHandle) to use yours
+ auto input = Tensor::allocate(DType::kFloat32, Shape{1, 3, 640, 640}, Device::kCuda).value();
+ // ... fill `input` (e.g. via the fused preproc kernel) ...
+
+ auto output = engine->inferSingle({{engine->inputNames().front(), input.view()}}, stream);
+ if (!output) return 1;
+
+ auto host = output->toHost(stream).value(); // explicit D2H + sync; never implicit
+ std::span<const float> scores = host.as<float>().value();
+ // ... post-process `scores` ...
+}
```
-- Place the resulting onnx model, `yolov8n.onnx`, in the `./models/` directory.
-- Running inference using said model and the image located in `./inputs/team.jpg` should produce the following feature vector:
- - Note: The feature vector will not be identical (but very similar) as [TensorRT is not deterministic](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#determinism).
-```text
-3.41113 16.5312 20.8828 29.8984 43.7266 54.9609 62.0625 65.8594 70.0312 72.9531 ...
-```
+## Features
+
+- **Engine cache that's actually safe.** Build-or-load keyed by ONNX content hash + build options
+ + TensorRT version + GPU UUID, with a JSON sidecar and atomic writes. A stale cache (changed
+ model, options, driver, or GPU) is detected and rebuilt instead of silently misused.
+- **Dynamic shapes, done right.** Per-input min/opt/max optimization profiles; `-1`-aware `Shape`;
+ one optimization profile per execution context for concurrent dynamic-shape inference.
+- **Concurrency.** `EnginePool` leases execution contexts for multi-stream inference; every call
+ runs on a caller-provided `Stream`. The engine is thread-compatible; the pool is thread-safe.
+- **No leaky abstractions.** No `nvinfer1`, OpenCV, or spdlog types in any public header (PImpl +
+ version-gated, generated `build_config.h`). Consumers need TensorRT at runtime, not compile time.
+- **Quantization without surprises.** `Precision::kFp16` / `kInt8Qdq` / `kFp8` …; precision is
+ version-aware and never a silent no-op (it errors clearly when a mode isn't achievable).
+- **Optional fused preprocessing** (`tensorrt_cpp_api::preproc`): one CUDA kernel does
+ letterbox-resize → BGR↔RGB → per-channel normalize → HWC→NCHW → cast, no intermediate buffers.
+- **Optional zero-copy Python bindings** (`trtcpp`): feed CuPy / PyTorch / Numba GPU arrays in and
+ get them back via `__cuda_array_interface__` / DLPack — no host round-trips, GIL released during
+ inference. See [`examples/python`](examples/python).
+- **Installable.** `cmake --install` produces a `find_package(tensorrt_cpp_api)`-consumable package.
+
+## Performance
+
+Single-stream inference latency on an **RTX 3080 Laptop GPU** (preallocated, zero-copy `enqueue`
+loop — `examples/benchmark`), TensorRT 10:
+
+| Model | Precision | Latency | Throughput |
+|---|---|---|---|
+| YOLOv8n | FP16 | 1.07 ms | 937 inf/s |
+| YOLOv8n | FP32 | 2.00 ms | 499 inf/s |
+| MobileNetV2 | FP16 | 0.31 ms | 3199 inf/s |
+
+Inference time is TensorRT-bound — it is the `enqueueV3` cost of the engine, so the wrapper adds
+**no** inference overhead (v6 and v7 run the identical engine on identical hardware in the same
+time). v7's gains are on the host side and in safety: zero-copy name-keyed IO with no per-call
+allocations or nested-vector copies, a stream-ordered allocator, and the no-throw `Status`/`Result`
+API. The Python bindings run the same path within ~13% of C++ (`examples/python/benchmark_parity.py`).
+
+> For reference, v6's published figures (a weaker RTX 3050 Ti Laptop GPU) were YOLOv8n FP16
+> 2.49 ms / FP32 4.73 ms; the headline difference above is the GPU, not the wrapper.
+
+## Install
+
+TensorRT and CUDA are system/externally provided. In brief:
-### INT8 Inference
-Enabling INT8 precision can further speed up inference at the cost of accuracy reduction due to reduced dynamic range.
-For INT8 precision, the user must supply calibration data which is representative of real data the model will see.
-It is advised to use 1K+ calibration images. To enable INT8 inference with the YoloV8 sanity check model, the following steps must be taken:
-- Change `options.precision = Precision::FP16;` to `options.precision = Precision::INT8;` in `main.cpp`
-- `options.calibrationDataDirectoryPath = "";` must be changed in `main.cpp` to specify path containing calibration data.
- - If using the YoloV8 model, it is advised to used the COCO validation dataset, which can be downloaded with `wget http://images.cocodataset.org/zips/val2017.zip`
-- Make sure the resizing code in the `Int8EntropyCalibrator2::getBatch` method in `engine.cpp` (see `TODO`) is correct for your model.
- - If using the YoloV8 model, the preprocessing code is correct and does not need to be changed.
-- Recompile, run the executable.
-- The calibration cache will be written to disk (`.calibration` extension) so that on subsequent model optimizations it can be reused. If you'd like to regenerate the calibration data, you must delete this cache file.
-- If you get an "out of memory in function allocate" error, then you must reduce `Options.calibrationBatchSize` so that the entire batch can fit in your GPU memory.
-
-### Benchmarks
-Benchmarks run on RTX 3050 Ti Laptop GPU, 11th Gen Intel(R) Core(TM) i9-11900H @ 2.50GHz.
-
-| Model | Precision | Batch Size | Avg Inference Time |
-|---------|-----------|------------|--------------------|
-| yolov8n | FP32 | 1 | 4.732 ms |
-| yolov8n | FP16 | 1 | 2.493 ms |
-| yolov8n | INT8 | 1 | 2.009 ms |
-| yolov8x | FP32 | 1 | 76.63 ms |
-| yolov8x | FP16 | 1 | 25.08 ms |
-| yolov8x | INT8 | 1 | 11.62 ms |
-
-### Sample Integration
-Wondering how to integrate this library into your project? Or perhaps how to read the outputs of the YoloV8 model to extract meaningful information?
-If so, check out my two latest projects, [YOLOv8-TensorRT-CPP](https://github.com/cyrusbehr/YOLOv8-TensorRT-CPP) and [YOLOv9-TensorRT-CPP](https://github.com/cyrusbehr/YOLOv9-TensorRT-CPP), which demonstrate how to use the TensorRT C++ API to run YoloV8/9 inference (supports object detection, semantic segmentation, and body pose estimation). They make use of this project in the backend!
-
-### Project Structure
```sh
-project-root/
-├── include/
-│ ├── engine/
-│ │ ├── EngineRunInference.inl
-│ │ ├── EngineUtilities.inl
-│ │ └── EngineBuildLoadNetwork.inl
-│ ├── util/...
-│ ├── ...
-├── src/
-| ├── ...
-│ ├── engine.cpp
-│ ├── engine.h
-│ └── main.cpp
-├── CMakeLists.txt
-└── README.md
+cmake -S . -B build -DTRT_CPP_API_BUILD_PREPROC=ON # add -DTensorRT_DIR=<path> for a tarball
+cmake --build build -j$(nproc)
+cmake --install build --prefix /opt/trtcpp
```
-### Understanding the Code
-- The bulk of the implementation is located in `include/engine`. I have written lots of comments all throughout the code which should make it easy to understand what is going on.
-- The inference code is located in `include/engine/EngineRunInference.inl`.
-- The building and loading of the TensorRT engine file is located in `include/engine/EngineBuildLoadNetwork.inl`.
-- You can also check out my [deep-dive video](https://youtu.be/Z0n5aLmcRHQ) in which I explain every line of code.
+Then in a downstream project:
+
+```cmake
+find_package(tensorrt_cpp_api REQUIRED)
+target_link_libraries(myapp PRIVATE tensorrt_cpp_api::tensorrt_cpp_api tensorrt_cpp_api::preproc)
+```
+
+Python: `pip install .` (builds the `trtcpp` wheel via scikit-build-core). Full details —
+apt vs tarball TensorRT, build options, Python — are in [`docs/install.md`](docs/install.md).
+
+## Examples
+
+[`examples/`](examples) has four runnable reference programs, each consuming the installed package:
+**classification** (ImageNet top-5), **detection** (YOLOv8n + NMS), **segmentation** (DeepLabV3),
+and a **zero-copy Python** demo with a C++/Python perf-parity benchmark. `examples/download_models.sh`
+fetches the models.
+
+## Documentation
+
+- [Quickstart & core concepts](docs/quickstart.md)
+- [Installation](docs/install.md)
+- [Upgrading from v6](docs/upgrading_from_v6.md)
+- API reference: `doxygen Doxyfile` (HTML in `docs/api/`)
+
+## Sister projects
+
+This library is the inference backend for [YOLOv8-TensorRT-CPP](https://github.com/cyrusbehr/YOLOv8-TensorRT-CPP)
+and [YOLOv9-TensorRT-CPP](https://github.com/cyrusbehr/YOLOv9-TensorRT-CPP) (object detection,
+segmentation, pose).
-### How to Debug
-- The implementation uses the `spdlog` library for logging. You can change the log level by setting the environment variable `LOG_LEVEL` to one of the following values: `trace`, `debug`, `info`, `warn`, `error`, `critical`, `off`.
+## Scope
-- If you have issues creating the TensorRT engine file from the onnx model, consider setting the environment variable `LOG_LEVEL` to `trace` and re-run the application. This should give you more information on where exactly the build process is failing.
+Linux, CUDA 12, TensorRT 10.0–11.x, CNN-style vision models. Windows and LLM/transformer-specific
+features are out of scope.
-### Show your Appreciation
-If this project was helpful to you, I would appreciate if you could give it a star. That will encourage me to ensure it's up to date and solve issues quickly. I also do consulting work if you require more specific help. Connect with me on [LinkedIn](https://www.linkedin.com/in/cyrus-behroozi/).
+## Contributing
+
+Issues and PRs welcome. Install the hooks with `pre-commit install` (clang-format + cmake-format);
+CI runs the build, the CPU test suite, sanitizers, and a Python wheel build. If this project helps
+you, a ⭐ is appreciated — connect on [LinkedIn](https://www.linkedin.com/in/cyrus-behroozi/).
### Contributors
@@ -188,76 +156,20 @@ If this project was helpful to you, I would appreciate if you could give it a st
-
-
-### Changelog
-
-**V6.0**
-
-- Implementation now requires TensorRT >= 10.0.
-
-**V5.0**
-
-- `Engine` class has been modified to take a template parameter which specifies the models output data type. The implementation now supports outputs of type `float`, `__half`, `int8_t`, `int32_t`, `bool`, and `uint8_t`.
-- Added support for loading TensorRT engine file directly without needing to compile from onnx model. Howver, it is highly recommended that you use the API provided to build the engine file from the onnx model, instead of loading a TensorRT model directly. If you choose to load a TensorRT model file directly, you must hand-check that the `Options` have been set correctly for your model (for example, if your model has been compiled for FP32 but you try running FP16 inference, it will fail, potentially without a verbose error).
-- Added command line parser.
-
-**V4.1**
-
-- Added support for fixed batch size > 1.
+This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification.
-**V4.0**
+## License
-- Added support for INT8 precision.
-
-
-**V3.0**
-
-- Implementation has been updated to use TensorRT 8.6 API (ex. `IExecutionContext::enqueueV3()`).
-- Executable has renamed from `driver` to `run_inference_benchmark` and now must be passed path to onnx model as command line argument.
-- Removed `Options.doesSupportDynamicBatchSize`. Implementation now auto-detects supported batch sizes.
-- Removed `Options.maxWorkspaceSize`. Implementation now does not limit GPU memory during model constructions, allowing implementation to use as much of memory pool as is available for intermediate layers.
-
-**v2.2**
-
-- Serialize model name as part of engine file.
-
-**V2.1**
-
-- Added support for models with multiple inputs. Implementation now supports models with single inputs, multiple inputs, single outputs, multiple outputs, and batching.
-
-**V2.0**
-
-- Requires OpenCV cuda to be installed. To install, follow instructions [here](https://gist.github.com/raulqf/f42c718a658cddc16f9df07ecc627be7).
-- `Options.optBatchSizes` has been removed, replaced by `Options.optBatchSize`.
-- Support models with more than a single output (ex. SCRFD).
-- Added support for models which do not support batch inference (first input dimension is fixed).
-- More error checking.
-- Fixed a bunch of common issues people were running into with the original V1.0 version.
-- Remove whitespace from GPU device name
+See [LICENSE](LICENSE). Version history is in [CHANGELOG.md](CHANGELOG.md).
-
[stars-shield]: https://img.shields.io/github/stars/cyrusbehr/tensorrt-cpp-api.svg?style=flat-square
[stars-url]: https://github.com/cyrusbehr/tensorrt-cpp-api/stargazers
[issues-shield]: https://img.shields.io/github/issues/cyrusbehr/tensorrt-cpp-api.svg?style=flat-square
[issues-url]: https://github.com/cyrusbehr/tensorrt-cpp-api/issues
[linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=flat-square&logo=linkedin&colorB=555
[linkedin-url]: https://linkedin.com/in/cyrus-behroozi/
-
-## Contributors ✨
-
-Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
-
-
-
-
-
-
-
-
-This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
diff --git a/cmake/FindTensorRT.cmake b/cmake/FindTensorRT.cmake
index adfa71a..f89ce2f 100644
--- a/cmake/FindTensorRT.cmake
+++ b/cmake/FindTensorRT.cmake
@@ -1,81 +1,72 @@
-# source:
-# https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake
-
-# This module defines the following variables:
-#
-# ::
-#
-# TensorRT_INCLUDE_DIRS
-# TensorRT_LIBRARIES
-# TensorRT_FOUND
-#
-# ::
+# FindTensorRT.cmake -- locate a TensorRT install (NVIDIA apt repo OR tarball) and expose
+# the imported target TensorRT::TensorRT (nvinfer + nvonnxparser + headers). Supports
+# TensorRT 10.0 through 11.x and errors clearly otherwise. Relocatable: it bakes no
+# build-tree paths, so it can be installed alongside the package config (Phase E14/H).
#
-# TensorRT_VERSION_STRING - version (x.y.z)
-# TensorRT_VERSION_MAJOR - major version (x)
-# TensorRT_VERSION_MINOR - minor version (y)
-# TensorRT_VERSION_PATCH - patch version (z)
-#
-# Hints
-# ^^^^^
-# A user may set ``TensorRT_DIR`` to an installation root to tell this module where to look.
-#
-set(_TensorRT_SEARCHES)
+# Hints: set -DTensorRT_DIR=<path> (or the TensorRT_DIR env var) to point at a tarball; on a
+# host with libnvinfer-dev from the NVIDIA apt repo no hint is needed.
+set(_trt_hints)
if(TensorRT_DIR)
- set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_DIR} NO_DEFAULT_PATH)
- list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT)
+ list(APPEND _trt_hints "${TensorRT_DIR}")
endif()
-
-# appends some common paths
-set(_TensorRT_SEARCH_NORMAL
- PATHS "/usr"
- )
-list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL)
-
-# Include dir
-foreach(search ${_TensorRT_SEARCHES})
- find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include)
-endforeach()
-
-if(NOT TensorRT_LIBRARY)
- foreach(search ${_TensorRT_SEARCHES})
- find_library(TensorRT_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib)
- endforeach()
+if(DEFINED ENV{TensorRT_DIR})
+ list(APPEND _trt_hints "$ENV{TensorRT_DIR}")
endif()
-if(NOT TensorRT_NVONNXPARSER_LIBRARY)
- foreach(search ${_TensorRT_SEARCHES})
- find_library(TensorRT_NVONNXPARSER_LIBRARY NAMES nvonnxparser ${${search}} PATH_SUFFIXES lib)
- endforeach()
-endif()
+find_path(TensorRT_INCLUDE_DIR
+ NAMES NvInfer.h
+ HINTS ${_trt_hints}
+ PATH_SUFFIXES include
+ PATHS /usr/include/x86_64-linux-gnu /usr/include /usr/local/include /usr/local/tensorrt/include)
-mark_as_advanced(TensorRT_INCLUDE_DIR)
+find_library(TensorRT_nvinfer_LIBRARY
+ NAMES nvinfer
+ HINTS ${_trt_hints}
+ PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib
+ PATHS /usr/lib/x86_64-linux-gnu /usr/lib /usr/local/lib)
-if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h")
- file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
- file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
- file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$")
+find_library(TensorRT_nvonnxparser_LIBRARY
+ NAMES nvonnxparser
+ HINTS ${_trt_hints}
+ PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib
+ PATHS /usr/lib/x86_64-linux-gnu /usr/lib /usr/local/lib)
- string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}")
- string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}")
- string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}")
- set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}")
+if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h")
+ file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" _trt_ver_lines REGEX "#define NV_TENSORRT_(MAJOR|MINOR|PATCH) ")
+ string(REGEX REPLACE ".*NV_TENSORRT_MAJOR ([0-9]+).*" "\\1" TensorRT_VERSION_MAJOR "${_trt_ver_lines}")
+ string(REGEX REPLACE ".*NV_TENSORRT_MINOR ([0-9]+).*" "\\1" TensorRT_VERSION_MINOR "${_trt_ver_lines}")
+ string(REGEX REPLACE ".*NV_TENSORRT_PATCH ([0-9]+).*" "\\1" TensorRT_VERSION_PATCH "${_trt_ver_lines}")
+ set(TensorRT_VERSION "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}")
endif()
include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING)
+find_package_handle_standard_args(TensorRT
+ REQUIRED_VARS TensorRT_nvinfer_LIBRARY TensorRT_nvonnxparser_LIBRARY TensorRT_INCLUDE_DIR
+ VERSION_VAR TensorRT_VERSION)
if(TensorRT_FOUND)
- set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR})
-
- if(NOT TensorRT_LIBRARIES)
- set(TensorRT_LIBRARIES ${TensorRT_LIBRARY} ${TensorRT_NVONNXPARSER_LIBRARY} ${TensorRT_NVPARSERS_LIBRARY})
+ if(TensorRT_VERSION VERSION_LESS "10.0" OR NOT TensorRT_VERSION VERSION_LESS "12.0")
+ message(FATAL_ERROR
+ "tensorrt_cpp_api requires TensorRT 10.0 - 11.x, but found ${TensorRT_VERSION} at "
+ "${TensorRT_INCLUDE_DIR}.\n"
+ " Point -DTensorRT_DIR=<path> at a supported tarball, or install libnvinfer-dev from "
+ "the NVIDIA apt repo (scripts/install_deps.sh).") # hard stop: detected version is outside 10.0 - 11.x
endif()
+ if(NOT TARGET TensorRT::nvonnxparser)
+ add_library(TensorRT::nvonnxparser UNKNOWN IMPORTED)
+ set_target_properties(TensorRT::nvonnxparser PROPERTIES IMPORTED_LOCATION "${TensorRT_nvonnxparser_LIBRARY}")
+ endif()
if(NOT TARGET TensorRT::TensorRT)
add_library(TensorRT::TensorRT UNKNOWN IMPORTED)
- set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}")
- set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}")
+ set_target_properties(TensorRT::TensorRT PROPERTIES
+ IMPORTED_LOCATION "${TensorRT_nvinfer_LIBRARY}"
+ INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}"
+ INTERFACE_LINK_LIBRARIES TensorRT::nvonnxparser)
endif()
-endif()
\ No newline at end of file
+ set(TensorRT_LIBRARIES ${TensorRT_nvinfer_LIBRARY} ${TensorRT_nvonnxparser_LIBRARY})
+ set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR})
+endif()
+
+mark_as_advanced(TensorRT_INCLUDE_DIR TensorRT_nvinfer_LIBRARY TensorRT_nvonnxparser_LIBRARY)
diff --git a/cmake/build_config.h.in b/cmake/build_config.h.in
new file mode 100644
index 0000000..186db94
--- /dev/null
+++ b/cmake/build_config.h.in
@@ -0,0 +1,12 @@
+#pragma once
+
+// Generated by CMake (configure_file). Records the library version and the TensorRT
+// major/minor version the library was BUILT against, so public headers can version-gate
+// (e.g. the legacy calibrator) without including any TensorRT header.
+
+#define TRT_CPP_API_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
+#define TRT_CPP_API_VERSION_MINOR @PROJECT_VERSION_MINOR@
+#define TRT_CPP_API_VERSION_PATCH @PROJECT_VERSION_PATCH@
+
+#define TRT_CPP_API_TENSORRT_VERSION_MAJOR @TensorRT_VERSION_MAJOR@
+#define TRT_CPP_API_TENSORRT_VERSION_MINOR @TensorRT_VERSION_MINOR@
diff --git a/cmake/tensorrt_cpp_api-config.cmake.in b/cmake/tensorrt_cpp_api-config.cmake.in
new file mode 100644
index 0000000..a37061b
--- /dev/null
+++ b/cmake/tensorrt_cpp_api-config.cmake.in
@@ -0,0 +1,20 @@
+@PACKAGE_INIT@
+
+# Re-resolve transitive dependencies in the consumer's scope (the exported targets reference
+# the imported CUDA/TensorRT/OpenCV/spdlog targets, which must exist downstream).
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") # lets find_dependency(TensorRT) use the installed FindTensorRT.cmake
+include(CMakeFindDependencyMacro)
+
+find_dependency(CUDAToolkit)
+find_dependency(TensorRT) # pass -DTensorRT_DIR=<path> for a tarball install
+find_dependency(Threads)
+if(@TRT_CPP_API_WITH_OPENCV@) # value substituted when this file is configured
+ find_dependency(OpenCV)
+endif()
+if(@TRT_CPP_API_WITH_SPDLOG@) # value substituted when this file is configured
+ find_dependency(spdlog)
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/tensorrt_cpp_apiTargets.cmake")
+
+check_required_components(tensorrt_cpp_api)
diff --git a/docs/install.md b/docs/install.md
new file mode 100644
index 0000000..bbc25c9
--- /dev/null
+++ b/docs/install.md
@@ -0,0 +1,90 @@
+# Installation
+
+`tensorrt_cpp_api` is a Linux / CUDA 12 / TensorRT ≥ 10 / C++20 library. TensorRT and the CUDA
+toolkit are **system/externally provided** — they are not fetched by the build.
+
+## Prerequisites
+
+- **CMake ≥ 3.22** and a C++20 compiler (GCC ≥ 11 or Clang ≥ 14).
+- **CUDA Toolkit 12.x** (`nvcc` is needed for the preprocessing kernel; the engine core itself
+ only needs the CUDA runtime). CUDA 13 requires driver ≥ 580.
+- **TensorRT 10.0 – 11.x**, installed one of two ways:
+ - **apt** (NVIDIA CUDA network repo): `libnvinfer-dev` + `libnvonnxparsers-dev`. The bundled
+ `FindTensorRT` module searches the standard `/usr/include/x86_64-linux-gnu` +
+ `/usr/lib/x86_64-linux-gnu` layout automatically.
+ - **tarball**: download from NVIDIA and pass `-DTensorRT_DIR=/path/to/TensorRT-10.x` at configure.
+- Optional: **spdlog** (`-DTRT_CPP_API_WITH_SPDLOG=ON`), **OpenCV** core (`-DTRT_CPP_API_WITH_OPENCV=ON`),
+ **pybind11** + a Python 3.9–3.13 dev environment (for the bindings).
+
+## Build options
+
+| Option | Default | Effect |
+|---|---|---|
+| `TRT_CPP_API_BUILD_PREPROC` | `ON` | Build the fused preprocessing sublibrary (`::preproc`). |
+| `TRT_CPP_API_BUILD_TESTS` | `OFF` | Build the GoogleTest suite (`ctest`; GPU tests are labeled `gpu`). |
+| `TRT_CPP_API_BUILD_EXAMPLES` | `OFF` | Build the reference examples in-tree. |
+| `TRT_CPP_API_BUILD_PYTHON` | `OFF` | Build the `trtcpp` pybind11 extension. |
+| `TRT_CPP_API_WITH_OPENCV` | `OFF` | Build the optional OpenCV interop header/source. |
+| `TRT_CPP_API_WITH_SPDLOG` | `OFF` | Build the optional spdlog logger adapter. |
+| `CMAKE_CUDA_ARCHITECTURES` | `75;80;86;89;90` | Override for your target GPUs. |
+
+## Build & install (C++)
+
+```sh
+cmake -S . -B build \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DTRT_CPP_API_BUILD_PREPROC=ON
+ # add -DTensorRT_DIR=/opt/TensorRT-10.x for a tarball TensorRT
+ # add -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc if nvcc is not on PATH
+cmake --build build -j$(nproc)
+cmake --install build --prefix /opt/trtcpp
+```
+
+The install lays down `include/tensorrt_cpp_api/…`, the static libraries, and a CMake package
+(`lib/cmake/tensorrt_cpp_api/`) including the bundled `FindTensorRT` module.
+
+## Consume it downstream
+
+```cmake
+find_package(tensorrt_cpp_api REQUIRED) # set CMAKE_PREFIX_PATH=/opt/trtcpp (and TensorRT_DIR if tarball)
+add_executable(myapp main.cpp)
+target_link_libraries(myapp PRIVATE
+ tensorrt_cpp_api::tensorrt_cpp_api # core
+ tensorrt_cpp_api::preproc) # optional preprocessing
+```
+
+The package config re-resolves CUDA, TensorRT, Threads (and OpenCV/spdlog if the install was built
+with them), so the only thing a consumer must provide is the location of a system TensorRT.
+
+## Python bindings
+
+```sh
+pip install . # builds the trtcpp wheel via scikit-build-core
+# tarball TensorRT: pip install . --config-settings=cmake.define.TensorRT_DIR=/opt/TensorRT-10.x
+python -c "import trtcpp; print(trtcpp.version_string())"
+```
+
+For zero-copy GPU interop install a matching CuPy (`pip install cupy-cuda12x`); see
+[`examples/python`](../examples/python).
+
+## Verifying
+
+```sh
+cmake -S . -B build -DTRT_CPP_API_BUILD_TESTS=ON
+cmake --build build -j$(nproc)
+ctest --test-dir build -LE gpu # CPU-only tests (no GPU needed)
+ctest --test-dir build # full suite (needs an NVIDIA GPU)
+```
+
+## Troubleshooting
+
+- **`Could NOT find CUDA: ... required is exact version "12.0"`** when building examples/consumers.
+ This comes from a CUDA-enabled OpenCV whose CMake config pins an exact CUDA version that differs
+ from your toolkit. The library core does **not** use OpenCV; build with the default
+ `-DTRT_CPP_API_WITH_OPENCV=OFF`, and prefer the stb-based examples (no OpenCV).
+- **`Failed to detect a default CUDA architecture` / `nvcc` not found.** Pass
+ `-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc`, or put `nvcc` on `PATH`.
+- **`Could NOT find TensorRT`.** Install `libnvinfer-dev`/`libnvonnxparsers-dev`, or pass
+ `-DTensorRT_DIR=<path>`. TensorRT must be in the 10.0–11.x range.
+- **Stale-engine rebuilds.** Changing the ONNX, build options, driver/TensorRT version, or GPU
+ invalidates a cached engine on purpose; delete `engineCacheDir` to force a clean rebuild.
diff --git a/docs/quickstart.md b/docs/quickstart.md
new file mode 100644
index 0000000..1c69ed2
--- /dev/null
+++ b/docs/quickstart.md
@@ -0,0 +1,139 @@
+# Quickstart & core concepts
+
+This walks through the mental model and the common flows. For end-to-end programs see
+[`examples/`](../examples); for installation see [install.md](install.md).
+
+## The five types you'll use
+
+| Type | Role |
+|---|---|
+| `Status` / `Result<T>` | No-throw error handling. `Status` is ok-or-error; `Result<T>` is value-or-error. `if (!r) … r.status()`. |
+| `Shape` | A dynamic-aware shape (`-1` = dynamic). `Shape{1,3,640,640}`. |
+| `Tensor` | An **owning** device or pinned-host buffer (RAII, move-only). The library returns these for outputs. |
+| `TensorView` | A **non-owning** view over memory you (or another library) own. The zero-copy boundary type. |
+| `Stream` | A CUDA stream you own (or wrap). Every async call takes one; you control synchronization. |
+
+Everything lives in `namespace trtcpp` and is reachable via a single `#include` of the library's
+umbrella header (the optional `preproc.h` / `opencv_interop.h` are included separately).
+
+## Error handling
+
+Every fallible call returns `Status` or `Result<T>` — it never throws.
+
+```cpp
+auto engine = EngineBuilder{}.buildAndLoad("model.onnx", opt);
+if (!engine) {
+ std::fprintf(stderr, "%s\n", engine.status().message().c_str());
+ return 1;
+}
+engine->inputNames(); // operator-> / operator* to reach the value
+
+// Or propagate with the helper (the enclosing function must return Status or a Result):
+TRTCPP_TRY(auto eng, EngineBuilder{}.buildAndLoad("model.onnx", opt));
+```
+
+## Build (or load) an engine
+
+`EngineBuilder` turns an ONNX model into an optimized engine and caches it on disk. `buildAndLoad`
+is the one-call entry point: it builds if there's no current cache, otherwise reuses it, then
+deserializes into a ready `Engine`.
+
+```cpp
+BuildOptions opt;
+opt.precision = Precision::kFp16; // kFp32 / kFp16 / kInt8Qdq / kFp8 …
+opt.engineCacheDir = "engines"; // where the .engine + .json sidecar live
+auto engine = EngineBuilder{}.buildAndLoad("model.onnx", opt);
+```
+
+The cache key is the ONNX content hash + build options + TensorRT version + GPU UUID. Change any of
+them (new model, different precision, driver upgrade, different GPU) and the stale cache is detected
+and rebuilt — it is never silently reused. To build and deserialize separately, use
+`buildOrLoad` (returns a path) + `Engine::loadFromFile`.
+
+## Run inference
+
+Inputs and outputs are **name-keyed** maps of `TensorView`. There are three entry points:
+
+```cpp
+Stream stream; // owns a non-blocking stream; or Stream::wrap(yourCudaStreamHandle)
+
+// 1) Library allocates outputs for you:
+auto outputs = engine->infer({{"images", inputView}}, stream); // Result