diff --git a/core/runtime/BUILD b/core/runtime/BUILD index 7f594ecea7..bb9f779929 100644 --- a/core/runtime/BUILD +++ b/core/runtime/BUILD @@ -88,6 +88,7 @@ cc_library( "RTDevice.cpp", "TRTEngine.cpp", "TRTEngineProfiler.cpp", + "TRTRuntimeConfig.cpp", "execute_engine.cpp", "runtime.cpp", "runtime_utils.cpp", @@ -95,15 +96,37 @@ cc_library( hdrs = [ "Platform.h", "RTDevice.h", - "TensorRTBindingNames.h", "TRTEngine.h", "TRTEngineProfiler.h", + "TRTRuntimeConfig.h", + "TensorRTBindingNames.h", "runtime.h", ], copts = if_torch_nccl(["-DUSE_C10D_NCCL"]), + defines = select({ + # nvinfer1::IRuntimeConfig (and the matching ICudaEngine::createRuntimeConfig + # / createExecutionContext(IRuntimeConfig*) overloads) was introduced in + # TensorRT 10.11. The TensorRT shipped with the Jetpack l4t-r36.4 toolchain + # (@tensorrt_l4t) predates 10.11 and does not export this type. Every other + # configuration here (RTX, SBSA, Windows, default x86_64 Linux) is on a + # TensorRT >= 10.11 bundle, so it gets the macro. + # + # Gate every IRuntimeConfig-using site in core/runtime with + # `#ifdef TRT_HAS_IRUNTIME_CONFIG`; the Jetpack path falls back to the + # legacy createExecutionContext() no-arg overload. + ":jetpack": [], + "//conditions:default": ["TRT_HAS_IRUNTIME_CONFIG"], + }), linkopts = [ "-lstdc++fs", ], + local_defines = select({ + # TensorRT-RTX builds: opt into feature-gated APIs that the runtime layer + # depends on (e.g. IExecutionContext::isStreamCapturable). + ":rtx_win": ["ENABLE_FEATURE_DISABLE_RUNTIME_ALLOCATION"], + ":rtx_x86_64": ["ENABLE_FEATURE_DISABLE_RUNTIME_ALLOCATION"], + "//conditions:default": [], + }), deps = [ ":tensorrt_binding_names", "//core/plugins:torch_tensorrt_plugins", @@ -135,9 +158,9 @@ cc_library( hdrs = [ "Platform.h", "RTDevice.h", - "TensorRTBindingNames.h", "TRTEngine.h", "TRTEngineProfiler.h", + "TensorRTBindingNames.h", "runtime.h", ], deps = [ @@ -151,9 +174,10 @@ filegroup( srcs = [ "Platform.h", "RTDevice.h", - "TensorRTBindingNames.h", "TRTEngine.h", "TRTEngineProfiler.h", + "TRTRuntimeConfig.h", + "TensorRTBindingNames.h", "runtime.h", ], visibility = ["//visibility:public"], diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 2b97af750c..873c758854 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -23,6 +24,23 @@ namespace torch_tensorrt { namespace core { namespace runtime { +namespace { +// TensorRT marks unspecified dimensions in dynamic-shape engines with -1. +constexpr int32_t kDynamicDim = -1; + +// Returns true iff any of the listed input bindings (including shape tensors) has a +// dynamic dimension. +[[nodiscard]] bool engine_has_dynamic_inputs( + nvinfer1::ICudaEngine* cuda_engine, + std::vector const& in_binding_names) { + TORCHTRT_CHECK(cuda_engine != nullptr, "engine_has_dynamic_inputs requires a live ICudaEngine"); + return std::any_of(std::begin(in_binding_names), std::cend(in_binding_names), [cuda_engine](std::string const& name) { + auto const dims = cuda_engine->getTensorShape(name.c_str()); + return std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t d) { return d == kDynamicDim; }); + }); +} +} // namespace + std::string slugify(std::string s) { std::replace(s.begin(), s.end(), '.', '_'); return s; @@ -78,7 +96,8 @@ TRTEngine::TRTEngine( bool hardware_compatible, bool requires_output_allocator, const std::string& serialized_metadata, - const ResourceAllocationStrategy resource_allocation_strategy) + const ResourceAllocationStrategy resource_allocation_strategy, + TRTRuntimeConfig runtime_cfg) : TRTEngine( "deserialized_trt", serialized_engine, @@ -89,7 +108,8 @@ TRTEngine::TRTEngine( hardware_compatible, requires_output_allocator, serialized_metadata, - resource_allocation_strategy) {} + resource_allocation_strategy, + std::move(runtime_cfg)) {} TRTEngine::TRTEngine(std::vector serialized_info) : TRTEngine( @@ -104,7 +124,13 @@ TRTEngine::TRTEngine(std::vector serialized_info) serialized_info[SERIALIZED_METADATA_IDX], (static_cast(std::stoi(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX])) ? ResourceAllocationStrategy::kDynamic - : ResourceAllocationStrategy::kStatic)) { + : ResourceAllocationStrategy::kStatic), + make_runtime_config_from_serialized(serialized_info)) { + // Single visible marker that this engine was instantiated through the C++ runtime + // entry point (i.e. torch.classes.tensorrt.Engine), distinguishing it from the Python + // TRTEngine path. Tests look for this string in captured stderr to verify the + // expected backend was exercised. + LOG_INFO("[torch-TensorRT C++ runtime] TRTEngine constructed from serialized info"); this->requires_native_multidevice = std::stoi(serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX]); if (this->requires_native_multidevice) { LOG_INFO("Loaded distributed TRT engine (contains NCCL collectives); NCCL comm will be bound on first execution"); @@ -121,7 +147,9 @@ TRTEngine::TRTEngine( bool hardware_compatible, bool requires_output_allocator, const std::string& serialized_metadata, - const ResourceAllocationStrategy resource_allocation_strategy) { + const ResourceAllocationStrategy resource_allocation_strategy, + TRTRuntimeConfig runtime_cfg) { + this->runtime_cfg = std::move(runtime_cfg); TORCHTRT_CHECK( is_supported_on_current_platform(target_platform), "This engine was not built to run on this platform (built for: " << target_platform << ", current platform: " @@ -157,13 +185,7 @@ TRTEngine::TRTEngine( LOG_DEBUG( "Resource allocation strategy: " << (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static")); - if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) { - this->exec_ctx = - make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); - } else { - this->exec_ctx = make_trt(cuda_engine->createExecutionContext()); - } - TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context"); + recreate_execution_context(); // Pre-allocate placeholder for empty tensors (TensorRT requires non-null addresses) cudaMalloc(&empty_tensor_placeholder, 1); @@ -251,6 +273,8 @@ TRTEngine::TRTEngine( num_io = std::make_pair(inputs_size, outputs); } + runtime_cfg.has_dynamic_inputs = engine_has_dynamic_inputs(cuda_engine.get(), in_binding_names); + #ifndef NDEBUG this->enable_profiling(); #endif @@ -270,6 +294,9 @@ TRTEngine::TRTEngine( } TRTEngine::~TRTEngine() { + // Marked noexcept so safe to invoke from a destructor without + // explicit try/catch; any I/O error is logged internally. + runtime_cfg.save_runtime_cache(); trt_engine_profiler.reset(); exec_ctx.reset(); cuda_engine.reset(); @@ -283,8 +310,7 @@ void TRTEngine::disable_profiling() { torch::cuda::synchronize(device_info.id); profile_execution = false; trt_engine_profiler.reset(); - exec_ctx = make_trt(cuda_engine->createExecutionContext()); - TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context"); + recreate_execution_context(); } void TRTEngine::dump_engine_layer_info_to_file(const std::string& path) { @@ -381,10 +407,7 @@ bool TRTEngine::set_device_memory_budget(int64_t budget) { trt_engine_profiler.reset(); } bool result = cuda_engine->setWeightStreamingBudgetV2(budget); - exec_ctx = make_trt(cuda_engine->createExecutionContext()); - TORCHTRT_CHECK( - (exec_ctx.get() != nullptr), - "Unable to recreate TensorRT execution context after setting new device memory budget"); + recreate_execution_context(); if (profile_execution) { enable_profiling(); } @@ -441,6 +464,7 @@ std::string TRTEngine::to_str() const { ss << " Target Platform: " << target_platform << std::endl; ss << " Resource Allocation Strategy: " << (resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static") << std::endl; ss << " Multi-Device Engine: " << (requires_native_multidevice) << std::endl; + ss << runtime_cfg.to_str(); // clang-format on return ss.str(); } @@ -487,7 +511,11 @@ FlattenedState TRTEngine::__obj_flatten__() { std::tuple("requires_output_allocator", serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]), std::tuple("target_platform", serialized_info[TARGET_PLATFORM_IDX]), std::tuple("resource_allocation_strategy", serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]), - std::tuple("requires_native_multidevice", serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX])); + std::tuple("requires_native_multidevice", serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX]), + std::tuple("has_runtime_cfg", serialized_info[HAS_RUNTIME_CFG_IDX]), + std::tuple("runtime_cache_path", serialized_info[RUNTIME_CACHE_PATH_IDX]), + std::tuple("dynamic_shapes_kernel_strategy", serialized_info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX]), + std::tuple("cuda_graph_strategy", serialized_info[CUDA_GRAPH_STRATEGY_IDX])); } std::vector TRTEngine::serialize() { @@ -514,6 +542,16 @@ std::vector TRTEngine::serialize() { this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "1" : "0"; serialized_info[REQUIRES_NATIVE_MULTIDEVICE_IDX] = this->requires_native_multidevice ? "1" : "0"; // rank/world_size are runtime facts (may differ at load time); not serialized. +#ifdef TRT_MAJOR_RTX + serialized_info[HAS_RUNTIME_CFG_IDX] = "1"; +#else + serialized_info[HAS_RUNTIME_CFG_IDX] = "0"; +#endif + serialized_info[RUNTIME_CACHE_PATH_IDX] = runtime_cfg.runtime_cache_path; + serialized_info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX] = std::to_string( + static_cast>(runtime_cfg.dynamic_shapes_kernel_strategy)); + serialized_info[CUDA_GRAPH_STRATEGY_IDX] = + std::to_string(static_cast>(runtime_cfg.cuda_graph_strategy)); return serialized_info; } @@ -525,14 +563,11 @@ void TRTEngine::reset_captured_graph() { void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationStrategy new_strategy) { if (new_strategy != this->resource_allocation_strategy) { this->resource_allocation_strategy = new_strategy; - if (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) { - LOG_DEBUG("Setting resource allocation strategy to dynamic"); - this->exec_ctx = - make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); - } else { - LOG_DEBUG("Setting resource allocation strategy to static"); - this->exec_ctx = make_trt(cuda_engine->createExecutionContext()); - } + LOG_DEBUG( + "Setting resource allocation strategy to " + << (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic ? "dynamic" + : "static")); + recreate_execution_context(); } } @@ -629,19 +664,40 @@ void TRTEngine::release_nccl_comm() { LOG_INFO("Releasing NCCL communicator from engine '" << this->name << "'"); torch::cuda::synchronize(device_info.id); this->exec_ctx.reset(); - if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) { - this->exec_ctx = - make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); - } else { - this->exec_ctx = make_trt(cuda_engine->createExecutionContext()); - } - TORCHTRT_CHECK( - (exec_ctx.get() != nullptr), "Unable to recreate TensorRT execution context after releasing NCCL comm"); + recreate_execution_context(); this->nccl_initialized = false; LOG_INFO("NCCL communicator released from engine '" << this->name << "'"); } #endif // ENABLE_TRT_NCCL_COLLECTIVES +bool TRTEngine::is_monolithic_capturable(cudaStream_t stream) const { + return runtime_cfg.is_monolithic_capturable(exec_ctx.get(), stream); +} + +void TRTEngine::disable_rtx_native_cudagraphs() { + bool was_disabled = runtime_cfg.rtx_native_cudagraphs_disabled; + runtime_cfg.disable_rtx_native_cudagraphs(name); + if (!was_disabled && runtime_cfg.rtx_native_cudagraphs_disabled) { + // The CUDA graph strategy on the IRuntimeConfig has been flipped; rebuild exec_ctx + // so the new strategy takes effect for subsequent enqueueV3 calls. + recreate_execution_context(); + } +} + +void TRTEngine::recreate_execution_context() { + // Flush any kernels the previous execution context may have compiled into the + // runtime cache before creating the replacement. The destructor also saves, but + // doing it here guards against losing compiled kernels across profiling toggles, + // allocator changes, or process kills that happen between allocator changes and + // teardown. No-op on standard TensorRT or when no cache path is configured. + runtime_cfg.save_runtime_cache(); + const auto allocation_strategy = resource_allocation_strategy == ResourceAllocationStrategy::kDynamic + ? nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED + : nvinfer1::ExecutionContextAllocationStrategy::kSTATIC; + exec_ctx = runtime_cfg.create_execution_context(cuda_engine.get(), allocation_strategy); + TORCHTRT_CHECK(exec_ctx.get() != nullptr, "Unable to (re)create TensorRT execution context"); +} + } // namespace runtime } // namespace core } // namespace torch_tensorrt diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h index c6d06dfb40..47917e9c37 100644 --- a/core/runtime/TRTEngine.h +++ b/core/runtime/TRTEngine.h @@ -15,6 +15,7 @@ #include "torch/custom_class.h" #include "core/runtime/TRTEngineProfiler.h" +#include "core/runtime/TRTRuntimeConfig.h" #include "core/runtime/TensorRTBindingNames.h" #include "core/util/prelude.h" @@ -47,7 +48,12 @@ using FlattenedState = std::tuple< std::tuple, // serialized metadata std::tuple, // Platform std::tuple, // Resource Allocation Strategy - std::tuple>; // requires_native_multidevice + std::tuple, // requires_native_multidevice + std::tuple, // has_runtime_cfg (gates next three) + std::tuple, // Runtime Cache Path (TRT-RTX) + std::tuple, // Dynamic Shapes Kernel Strategy (TRT-RTX) + std::tuple // CUDA Graph Strategy (TRT-RTX) + >; struct TorchTRTRuntimeStates { // Indicates whether CUDAGraphs were enabled in the previous execute_engine @@ -151,7 +157,8 @@ struct TRTEngine : torch::CustomClassHolder { bool requires_output_allocator = false, const std::string& serialized_metadata = "", const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy = - TRTEngine::ResourceAllocationStrategy::kStatic); + TRTEngine::ResourceAllocationStrategy::kStatic, + TRTRuntimeConfig runtime_cfg = TRTRuntimeConfig{}); TRTEngine(std::vector serialized_info); @@ -166,7 +173,8 @@ struct TRTEngine : torch::CustomClassHolder { bool requires_output_allocator = false, const std::string& serialized_metadata = "", const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy = - TRTEngine::ResourceAllocationStrategy::kStatic); + TRTEngine::ResourceAllocationStrategy::kStatic, + TRTRuntimeConfig runtime_cfg = TRTRuntimeConfig{}); std::string to_str() const; static void verify_serialization_fmt(const std::vector& serialized_info); @@ -273,6 +281,23 @@ struct TRTEngine : torch::CustomClassHolder { ResourceAllocationStrategy resource_allocation_strategy = kStatic; void set_resource_allocation_strategy(ResourceAllocationStrategy new_strategy); ResourceAllocationStrategy get_resource_allocation_strategy(); + + // Owns the IRuntimeConfig (where supported) and TRT-RTX runtime state. On older TRT + // without IRuntimeConfig (e.g. Jetpack) this just carries strategy values that get + // passed to the legacy createExecutionContext overload. + TRTRuntimeConfig runtime_cfg; + + // Monolithic-capturability check used when this engine is wrapped by an outer whole-graph + // capture (e.g. CudaGraphsTorchTensorRTModule). Non-RTX builds always return true. + bool is_monolithic_capturable(cudaStream_t stream) const; + + // Disable TensorRT-RTX native CUDA graph capture on this engine (one-shot, invoked when + // an outer stream capture is detected around execute_engine). No-op on non-RTX. + void disable_rtx_native_cudagraphs(); + + private: + // Single entry point that (re)creates exec_ctx via runtime_cfg.create_execution_context. + void recreate_execution_context(); }; } // namespace runtime diff --git a/core/runtime/TRTRuntimeConfig.cpp b/core/runtime/TRTRuntimeConfig.cpp new file mode 100644 index 0000000000..6f64a95cbd --- /dev/null +++ b/core/runtime/TRTRuntimeConfig.cpp @@ -0,0 +1,257 @@ +#include "core/runtime/TRTRuntimeConfig.h" + +#include +#include +#include +#include +#include + +#include "core/runtime/runtime.h" +#include "core/util/prelude.h" + +namespace torch_tensorrt { +namespace core { +namespace runtime { + +// File-local helpers. Kept out of the header because they are only used by this +// translation unit -- TRTEngine now consumes a TRTRuntimeConfig directly and does not +// need the enum conversion helpers. +namespace { + +[[nodiscard]] std::string to_string(DynamicShapesKernelStrategy s) { + switch (s) { + case DynamicShapesKernelStrategy::kLazy: + return "lazy"; + case DynamicShapesKernelStrategy::kEager: + return "eager"; + case DynamicShapesKernelStrategy::kNone: + return "none"; + } + TORCHTRT_CHECK( + false, + "Unexpected DynamicShapesKernelStrategy value: " + << static_cast>(s)); +} + +[[nodiscard]] std::string to_string(CudaGraphStrategyOption s) { + switch (s) { + case CudaGraphStrategyOption::kDisabled: + return "disabled"; + case CudaGraphStrategyOption::kWholeGraphCapture: + return "whole_graph_capture"; + } + TORCHTRT_CHECK( + false, + "Unexpected CudaGraphStrategyOption value: " << static_cast>(s)); +} + +[[nodiscard]] DynamicShapesKernelStrategy to_dynamic_shapes_kernel_strategy( + std::underlying_type_t v) { + TORCHTRT_CHECK( + v >= 0 && v <= 2, + "Invalid dynamic shapes kernel strategy value: " << v << ". Expected 0 (lazy), 1 (eager), or 2 (none)."); + return static_cast(v); +} + +[[nodiscard]] CudaGraphStrategyOption to_cuda_graph_strategy_option(std::underlying_type_t v) { + TORCHTRT_CHECK( + v >= 0 && v <= 1, + "Invalid CUDA graph strategy value: " << v << ". Expected 0 (disabled) or 1 (whole_graph_capture)."); + return static_cast(v); +} + +#ifdef TRT_MAJOR_RTX +// Raw cache I/O helpers. Exception-propagating; the caller wraps in try/catch at the +// TRTRuntimeConfig member level. Kept file-local because the IRuntimeCache type is +// itself TensorRT-RTX-only and tests reach this path through the member wrappers. +void load_runtime_cache(const std::string& path, nvinfer1::IRuntimeCache* cache) { + TORCHTRT_CHECK(cache != nullptr, "load_runtime_cache requires a non-null IRuntimeCache"); + if (!std::filesystem::exists(path)) { + LOG_DEBUG("No existing runtime cache at " << path); + return; + } + std::ifstream f(path, std::ios::binary); + std::vector buf((std::istreambuf_iterator(f)), std::istreambuf_iterator()); + if (buf.empty()) { + return; + } + TORCHTRT_CHECK(cache->deserialize(buf.data(), buf.size()), "IRuntimeCache::deserialize returned false for " << path); + LOG_INFO("Loaded runtime cache from " << path << " (" << buf.size() << " bytes)"); +} + +void save_runtime_cache_impl(const std::string& path, nvinfer1::IRuntimeCache* cache) { + TORCHTRT_CHECK(cache != nullptr, "save_runtime_cache requires a non-null IRuntimeCache"); + auto host_mem = make_trt(cache->serialize()); + if (!host_mem || host_mem->size() == 0) { + return; + } + std::filesystem::path fs_path(path); + if (fs_path.has_parent_path()) { + std::filesystem::create_directories(fs_path.parent_path()); + } + std::filesystem::path tmp_path = fs_path; + tmp_path += ".tmp"; + { + std::ofstream out(tmp_path, std::ios::binary); + out.write(reinterpret_cast(host_mem->data()), host_mem->size()); + } + std::filesystem::rename(tmp_path, fs_path); + LOG_INFO("Saved runtime cache to " << path << " (" << host_mem->size() << " bytes)"); +} +#endif // TRT_MAJOR_RTX + +} // namespace + +void TRTRuntimeConfig::ensure_initialized(TORCHTRT_UNUSED nvinfer1::ICudaEngine* cuda_engine) { +#ifdef TRT_HAS_IRUNTIME_CONFIG + if (config) { + return; + } + TORCHTRT_CHECK(cuda_engine != nullptr, "Cannot initialize TRTRuntimeConfig without a live ICudaEngine"); + config = make_trt(cuda_engine->createRuntimeConfig()); + TORCHTRT_CHECK(config.get() != nullptr, "Unable to create TensorRT IRuntimeConfig"); + +#ifdef TRT_MAJOR_RTX + // Runtime cache -- TRT-RTX only. + if (!runtime_cache_path.empty()) { + runtime_cache = make_trt(config->createRuntimeCache()); + if (runtime_cache.get() == nullptr) { + LOG_WARNING("Failed to create TensorRT IRuntimeCache; runtime cache will be skipped."); + } else { + try { + load_runtime_cache(runtime_cache_path, runtime_cache.get()); + } catch (const std::exception& e) { + LOG_WARNING("Failed to load runtime cache from " << runtime_cache_path << ": " << e.what()); + } + if (config->setRuntimeCache(*runtime_cache)) { + LOG_DEBUG("TensorRT-RTX runtime cache configured at " << runtime_cache_path); + } else { + LOG_WARNING("Failed to attach runtime cache to IRuntimeConfig; cache will be unused."); + runtime_cache.reset(); + } + } + } else { + LOG_DEBUG("Runtime cache disabled (no path configured)."); + } + + // Dynamic shapes kernel specialization strategy -- TRT-RTX only. + config->setDynamicShapesKernelSpecializationStrategy( + static_cast(dynamic_shapes_kernel_strategy)); + LOG_DEBUG("Dynamic shapes kernel specialization strategy set to " << to_string(dynamic_shapes_kernel_strategy)); + + // CUDA graph strategy -- TRT-RTX only. + if (!config->setCudaGraphStrategy( + cuda_graph_strategy == CudaGraphStrategyOption::kWholeGraphCapture + ? nvinfer1::CudaGraphStrategy::kWHOLE_GRAPH_CAPTURE + : nvinfer1::CudaGraphStrategy::kDISABLED)) { + LOG_WARNING("Failed to set CUDA graph strategy; continuing with default."); + } +#endif +#endif // TRT_HAS_IRUNTIME_CONFIG +} + +std::shared_ptr TRTRuntimeConfig::create_execution_context( + nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::ExecutionContextAllocationStrategy allocation_strategy) { + ensure_initialized(cuda_engine); +#ifdef TRT_HAS_IRUNTIME_CONFIG + config->setExecutionContextAllocationStrategy(allocation_strategy); + return make_trt(cuda_engine->createExecutionContext(config.get())); +#else + // Pre-10.11 TRT (e.g. Jetpack): use the legacy strategy overload directly. + return make_trt(cuda_engine->createExecutionContext(allocation_strategy)); +#endif +} + +bool TRTRuntimeConfig::uses_internal_capture(TORCHTRT_UNUSED bool cudagraphs_enabled) const { +#ifdef TRT_MAJOR_RTX + // On TRT-RTX the internal runtime handles capture/replay whenever a non-disabled + // strategy is set, or when subgraph cudagraphs are enabled globally. In both cases the + // caller should skip its manual at::cuda::CUDAGraph wrapper because TRT-RTX's internal + // capture would collide with it. + return cuda_graph_strategy != CudaGraphStrategyOption::kDisabled || cudagraphs_enabled; +#else + return false; +#endif +} + +void TRTRuntimeConfig::disable_rtx_native_cudagraphs(TORCHTRT_UNUSED const std::string& engine_name) noexcept { +#ifdef TRT_MAJOR_RTX + if (rtx_native_cudagraphs_disabled || cuda_graph_strategy == CudaGraphStrategyOption::kDisabled) { + return; + } + LOG_WARNING( + "Outer CUDA stream capture detected; disabling TensorRT-RTX native CUDA graph strategy on engine " + << engine_name << " for the remainder of its lifetime."); + // Persist any kernels the engine-internal capture has compiled so far; the outer + // capture will run without them otherwise, and we want future reloads to reuse them. + save_runtime_cache(); + cuda_graph_strategy = CudaGraphStrategyOption::kDisabled; + if (config && !config->setCudaGraphStrategy(nvinfer1::CudaGraphStrategy::kDISABLED)) { + LOG_WARNING("Failed to update CUDA graph strategy on IRuntimeConfig after disable."); + } + rtx_native_cudagraphs_disabled = true; +#endif +} + +bool TRTRuntimeConfig::is_monolithic_capturable( + TORCHTRT_UNUSED nvinfer1::IExecutionContext* exec_ctx, + TORCHTRT_UNUSED cudaStream_t stream) const { +#ifdef TRT_MAJOR_RTX + TORCHTRT_ASSERT(exec_ctx != nullptr, "is_monolithic_capturable requires a live IExecutionContext"); + if (!exec_ctx->isStreamCapturable(stream)) { + return false; + } + // "lazy" kernel specialization only swaps specialized kernels mid-run when an input + // has a dynamic dimension; for static-shape engines the kernels are fixed at setup and + // the captured graph stays valid. Mirrors the Python `_is_monolithic_capturable` check. + return !(dynamic_shapes_kernel_strategy == DynamicShapesKernelStrategy::kLazy && has_dynamic_inputs); +#else + return true; +#endif +} + +void TRTRuntimeConfig::save_runtime_cache() noexcept { +#ifdef TRT_MAJOR_RTX + if (!runtime_cache || runtime_cache_path.empty()) { + return; + } + try { + save_runtime_cache_impl(runtime_cache_path, runtime_cache.get()); + } catch (const std::exception& e) { + LOG_WARNING("Failed to save runtime cache to " << runtime_cache_path << ": " << e.what()); + } catch (...) { + LOG_WARNING("Failed to save runtime cache (unknown exception)."); + } +#endif +} + +std::string TRTRuntimeConfig::to_str() const { + std::ostringstream os; + os << "Runtime Cache Path: " << (runtime_cache_path.empty() ? "" : runtime_cache_path) << std::endl; + os << "Dynamic Shapes Kernel Strategy: " << to_string(dynamic_shapes_kernel_strategy) << std::endl; + os << "CUDA Graph Strategy: " << to_string(cuda_graph_strategy) << std::endl; + return os.str(); +} + +TRTRuntimeConfig make_runtime_config_from_serialized(const std::vector& info) { + TRTRuntimeConfig cfg; + if (info[HAS_RUNTIME_CFG_IDX] == "1") { + cfg.runtime_cache_path = info[RUNTIME_CACHE_PATH_IDX]; + cfg.dynamic_shapes_kernel_strategy = + to_dynamic_shapes_kernel_strategy(std::stoi(info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX])); + cfg.cuda_graph_strategy = to_cuda_graph_strategy_option(std::stoi(info[CUDA_GRAPH_STRATEGY_IDX])); + } + return cfg; +} + +std::ostream& operator<<(std::ostream& os, const TRTRuntimeConfig& cfg) { + os << "Runtime cfg {" << std::endl; + os << cfg.to_str(); + os << "}" << std::endl; + return os; +} + +} // namespace runtime +} // namespace core +} // namespace torch_tensorrt diff --git a/core/runtime/TRTRuntimeConfig.h b/core/runtime/TRTRuntimeConfig.h new file mode 100644 index 0000000000..6e7b8bc6ab --- /dev/null +++ b/core/runtime/TRTRuntimeConfig.h @@ -0,0 +1,101 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" + +namespace torch_tensorrt { +namespace core { +namespace runtime { + +// TensorRT-RTX-only configuration for how shape-specialized kernels are compiled. +enum class DynamicShapesKernelStrategy : int32_t { + kLazy = 0, + kEager = 1, + kNone = 2, +}; + +// TensorRT-RTX-only configuration for how CUDA graph capture/replay is handled. +enum class CudaGraphStrategyOption : int32_t { + kDisabled = 0, + kWholeGraphCapture = 1, +}; + +// Encapsulates the IRuntimeConfig and TRT-RTX runtime state for a TRTEngine. +// IRuntimeConfig and runtime-cache `#ifdef`s are confined to this TU; serialization- +// index plumbing keeps its own RTX gates elsewhere. +struct TRTRuntimeConfig { + // Settings - typically populated from engine deserialization before `ensure_initialized`. + std::string runtime_cache_path = ""; + DynamicShapesKernelStrategy dynamic_shapes_kernel_strategy = DynamicShapesKernelStrategy::kLazy; + CudaGraphStrategyOption cuda_graph_strategy = CudaGraphStrategyOption::kDisabled; + + // One-shot: set to true once an outer stream capture has been detected and the + // engine-internal CUDA graph strategy has been disabled for the remainder of the + // owning engine's lifetime. + bool rtx_native_cudagraphs_disabled = false; + + bool has_dynamic_inputs = true; + + // Live resources. The IRuntimeConfig is lazy-constructed on first `ensure_initialized` + // and is unavailable on TensorRT versions older than 10.11 (e.g. Jetpack). +#ifdef TRT_HAS_IRUNTIME_CONFIG + std::shared_ptr config; +#endif +#ifdef TRT_MAJOR_RTX + std::shared_ptr runtime_cache; +#endif + + // Lazily construct the IRuntimeConfig and apply RTX-specific settings. Idempotent. + // No-op on builds without IRuntimeConfig (e.g. Jetpack). + void ensure_initialized(nvinfer1::ICudaEngine* cuda_engine); + + // Lazy-initialize the IRuntimeConfig if needed and create an IExecutionContext that + // honors `allocation_strategy`. Selects the right `createExecutionContext` overload + // (IRuntimeConfig* vs ExecutionContextAllocationStrategy) so callers stay free of + // any TRT_HAS_IRUNTIME_CONFIG branching. + [[nodiscard]] std::shared_ptr create_execution_context( + nvinfer1::ICudaEngine* cuda_engine, + nvinfer1::ExecutionContextAllocationStrategy allocation_strategy); + + // Returns true if the TensorRT-RTX runtime owns capture/replay for this engine so the + // caller should bypass its own at::cuda::CUDAGraph capture around enqueueV3. Always + // false on non-RTX builds. + [[nodiscard]] bool uses_internal_capture(bool cudagraphs_enabled) const; + + // One-shot: disable engine-internal CUDA graph capture. Invoked when an outer stream + // capture is detected around execute_engine, so the outer capture can contain the + // kernel launches directly. Saves the runtime cache before recreating the context so + // compiled kernels from the present run are preserved for future reloads. + void disable_rtx_native_cudagraphs(const std::string& engine_name) noexcept; + + // Whether the execution context is safe to include in an outer monolithic capture. + // Non-RTX builds always return true. + [[nodiscard]] bool is_monolithic_capturable(nvinfer1::IExecutionContext* exec_ctx, cudaStream_t stream) const; + + // Save the runtime cache to disk. Signature is `noexcept` so this is safe from a + // destructor. The underlying file I/O is performed by free functions declared below + // (non-noexcept, exception-leaky for easier testing); this member wraps them and + // swallows any exceptions. + void save_runtime_cache() noexcept; + + // Returns a human-readable summary of the runtime config. + [[nodiscard]] std::string to_str() const; +}; + +// Construct a TRTRuntimeConfig from a flattened serialization vector. Reads the +// RTX-only indices only on RTX builds; standard TRT builds return a default-initialized +// struct. +[[nodiscard]] TRTRuntimeConfig make_runtime_config_from_serialized(const std::vector& info); + +std::ostream& operator<<(std::ostream& os, const TRTRuntimeConfig& cfg); + +} // namespace runtime +} // namespace core +} // namespace torch_tensorrt diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 6a070db3cf..80936951ef 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -241,6 +241,23 @@ std::vector execute_engine(std::vector inputs, c10::intr auto run_standard_execution = [&]() { bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS); + // effective_cudagraphs controls the manual at::cuda::CUDAGraph path below. On TRT-RTX + // builds the engine-internal runtime owns capture/replay inside enqueueV3 whenever the + // engine has a cuda_graph_strategy set or subgraph cudagraphs are enabled; the struct + // reports that via `uses_internal_capture` so the caller skips its manual wrapper. If + // an outer stream capture is already in progress (e.g. the caller wraps this module in + // CudaGraphsTorchTensorRTModule for whole-graph capture), engine-internal capture would + // collide, so we disable it one-shot here. + bool effective_cudagraphs = cudagraphs_enabled; + if (compiled_engine->runtime_cfg.uses_internal_capture(cudagraphs_enabled)) { + effective_cudagraphs = false; + cudaStreamCaptureStatus capture_status; + cudaStreamIsCapturing(compiled_engine->engine_stream.stream(), &capture_status); + if (capture_status != cudaStreamCaptureStatusNone) { + compiled_engine->disable_rtx_native_cudagraphs(); + } + } + bool shape_changed = _validate_shapes(inputs, compiled_engine); auto current_device_id = inputs.size() > 0 ? inputs[0].device().index() : at::cuda::current_device(); @@ -262,7 +279,7 @@ std::vector execute_engine(std::vector inputs, c10::intr // Whether cudagraphs needs to record the graph on this pass auto result = compiled_engine->runtime_states.set_runtime_states( - cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed); + effective_cudagraphs, compiled_engine->use_pre_allocated_outputs, shape_changed); bool need_cudagraphs_record = std::get<0>(result); bool can_use_pre_allocated_outputs = std::get<1>(result); @@ -282,7 +299,7 @@ std::vector execute_engine(std::vector inputs, c10::intr std::make_unique(compiled_engine->input_profile_path); } - setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record); + setup_input_tensors(inputs, compiled_engine, effective_cudagraphs, need_cudagraphs_record); // Check if input shapes can be inferred. int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()}; std::vector names(io_size); @@ -314,7 +331,7 @@ std::vector execute_engine(std::vector inputs, c10::intr compiled_engine->cudagraph_output_staging_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone()); } - if (cudagraphs_enabled) { + if (effective_cudagraphs) { TORCHTRT_CHECK( compiled_engine->exec_ctx->setTensorAddress( name.c_str(), compiled_engine->cudagraph_output_staging_buffers[pyt_idx].data_ptr()), @@ -346,8 +363,10 @@ std::vector execute_engine(std::vector inputs, c10::intr caller_exec_complete.block(compiled_engine->engine_stream); } - if (!cudagraphs_enabled) { - // Direct execution uses the caller buffers directly + if (!effective_cudagraphs) { + // Direct execution uses the caller buffers directly. On TRT-RTX with a + // cuda_graph_strategy set, the engine captures/replays internally during + // this enqueueV3 call. compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream); } else { if (need_cudagraphs_record) { @@ -384,7 +403,7 @@ std::vector execute_engine(std::vector inputs, c10::intr trt_exec_complete.block(compiled_engine->caller_stream); } - if (cudagraphs_enabled) { + if (effective_cudagraphs) { // If in CUDAGraph mode, copy persistent staging outputs to returned tensors on the caller stream. for (size_t o = 0; o < compiled_engine->cudagraph_output_staging_buffers.size(); o++) { outputs[o].copy_(compiled_engine->cudagraph_output_staging_buffers[o], false); diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index 7eae8bfb91..44d1b314ca 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -147,6 +147,10 @@ TORCH_LIBRARY(tensorrt, m) { return false; #endif }); + m.def("HAS_RUNTIME_CFG_IDX", []() -> int64_t { return HAS_RUNTIME_CFG_IDX; }); + m.def("RUNTIME_CACHE_PATH_IDX", []() -> int64_t { return RUNTIME_CACHE_PATH_IDX; }); + m.def("DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX", []() -> int64_t { return DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX; }); + m.def("CUDA_GRAPH_STRATEGY_IDX", []() -> int64_t { return CUDA_GRAPH_STRATEGY_IDX; }); m.def("_platform_linux_x86_64", []() -> std::string { auto it = get_platform_name_map().find(Platform::PlatformEnum::kLINUX_X86_64); return it->second; diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h index a87bd2ca2a..2cbe73d6da 100644 --- a/core/runtime/runtime.h +++ b/core/runtime/runtime.h @@ -41,6 +41,11 @@ typedef enum { REQUIRES_OUTPUT_ALLOCATOR_IDX, RESOURCE_ALLOCATION_STRATEGY_IDX, REQUIRES_NATIVE_MULTIDEVICE_IDX, + // HAS_RUNTIME_CFG_IDX gates the next three slots. When "0", their values are ignored. + HAS_RUNTIME_CFG_IDX, + RUNTIME_CACHE_PATH_IDX, + DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX, + CUDA_GRAPH_STRATEGY_IDX, SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO } SerializedInfoIndex; @@ -57,6 +62,10 @@ inline constexpr std::array kSerializedInfoIndex "REQUIRES_OUTPUT_ALLOCATOR_IDX", "RESOURCE_ALLOCATION_STRATEGY_IDX", "REQUIRES_NATIVE_MULTIDEVICE_IDX", + "HAS_RUNTIME_CFG_IDX", + "RUNTIME_CACHE_PATH_IDX", + "DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX", + "CUDA_GRAPH_STRATEGY_IDX", }}; // For adding new serialized info indices, update above and update /dynamo/runtime/_serialized_engine_layout.py diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 0386c97ea3..77253f3deb 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -14,8 +14,11 @@ from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ( ABI_TARGET_IDX, ABI_VERSION, + CUDA_GRAPH_STRATEGY_IDX, DEVICE_IDX, + DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX, ENGINE_IDX, + HAS_RUNTIME_CFG_IDX, HW_COMPATIBLE_IDX, INPUT_BINDING_NAMES_IDX, NAME_IDX, @@ -23,6 +26,7 @@ REQUIRES_NATIVE_MULTIDEVICE_IDX, REQUIRES_OUTPUT_ALLOCATOR_IDX, RESOURCE_ALLOCATION_STRATEGY_IDX, + RUNTIME_CACHE_PATH_IDX, SERIALIZATION_LEN, SERIALIZED_METADATA_IDX, TARGET_PLATFORM_IDX, @@ -40,6 +44,16 @@ List[str], ] +_DYNAMIC_SHAPES_KERNEL_STRATEGY_MAP: Dict[str, int] = { + "lazy": 0, + "eager": 1, + "none": 2, +} +_CUDA_GRAPH_STRATEGY_MAP: Dict[str, int] = { + "disabled": 0, + "whole_graph_capture": 1, +} + class TorchTensorRTModule(torch.nn.Module): # type: ignore[misc] """``nn.Module`` that runs a TensorRT engine inside PyTorch. @@ -132,6 +146,25 @@ def __init__( self.execute_engine_op: Any = None self.requires_output_allocator = requires_output_allocator self.dynamically_allocate_resources = settings.dynamically_allocate_resources + self.runtime_cache_path = settings.runtime_cache_path + self.dynamic_shapes_kernel_specialization_strategy = ( + settings.dynamic_shapes_kernel_specialization_strategy + ) + if ( + self.dynamic_shapes_kernel_specialization_strategy + not in _DYNAMIC_SHAPES_KERNEL_STRATEGY_MAP + ): + raise ValueError( + f"Invalid dynamic_shapes_kernel_specialization_strategy " + f"{self.dynamic_shapes_kernel_specialization_strategy!r}; expected one of " + f"{list(_DYNAMIC_SHAPES_KERNEL_STRATEGY_MAP.keys())}" + ) + self.cuda_graph_strategy = settings.cuda_graph_strategy + if self.cuda_graph_strategy not in _CUDA_GRAPH_STRATEGY_MAP: + raise ValueError( + f"Invalid cuda_graph_strategy {self.cuda_graph_strategy!r}; expected one of " + f"{list(_CUDA_GRAPH_STRATEGY_MAP.keys())}" + ) self.symbolic_shape_expressions = symbolic_shape_expressions self.requires_native_multidevice = requires_native_multidevice self.target_platform = ( @@ -229,6 +262,16 @@ def _pack_engine_info(self) -> List[str | bytes]: int(self.requires_native_multidevice) ) # rank/world_size are runtime facts; queried from ProcessGroup at execution time + engine_info[HAS_RUNTIME_CFG_IDX] = "1" if ENABLED_FEATURES.tensorrt_rtx else "0" + engine_info[RUNTIME_CACHE_PATH_IDX] = self.runtime_cache_path or "" + engine_info[DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX] = str( + _DYNAMIC_SHAPES_KERNEL_STRATEGY_MAP[ + self.dynamic_shapes_kernel_specialization_strategy + ] + ) + engine_info[CUDA_GRAPH_STRATEGY_IDX] = str( + _CUDA_GRAPH_STRATEGY_MAP[self.cuda_graph_strategy] + ) return engine_info @@ -332,8 +375,9 @@ def decode_metadata(encoded_metadata: bytes) -> Any: def get_extra_state(self) -> SerializedTorchTensorRTModuleFmt: if self.engine: engine_info = self._pack_engine_info() - assert isinstance(engine_info[ENGINE_IDX], (bytes, bytearray)) - engine_info[ENGINE_IDX] = base64.b64encode(engine_info[ENGINE_IDX]) + engine_bytes = engine_info[ENGINE_IDX] + assert isinstance(engine_bytes, (bytes, bytearray)) + engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes) return ( self.name, engine_info, @@ -342,8 +386,9 @@ def get_extra_state(self) -> SerializedTorchTensorRTModuleFmt: ) elif self.serialized_engine: engine_info = self._pack_engine_info() - assert isinstance(engine_info[ENGINE_IDX], bytes) - engine_info[ENGINE_IDX] = base64.b64encode(engine_info[ENGINE_IDX]) + engine_bytes = engine_info[ENGINE_IDX] + assert isinstance(engine_bytes, bytes) + engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes) return ( self.name, engine_info, diff --git a/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py b/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py index d4f31ba8a8..c0bc6653b9 100644 --- a/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py +++ b/py/torch_tensorrt/dynamo/runtime/_serialized_engine_layout.py @@ -37,6 +37,11 @@ class SerializedInfoIndex(IntEnum): REQUIRES_OUTPUT_ALLOCATOR_IDX = 9 RESOURCE_ALLOCATION_STRATEGY_IDX = 10 REQUIRES_NATIVE_MULTIDEVICE_IDX = 11 + # HAS_RUNTIME_CFG_IDX gates the next three slots. When "0", their values are ignored. + HAS_RUNTIME_CFG_IDX = 12 + RUNTIME_CACHE_PATH_IDX = 13 + DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX = 14 + CUDA_GRAPH_STRATEGY_IDX = 15 # Module-level aliases for backward compatibility and concise access @@ -52,6 +57,12 @@ class SerializedInfoIndex(IntEnum): REQUIRES_OUTPUT_ALLOCATOR_IDX = SerializedInfoIndex.REQUIRES_OUTPUT_ALLOCATOR_IDX RESOURCE_ALLOCATION_STRATEGY_IDX = SerializedInfoIndex.RESOURCE_ALLOCATION_STRATEGY_IDX REQUIRES_NATIVE_MULTIDEVICE_IDX = SerializedInfoIndex.REQUIRES_NATIVE_MULTIDEVICE_IDX +HAS_RUNTIME_CFG_IDX = SerializedInfoIndex.HAS_RUNTIME_CFG_IDX +RUNTIME_CACHE_PATH_IDX = SerializedInfoIndex.RUNTIME_CACHE_PATH_IDX +DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX = ( + SerializedInfoIndex.DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX +) +CUDA_GRAPH_STRATEGY_IDX = SerializedInfoIndex.CUDA_GRAPH_STRATEGY_IDX SERIALIZATION_LEN = len(SerializedInfoIndex) SERIALIZED_ENGINE_BINDING_DELIM = "%" @@ -73,6 +84,10 @@ class SerializedInfoIndex(IntEnum): ("REQUIRES_OUTPUT_ALLOCATOR_IDX", "REQUIRES_OUTPUT_ALLOCATOR_IDX", int), ("RESOURCE_ALLOCATION_STRATEGY_IDX", "RESOURCE_ALLOCATION_STRATEGY_IDX", int), ("REQUIRES_NATIVE_MULTIDEVICE_IDX", "REQUIRES_NATIVE_MULTIDEVICE_IDX", int), + ("HAS_RUNTIME_CFG_IDX", "HAS_RUNTIME_CFG_IDX", int), + ("RUNTIME_CACHE_PATH_IDX", "RUNTIME_CACHE_PATH_IDX", int), + ("DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX", "DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX", int), + ("CUDA_GRAPH_STRATEGY_IDX", "CUDA_GRAPH_STRATEGY_IDX", int), ("SERIALIZATION_LEN", "SERIALIZATION_LEN", int), ("SERIALIZED_ENGINE_BINDING_DELIM", "SERIALIZED_ENGINE_BINDING_DELIM", str), ("SERIALIZED_RT_DEVICE_DELIM", "SERIALIZED_RT_DEVICE_DELIM", str), diff --git a/tests/py/dynamo/runtime/test_000_runtime_cache.py b/tests/py/dynamo/runtime/test_000_runtime_cache.py index 05637a6146..2e9855b9a4 100644 --- a/tests/py/dynamo/runtime/test_000_runtime_cache.py +++ b/tests/py/dynamo/runtime/test_000_runtime_cache.py @@ -1,5 +1,4 @@ import gc -import logging import os import shutil import tempfile @@ -7,10 +6,11 @@ import torch import torch_tensorrt as torchtrt +from parameterized import parameterized from torch.testing._internal.common_utils import TestCase, run_tests from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt.dynamo._defaults import RUNTIME_CACHE_PATH, TIMING_CACHE_PATH -from torch_tensorrt.dynamo._settings import CompilationSettings +from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity class SimpleModel(torch.nn.Module): @@ -18,39 +18,58 @@ def forward(self, x): return torch.relu(x) + 1.0 -class TwoLayerModel(torch.nn.Module): +class ConvModel(torch.nn.Module): def __init__(self): super().__init__() - self.linear = torch.nn.Linear(8, 8) + self.conv = torch.nn.Conv2d(3, 8, 3, padding=1) def forward(self, x): - return torch.relu(self.linear(x)) + return torch.relu(self.conv(x)) -def _compile_simple(runtime_cache_path=None): - """Helper: compile SimpleModel with Python runtime, return (compiled_module, inputs).""" - model = SimpleModel().eval().cuda() - inputs = [torch.randn(2, 3).cuda()] +def _fresh_conv_model_and_inputs(seed=0): + """Deterministic ConvModel + input pair for end-to-end cache tests on either runtime.""" + torch.manual_seed(seed) + return ConvModel().eval().cuda(), [torch.randn(2, 3, 16, 16).cuda()] + + +def _compile(model, inputs, *, use_python_runtime, runtime_cache_path=None): + """Compile ``model`` through either runtime. Returns the compiled module.""" kwargs = { "ir": "dynamo", "inputs": inputs, - "use_python_runtime": True, + "use_python_runtime": use_python_runtime, "min_block_size": 1, } if runtime_cache_path is not None: kwargs["runtime_cache_path"] = runtime_cache_path compiled = torchtrt.compile(model, **kwargs) torch._dynamo.reset() - return compiled, inputs + return compiled + + +def _compile_simple(runtime_cache_path=None): + """Compile SimpleModel on the Python runtime (used by introspection setup tests).""" + model = SimpleModel().eval().cuda() + inputs = [torch.randn(2, 3).cuda()] + return ( + _compile( + model, + inputs, + use_python_runtime=True, + runtime_cache_path=runtime_cache_path, + ), + inputs, + ) def _find_python_trt_engine(compiled): - """Walk the compiled graph module and return the Python ``TRTEngine`` instance. + """Return the Python ``TRTEngine`` instance from a compiled module, if any. The C++ and Python runtimes are now both driven through ``TorchTensorRTModule`` - (``use_python_runtime`` selects which backend is constructed). For tests that - target the Python runtime specifically we look for the wrapping module and - return its ``.engine`` attribute when it's a Python ``TRTEngine``. + (``use_python_runtime`` selects which backend is constructed). Tests that target + Python-runtime introspection use this helper; C++-runtime tests rely on + externally observable behavior (cache file on disk, inference correctness). """ from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import TorchTensorRTModule from torch_tensorrt.dynamo.runtime._TRTEngine import TRTEngine @@ -61,6 +80,16 @@ def _find_python_trt_engine(compiled): return None +# Parameterize end-to-end cache persistence tests over both runtime paths. The C++ +# variant is skipped inside the test body when the C++ runtime is not available. +_RUNTIMES = [("python", True), ("cpp", False)] + + +def _skip_if_cpp_unavailable(testcase, use_python_runtime): + if not use_python_runtime and not ENABLED_FEATURES.torch_tensorrt_runtime: + testcase.skipTest("C++ runtime is not available") + + @unittest.skipIf( not ENABLED_FEATURES.tensorrt_rtx, "Runtime cache is only available with TensorRT-RTX", @@ -108,7 +137,7 @@ def test_runtime_cache_path_custom(self): "Runtime cache is only available with TensorRT-RTX", ) class TestRuntimeCachePersistence(TestCase): - """Tests that runtime cache is correctly saved to and loaded from disk.""" + """Load-on-setup / save-on-destructor contract, exercised on both runtimes.""" def setUp(self): self.cache_dir = tempfile.mkdtemp() @@ -117,9 +146,16 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.cache_dir, ignore_errors=True) - def test_cache_saved_on_del(self): - compiled, inputs = _compile_simple(runtime_cache_path=self.cache_path) - # Run inference to populate the cache + @parameterized.expand(_RUNTIMES) + def test_cache_saved_on_del(self, _name, use_python_runtime): + _skip_if_cpp_unavailable(self, use_python_runtime) + model, inputs = _fresh_conv_model_and_inputs() + compiled = _compile( + model, + inputs, + use_python_runtime=use_python_runtime, + runtime_cache_path=self.cache_path, + ) _ = compiled(*[inp.clone() for inp in inputs]) self.assertFalse( os.path.isfile(self.cache_path), @@ -132,8 +168,16 @@ def test_cache_saved_on_del(self): "Cache file should be created after module cleanup", ) - def test_cache_file_nonempty(self): - compiled, inputs = _compile_simple(runtime_cache_path=self.cache_path) + @parameterized.expand(_RUNTIMES) + def test_cache_file_nonempty(self, _name, use_python_runtime): + _skip_if_cpp_unavailable(self, use_python_runtime) + model, inputs = _fresh_conv_model_and_inputs() + compiled = _compile( + model, + inputs, + use_python_runtime=use_python_runtime, + runtime_cache_path=self.cache_path, + ) _ = compiled(*[inp.clone() for inp in inputs]) del compiled gc.collect() @@ -143,30 +187,54 @@ def test_cache_file_nonempty(self): "Cache file should have nonzero size", ) - def test_cache_roundtrip(self): - """Compile, infer, save. Then compile again with same cache path and verify correctness.""" - model = SimpleModel().eval().cuda() - inputs = [torch.randn(2, 3).cuda()] - ref_output = model(*inputs) - - # First compilation — populates and saves cache - compiled1, _ = _compile_simple(runtime_cache_path=self.cache_path) - _ = compiled1(*[inp.clone() for inp in inputs]) + @parameterized.expand(_RUNTIMES) + def test_cache_roundtrip(self, _name, use_python_runtime): + """Populate + save, then recompile and confirm correctness against eager output.""" + _skip_if_cpp_unavailable(self, use_python_runtime) + model, inputs = _fresh_conv_model_and_inputs() + with torch.no_grad(): + ref_output = model(*inputs) + + compiled1 = _compile( + model, + inputs, + use_python_runtime=use_python_runtime, + runtime_cache_path=self.cache_path, + ) + out1 = compiled1(*[inp.clone() for inp in inputs]) + self.assertGreater( + cosine_similarity(ref_output, out1), + COSINE_THRESHOLD, + "First compiled output should match eager", + ) del compiled1 gc.collect() self.assertTrue(os.path.isfile(self.cache_path)) - # Second compilation — should load cached data - compiled2, _ = _compile_simple(runtime_cache_path=self.cache_path) - output = compiled2(*[inp.clone() for inp in inputs]) - max_diff = float(torch.max(torch.abs(ref_output - output))) - self.assertAlmostEqual( - max_diff, 0, places=3, msg="Output mismatch after cache roundtrip" + compiled2 = _compile( + model, + inputs, + use_python_runtime=use_python_runtime, + runtime_cache_path=self.cache_path, + ) + out2 = compiled2(*[inp.clone() for inp in inputs]) + self.assertGreater( + cosine_similarity(ref_output, out2), + COSINE_THRESHOLD, + "Second compiled output (warm cache) should still match eager", ) - def test_save_creates_directory(self): + @parameterized.expand(_RUNTIMES) + def test_save_creates_directory(self, _name, use_python_runtime): + _skip_if_cpp_unavailable(self, use_python_runtime) nested_path = os.path.join(self.cache_dir, "a", "b", "c", "runtime_cache.bin") - compiled, inputs = _compile_simple(runtime_cache_path=nested_path) + model, inputs = _fresh_conv_model_and_inputs() + compiled = _compile( + model, + inputs, + use_python_runtime=use_python_runtime, + runtime_cache_path=nested_path, + ) _ = compiled(*[inp.clone() for inp in inputs]) del compiled gc.collect() @@ -292,5 +360,39 @@ def test_timing_cache_still_created(self): ) +@unittest.skipIf( + not ENABLED_FEATURES.torch_tensorrt_runtime, + "C++ runtime is not available", +) +class TestSerializationIndices(TestCase): + """The HAS_RUNTIME_CFG flag + TRTRuntimeConfig slots are present on both backends.""" + + def test_indices_match_python_layout(self): + from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ( + CUDA_GRAPH_STRATEGY_IDX, + DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX, + HAS_RUNTIME_CFG_IDX, + RUNTIME_CACHE_PATH_IDX, + SERIALIZATION_LEN, + ) + + self.assertEqual(int(torch.ops.tensorrt.SERIALIZATION_LEN()), SERIALIZATION_LEN) + self.assertEqual( + int(torch.ops.tensorrt.HAS_RUNTIME_CFG_IDX()), int(HAS_RUNTIME_CFG_IDX) + ) + self.assertEqual( + int(torch.ops.tensorrt.RUNTIME_CACHE_PATH_IDX()), + int(RUNTIME_CACHE_PATH_IDX), + ) + self.assertEqual( + int(torch.ops.tensorrt.DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX()), + int(DYNAMIC_SHAPES_KERNEL_STRATEGY_IDX), + ) + self.assertEqual( + int(torch.ops.tensorrt.CUDA_GRAPH_STRATEGY_IDX()), + int(CUDA_GRAPH_STRATEGY_IDX), + ) + + if __name__ == "__main__": run_tests() diff --git a/tests/py/dynamo/runtime/test_001_cuda_graph_strategy.py b/tests/py/dynamo/runtime/test_001_cuda_graph_strategy.py index 8b534be362..29d38d60b1 100644 --- a/tests/py/dynamo/runtime/test_001_cuda_graph_strategy.py +++ b/tests/py/dynamo/runtime/test_001_cuda_graph_strategy.py @@ -2,10 +2,44 @@ import torch import torch_tensorrt as torchtrt +from parameterized import parameterized from torch.testing._internal.common_utils import TestCase, run_tests from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt.dynamo._settings import CompilationSettings +_RUNTIMES = [("python", True), ("cpp", False)] + + +def _skip_if_cpp_unavailable(testcase, use_python_runtime): + if not use_python_runtime and not ENABLED_FEATURES.torch_tensorrt_runtime: + testcase.skipTest("C++ runtime is not available") + + +class CudaGraphConvModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 8, 3, padding=1) + + def forward(self, x): + return torch.relu(self.conv(x)) + + +def _compile_conv(strategy, *, use_python_runtime): + """Compile CudaGraphConvModel through the selected runtime with the given strategy.""" + model = CudaGraphConvModel().eval().cuda() + inputs = [torch.randn(2, 3, 16, 16).cuda()] + compiled = torchtrt.compile( + model, + ir="dynamo", + inputs=inputs, + enabled_precisions={torch.float32}, + use_python_runtime=use_python_runtime, + min_block_size=1, + cuda_graph_strategy=strategy, + ) + torch._dynamo.reset() + return compiled, inputs + class SimpleModel(torch.nn.Module): def forward(self, x): @@ -353,5 +387,80 @@ def test_setting_ignored_on_non_rtx(self): self.assertEqual(output.shape, (2, 3)) +_STRATEGY_RUNTIME_MATRIX = [ + (strategy, runtime_name, use_python_runtime) + for strategy in ("disabled", "whole_graph_capture") + for (runtime_name, use_python_runtime) in _RUNTIMES +] + + +@unittest.skipIf( + not ENABLED_FEATURES.tensorrt_rtx, + "CUDA graph strategy is a TensorRT-RTX feature", +) +class TestCudaGraphStrategyInference(TestCase): + """End-to-end: compile + infer with each strategy on both runtime paths.""" + + def tearDown(self): + torchtrt.runtime.set_cudagraphs_mode(False) + + @parameterized.expand(_STRATEGY_RUNTIME_MATRIX) + def test_strategy_inference(self, strategy, _runtime_name, use_python_runtime): + _skip_if_cpp_unavailable(self, use_python_runtime) + compiled, inputs = _compile_conv( + strategy, use_python_runtime=use_python_runtime + ) + y = compiled(*[inp.clone() for inp in inputs]) + self.assertEqual(tuple(y.shape), (2, 8, 16, 16)) + self.assertTrue(torch.isfinite(y).all().item()) + + @parameterized.expand(_RUNTIMES) + def test_whole_graph_capture_with_subgraph_cudagraphs( + self, _name, use_python_runtime + ): + """Subgraph cudagraph mode + RTX strategy: RTX-native should take over without errors.""" + _skip_if_cpp_unavailable(self, use_python_runtime) + compiled, inputs = _compile_conv( + "whole_graph_capture", use_python_runtime=use_python_runtime + ) + torchtrt.runtime.set_cudagraphs_mode(True) + y = compiled(*[inp.clone() for inp in inputs]) + self.assertEqual(tuple(y.shape), (2, 8, 16, 16)) + self.assertTrue(torch.isfinite(y).all().item()) + + @parameterized.expand(_RUNTIMES) + def test_repeated_inference(self, _name, use_python_runtime): + """Repeated inference exercises the RTX-native capture/replay path.""" + _skip_if_cpp_unavailable(self, use_python_runtime) + compiled, inputs = _compile_conv( + "whole_graph_capture", use_python_runtime=use_python_runtime + ) + ref = compiled(*[inp.clone() for inp in inputs]) + for _ in range(4): + out = compiled(*[inp.clone() for inp in inputs]) + self.assertEqual(out.shape, ref.shape) + self.assertTrue(torch.isfinite(out).all().item()) + + +class TestCudaGraphStrategyInvalidValue(TestCase): + """Invalid strategy names are rejected at TorchTensorRTModule.__init__ on any backend.""" + + @parameterized.expand(_RUNTIMES) + def test_invalid_strategy_raises(self, _name, use_python_runtime): + _skip_if_cpp_unavailable(self, use_python_runtime) + model = CudaGraphConvModel().eval().cuda() + inputs = [torch.randn(2, 3, 16, 16).cuda()] + with self.assertRaises((ValueError, RuntimeError)): + torchtrt.compile( + model, + ir="dynamo", + inputs=inputs, + enabled_precisions={torch.float32}, + use_python_runtime=use_python_runtime, + min_block_size=1, + cuda_graph_strategy="not_a_real_strategy", + ) + + if __name__ == "__main__": run_tests() diff --git a/tests/py/dynamo/runtime/test_001_dynamic_shapes_kernel_strategy.py b/tests/py/dynamo/runtime/test_001_dynamic_shapes_kernel_strategy.py index 11998f9b7a..408fb159fc 100644 --- a/tests/py/dynamo/runtime/test_001_dynamic_shapes_kernel_strategy.py +++ b/tests/py/dynamo/runtime/test_001_dynamic_shapes_kernel_strategy.py @@ -2,16 +2,59 @@ import torch import torch_tensorrt as torchtrt +from parameterized import parameterized from torch.testing._internal.common_utils import TestCase, run_tests from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt.dynamo._settings import CompilationSettings +_STRATEGIES = [("lazy",), ("eager",), ("none",)] + class SimpleModel(torch.nn.Module): def forward(self, x): return torch.relu(x) + 1.0 +class DynamicConvModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(3, 16, 3, padding=1) + self.conv2 = torch.nn.Conv2d(16, 8, 3, padding=1) + + def forward(self, x): + return torch.relu(self.conv2(torch.relu(self.conv1(x)))) + + +_RUNTIMES = [("python", True), ("cpp", False)] + + +def _skip_if_cpp_unavailable(testcase, use_python_runtime): + if not use_python_runtime and not ENABLED_FEATURES.torch_tensorrt_runtime: + testcase.skipTest("C++ runtime is not available") + + +def _compile_dynamic_conv(strategy, *, use_python_runtime): + """Compile DynamicConvModel through the selected runtime with the given strategy.""" + model = DynamicConvModel().eval().cuda() + inp = torchtrt.Input( + min_shape=(1, 3, 16, 16), + opt_shape=(2, 3, 16, 16), + max_shape=(4, 3, 16, 16), + dtype=torch.float32, + ) + compiled = torchtrt.compile( + model, + ir="dynamo", + inputs=[inp], + enabled_precisions={torch.float32}, + use_python_runtime=use_python_runtime, + min_block_size=1, + dynamic_shapes_kernel_specialization_strategy=strategy, + ) + torch._dynamo.reset() + return compiled + + def _compile_simple(**extra_kwargs): """Helper: compile SimpleModel with dynamic shapes and Python runtime.""" model = SimpleModel().eval().cuda() @@ -147,5 +190,66 @@ def test_setting_ignored_on_non_rtx(self): self.assertEqual(output.shape, (2, 3)) +_STRATEGY_RUNTIME_MATRIX = [ + (strategy, runtime_name, use_python_runtime) + for (strategy,) in _STRATEGIES + for (runtime_name, use_python_runtime) in _RUNTIMES +] + + +@unittest.skipIf( + not ENABLED_FEATURES.tensorrt_rtx, + "Dynamic shapes kernel strategy is a TensorRT-RTX feature", +) +class TestDynamicShapesKernelStrategyInference(TestCase): + """End-to-end: compile + infer with each strategy on both runtime paths.""" + + @parameterized.expand(_STRATEGY_RUNTIME_MATRIX) + def test_strategy_inference(self, strategy, _runtime_name, use_python_runtime): + _skip_if_cpp_unavailable(self, use_python_runtime) + compiled = _compile_dynamic_conv( + strategy, use_python_runtime=use_python_runtime + ) + x = torch.randn(2, 3, 16, 16, device="cuda") + y = compiled(x) + self.assertEqual(tuple(y.shape), (2, 8, 16, 16)) + self.assertTrue(torch.isfinite(y).all().item()) + + @parameterized.expand(_RUNTIMES) + def test_dynamic_shape_with_eager(self, _name, use_python_runtime): + """Exercise shape changes under eager kernel specialization.""" + _skip_if_cpp_unavailable(self, use_python_runtime) + compiled = _compile_dynamic_conv("eager", use_python_runtime=use_python_runtime) + for batch in (1, 2, 3, 4): + x = torch.randn(batch, 3, 16, 16, device="cuda") + y = compiled(x) + self.assertEqual(tuple(y.shape), (batch, 8, 16, 16)) + + +class TestDynamicShapesKernelStrategyInvalidValue(TestCase): + """Invalid strategy names are rejected at TorchTensorRTModule.__init__ on any backend.""" + + @parameterized.expand(_RUNTIMES) + def test_invalid_strategy_raises(self, _name, use_python_runtime): + _skip_if_cpp_unavailable(self, use_python_runtime) + model = DynamicConvModel().eval().cuda() + inp = torchtrt.Input( + min_shape=(1, 3, 16, 16), + opt_shape=(2, 3, 16, 16), + max_shape=(4, 3, 16, 16), + dtype=torch.float32, + ) + with self.assertRaises((ValueError, RuntimeError)): + torchtrt.compile( + model, + ir="dynamo", + inputs=[inp], + enabled_precisions={torch.float32}, + use_python_runtime=use_python_runtime, + min_block_size=1, + dynamic_shapes_kernel_specialization_strategy="not_a_real_strategy", + ) + + if __name__ == "__main__": run_tests()