pytorch · tp5uiuc · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 27, 2026
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -91,6 +91,7 @@ def cross_compile_for_windows(
     timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
     runtime_cache_path: str = _defaults.RUNTIME_CACHE_PATH,
     dynamic_shapes_kernel_specialization_strategy: str = _defaults.DYNAMIC_SHAPES_KERNEL_SPECIALIZATION_STRATEGY,
+    cuda_graph_strategy: str = _defaults.CUDA_GRAPH_STRATEGY,
     lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT,
     cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES,
     reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES,
@@ -171,6 +172,7 @@ def cross_compile_for_windows(
         timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation. Not used for TensorRT-RTX.
         runtime_cache_path (str): Path to the runtime cache for TensorRT-RTX JIT compilation results. Not used for standard TensorRT.
         dynamic_shapes_kernel_specialization_strategy (str): Strategy for dynamic shape kernel specialization at runtime (TensorRT-RTX only). Options: "lazy", "eager", "none". Default: "lazy".
+        cuda_graph_strategy (str): Strategy for CUDA graph capture/replay (TensorRT-RTX only). Options: "disabled" (manual capture), "whole_graph_capture" (TRT-RTX handles internally). Default: "disabled".
         lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime.
         cache_built_engines (bool): Whether to save the compiled TRT engines to storage
         reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
@@ -318,6 +320,7 @@ def cross_compile_for_windows(
         "timing_cache_path": timing_cache_path,
         "runtime_cache_path": runtime_cache_path,
         "dynamic_shapes_kernel_specialization_strategy": dynamic_shapes_kernel_specialization_strategy,
+        "cuda_graph_strategy": cuda_graph_strategy,
         "lazy_engine_init": lazy_engine_init,
         "cache_built_engines": cache_built_engines,
         "reuse_cached_engines": reuse_cached_engines,
@@ -432,6 +435,7 @@ def compile(
     timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
     runtime_cache_path: str = _defaults.RUNTIME_CACHE_PATH,
     dynamic_shapes_kernel_specialization_strategy: str = _defaults.DYNAMIC_SHAPES_KERNEL_SPECIALIZATION_STRATEGY,
+    cuda_graph_strategy: str = _defaults.CUDA_GRAPH_STRATEGY,
     lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT,
     cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES,
     reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES,
@@ -527,6 +531,7 @@ def compile(
         timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation. Not used for TensorRT-RTX.
         runtime_cache_path (str): Path to the runtime cache for TensorRT-RTX JIT compilation results. Not used for standard TensorRT.
         dynamic_shapes_kernel_specialization_strategy (str): Strategy for dynamic shape kernel specialization at runtime (TensorRT-RTX only). Options: "lazy", "eager", "none". Default: "lazy".
+        cuda_graph_strategy (str): Strategy for CUDA graph capture/replay (TensorRT-RTX only). Options: "disabled" (manual capture), "whole_graph_capture" (TRT-RTX handles internally). Default: "disabled".
         lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime.
         cache_built_engines (bool): Whether to save the compiled TRT engines to storage
         reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
@@ -706,6 +711,7 @@ def compile(
         "timing_cache_path": timing_cache_path,
         "runtime_cache_path": runtime_cache_path,
         "dynamic_shapes_kernel_specialization_strategy": dynamic_shapes_kernel_specialization_strategy,
+        "cuda_graph_strategy": cuda_graph_strategy,
         "lazy_engine_init": lazy_engine_init,
         "cache_built_engines": cache_built_engines,
         "reuse_cached_engines": reuse_cached_engines,
@@ -1226,6 +1232,7 @@ def convert_exported_program_to_serialized_trt_engine(
     timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
     runtime_cache_path: str = _defaults.RUNTIME_CACHE_PATH,
     dynamic_shapes_kernel_specialization_strategy: str = _defaults.DYNAMIC_SHAPES_KERNEL_SPECIALIZATION_STRATEGY,
+    cuda_graph_strategy: str = _defaults.CUDA_GRAPH_STRATEGY,
     lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT,
     cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES,
     reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES,
@@ -1302,6 +1309,7 @@ def convert_exported_program_to_serialized_trt_engine(
         timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation. Not used for TensorRT-RTX.
         runtime_cache_path (str): Path to the runtime cache for TensorRT-RTX JIT compilation results. Not used for standard TensorRT.
         dynamic_shapes_kernel_specialization_strategy (str): Strategy for dynamic shape kernel specialization at runtime (TensorRT-RTX only). Options: "lazy", "eager", "none". Default: "lazy".
+        cuda_graph_strategy (str): Strategy for CUDA graph capture/replay (TensorRT-RTX only). Options: "disabled" (manual capture), "whole_graph_capture" (TRT-RTX handles internally). Default: "disabled".
         lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime.
         cache_built_engines (bool): Whether to save the compiled TRT engines to storage
         reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
@@ -1458,6 +1466,7 @@ def convert_exported_program_to_serialized_trt_engine(
         "timing_cache_path": timing_cache_path,
         "runtime_cache_path": runtime_cache_path,
         "dynamic_shapes_kernel_specialization_strategy": dynamic_shapes_kernel_specialization_strategy,
+        "cuda_graph_strategy": cuda_graph_strategy,
         "lazy_engine_init": lazy_engine_init,
         "cache_built_engines": cache_built_engines,
         "reuse_cached_engines": reuse_cached_engines,

diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
@@ -70,6 +70,7 @@
 DECOMPOSE_ATTENTION = False
 ATTN_BIAS_IS_CAUSAL = True
 DYNAMIC_SHAPES_KERNEL_SPECIALIZATION_STRATEGY = "lazy"
+CUDA_GRAPH_STRATEGY = "disabled"
 USE_PYTHON_RUNTIME = False
 
 if platform.system() == "Linux":

diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
@@ -17,6 +17,7 @@
     AUTOCAST_MAX_OUTPUT_THRESHOLD,
     CACHE_BUILT_ENGINES,
     CPU_MEMORY_BUDGET,
+    CUDA_GRAPH_STRATEGY,
     DECOMPOSE_ATTENTION,
     DISABLE_TF32,
     DLA_GLOBAL_DRAM_SIZE,
@@ -95,6 +96,7 @@ class CompilationSettings:
         timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation. Not used for TensorRT-RTX (no autotuning).
         runtime_cache_path (str): Path to the runtime cache for TensorRT-RTX JIT compilation results. The cache is loaded on engine setup and saved on module cleanup. Uses file locking for concurrent access safety. Not used for standard TensorRT.
         dynamic_shapes_kernel_specialization_strategy (str): Strategy for compiling shape-specialized kernels at runtime for dynamic shapes (TensorRT-RTX only). Options: "lazy" (compile in background, use fallback until ready), "eager" (compile immediately, blocking), "none" (always use fallback kernels). Default: "lazy".
+        cuda_graph_strategy (str): Strategy for CUDA graph capture/replay (TensorRT-RTX only). Options: "disabled" (no native CUDA graphs, uses manual capture if cudagraphs mode is enabled), "whole_graph_capture" (TRT-RTX handles CUDA graph capture internally). When set to "whole_graph_capture", the manual torch CUDA graph capture/replay in forward() is bypassed. Default: "disabled".
         cache_built_engines (bool): Whether to save the compiled TRT engines to storage
         reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
         use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32.
@@ -150,6 +152,7 @@ class CompilationSettings:
     dynamic_shapes_kernel_specialization_strategy: str = (
         DYNAMIC_SHAPES_KERNEL_SPECIALIZATION_STRATEGY
     )
+    cuda_graph_strategy: str = CUDA_GRAPH_STRATEGY
     lazy_engine_init: bool = LAZY_ENGINE_INIT
     cache_built_engines: bool = CACHE_BUILT_ENGINES
     reuse_cached_engines: bool = REUSE_CACHED_ENGINES

diff --git a/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py
@@ -127,6 +127,59 @@ def __del__(self) -> None:
     def set_use_output_allocator(self, enable: bool) -> None:
         self.use_output_allocator_outputs = enable
 
+    def _check_monolithic_capturability(self, stream: torch.cuda.Stream) -> None:
+        """Verify every TRT submodule is safe for monolithic stream capture.
+
+        Whole-graph CUDA graph mode wraps mixed TRT + PyTorch ops in a
+        single outer ``torch.cuda.CUDAGraph`` capture. On TRT-RTX, each
+        engine must opt out of RTX-native CUDA graphs (which would
+        interfere with the outer capture) and must pass the
+        ``IExecutionContext.is_stream_capturable`` check. Raises
+        ``RuntimeError`` if any TRT engine is not monolithically
+        capturable. No-op on non-RTX builds.
+        """
+        from torch_tensorrt._features import ENABLED_FEATURES
+
+        if not ENABLED_FEATURES.tensorrt_rtx:
+            return
+        from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import (
+            TorchTensorRTModule,
+        )
+        from torch_tensorrt.dynamo.runtime._TRTEngine import (
+            TRTEngine,
+            _get_cuda_graph_strategy,
+        )
+
+        for name, mod in self.compiled_module.named_modules():
+            if not (
+                isinstance(mod, TorchTensorRTModule)
+                and isinstance(mod.engine, TRTEngine)
+            ):
+                continue
+            engine = mod.engine
+            if not engine._is_monolithic_capturable(stream):
+                raise RuntimeError(
+                    f"CUDA graph capture failed: TRT submodule '{name}' is "
+                    "not monolithically capturable (lazy kernel "
+                    "specialization or non-capturable stream). Whole-graph "
+                    "CUDA graph mode with mixed TRT + PyTorch ops requires "
+                    "all TRT engines to be capturable. Consider using "
+                    "cuda_graph_strategy='whole_graph_capture' with "
+                    "set_cudagraphs_mode(True) instead of enable_cudagraphs()."
+                )
+            # Disable RTX-native CUDA graphs on this engine so they don't
+            # interfere with the outer monolithic capture.
+            if engine._rtx_native_cudagraphs:
+                engine.runtime_config.cuda_graph_strategy = _get_cuda_graph_strategy(
+                    "disabled"
+                )
+                engine.context = engine._create_execution_context()
+                engine._rtx_native_cudagraphs = False
+                logger.info(
+                    f"Disabled RTX-native CUDA graphs for '{name}' "
+                    "(using outer monolithic capture instead)"
+                )
+
     def forward(
         self, *args: Any, **kwargs: Any
     ) -> torch.Tensor | Tuple[torch.Tensor, ...]:
@@ -212,6 +265,7 @@ def forward(
 
             with torch.cuda.stream(self._engine_stream):
                 if need_cudagraphs_record:
+                    self._check_monolithic_capturability(self._engine_stream)
                     self.cudagraph = torch.cuda.CUDAGraph()
                     with torch.cuda.graph(self.cudagraph, stream=self._engine_stream):
                         self._output_buffers = self.compiled_module(*args, **kwargs)