Skip to content

Commit 694755f

Browse files
committed
Enabled op-agnostic serialization for both runtimes
1 parent 5b1bde7 commit 694755f

4 files changed

Lines changed: 62 additions & 45 deletions

File tree

py/torch_tensorrt/_compile.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ def convert_method_to_trt_engine(
545545
module, torchtrt_arg_inputs, kwarg_inputs=torchtrt_kwarg_inputs, **kwargs
546546
)
547547

548-
return dynamo_convert_exported_program_to_serialized_trt_engine(
548+
return dynamo_convert_exported_program_to_serialized_trt_engine( # type: ignore[no-any-return]
549549
exp_program,
550550
arg_inputs=tuple(arg_inputs),
551551
kwarg_inputs=torchtrt_kwarg_inputs,
@@ -594,35 +594,40 @@ def load(
594594
Raises:
595595
ValueError: If there is no file or the file is not either a TorchScript file or ExportedProgram file
596596
"""
597+
from torch_tensorrt.dynamo._exporter import replace_execute_engine_no_op_node
597598

598599
try:
599-
logger.debug(f"Loading the provided file {file_path} using torch.jit.load()")
600-
ts_module = function_overload_with_kwargs(
600+
logger.debug(f"Loading the provided file {file_path} using torch.export.load()")
601+
exp_program = function_overload_with_kwargs(
601602
torch.export.load,
602603
file_path,
603604
extra_files=extra_files,
604605
**kwargs,
605606
)
606-
return ts_module
607-
except Exception:
607+
gm = exp_program.graph_module
608+
if any(
609+
"no_op_placeholder_for_execute_engine" in n.name for n in gm.graph.nodes
610+
):
611+
return replace_execute_engine_no_op_node(exp_program)
612+
return exp_program
613+
except Exception as e:
608614
logger.info(
609-
f"Loading the provided file {file_path} via torch.export.load() failed with the following error",
615+
f"Loading the provided file {file_path} via torch.export.load() failed with the following error: {e}",
610616
exc_info=True,
611617
)
612-
pass
613618

614619
try:
615-
logger.debug(f"Loading the provided file {file_path} using torch.export.load()")
616-
exp_program = function_overload_with_kwargs(
620+
logger.debug(f"Loading the provided file {file_path} using torch.jit.load()")
621+
ts_module = function_overload_with_kwargs(
617622
torch.jit.load,
618623
file_path,
619624
_extra_files=extra_files,
620625
**kwargs,
621626
)
622-
return exp_program
623-
except Exception:
627+
return ts_module
628+
except Exception as e:
624629
logger.info(
625-
f"Loading the provided file {file_path} via torch.jit.load() (after failing to load with torch.export.load()) failed with the following error",
630+
f"Loading the provided file {file_path} via torch.jit.load() (after failing to load with torch.export.load()) failed with the following error: {e}",
626631
exc_info=True,
627632
)
628633
raise ValueError(
@@ -805,8 +810,8 @@ def _all_are_input_objects(obj: Any) -> bool:
805810
f"Inferred dynamic_shapes from torch_tensorrt.Input objects with min/opt/max specifications: {dynamic_shapes}"
806811
)
807812

808-
arg_tensors = tuple(get_torch_inputs(arg_inputs, default_device())) # type: ignore
809-
kwarg_tensors = get_torch_inputs(kwarg_inputs, default_device()) # type: ignore
813+
arg_tensors = tuple(get_torch_inputs(arg_inputs, default_device()))
814+
kwarg_tensors = get_torch_inputs(kwarg_inputs, default_device())
810815

811816
else:
812817
# Mixed case: some inputs are Tensors, some are Input objects

py/torch_tensorrt/dynamo/_exporter.py

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
OutputSpec,
2020
TensorArgument,
2121
)
22+
from torch_tensorrt._features import ENABLED_FEATURES
2223
from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import ENGINE_IDX, NAME_IDX
2324

2425

@@ -483,36 +484,18 @@ def inline_trt_modules(
483484
f"trt_module_node: {trt_module_node.name} does not have the metadata which should be set during dynamo compile_module step."
484485
)
485486
num_outputs = len(trt_module_node.meta["val"])
486-
# Insert a call_function node to perform inference on TRT engine
487487
with gm.graph.inserting_before(trt_module_node):
488-
if cross_compile_module:
489-
engine_info = trt_module._pack_engine_info()
490-
engine_bytes = engine_info[ENGINE_IDX]
491-
engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8")
492-
# insert the no_placeholder node in the graph which should be replaced to the actual execute_engine node while load in the windows
493-
trt_node = gm.graph.call_function(
494-
torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default,
495-
(trt_module_node.args, *engine_info),
496-
)
497-
else:
498-
# for the normal workflow: use the execute_engine node
499-
engine_name = f"{name}_engine"
500-
# TODO: THROWS SOME WARNING ABOUT A LACK OF UNDERLYING REFERENCE TO THE OWNING GRAPH MODULE
501-
# SAYS THERES 3 OPTIONS, SUBMODULE, PARAMETER, OR BUFFER, BUFFER SEEMS THE BEST BUT I THINK ITS KEYED TO TENSORS
502-
setattr(gm, engine_name, trt_module.engine)
503-
engine_node = gm.graph.get_attr(engine_name)
504-
505-
trt_node = gm.graph.call_function(
506-
torch.ops.tensorrt.execute_engine.default,
507-
(trt_module_node.args, engine_node),
508-
)
509-
# meta["val"] should be a lighter version of a tensor. For eg: it should be a FakeTensor (with output shape and dtype properties)
510-
# Lighter version of a custom_obj is not defined clearly. meta["val"] does not have any type expectations but
511-
# for custom object nodes, it should be CustomObjArgument
512-
engine_node.meta["val"] = CustomObjArgument(
513-
name=engine_node.name, class_fqn=""
514-
)
515-
# set trt_node.meta with trt_module_node.meta
488+
# Always embed engine data as primitive string args via no_op_placeholder
489+
# so torch.export does not pickle torch.classes.tensorrt.Engine (which
490+
# requires the C++ TorchBind class at load time).
491+
# torch_tensorrt.load() lowers placeholders → execute_engine.
492+
engine_info = trt_module._pack_engine_info()
493+
engine_bytes = engine_info[ENGINE_IDX]
494+
engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8")
495+
trt_node = gm.graph.call_function(
496+
torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default,
497+
(trt_module_node.args, *engine_info),
498+
)
516499
assert num_outputs > 0
517500
trt_node.meta["val"] = trt_module_node.meta["val"]
518501

@@ -557,7 +540,12 @@ def replace_execute_engine_no_op_node(
557540
packed_engine_info[ENGINE_IDX] = base64.b64decode(
558541
engine_bytes.encode("utf-8")
559542
)
560-
trt_engine = torch.classes.tensorrt.Engine(tuple(packed_engine_info))
543+
if ENABLED_FEATURES.torch_tensorrt_runtime:
544+
trt_engine = torch.classes.tensorrt.Engine(tuple(packed_engine_info))
545+
else:
546+
from torch_tensorrt.dynamo.runtime._PythonTRTEngine import TRTEngine
547+
548+
trt_engine = TRTEngine(packed_engine_info)
561549
setattr(gm, engine_name, trt_engine)
562550
engine_node = gm.graph.get_attr(engine_name)
563551

py/torch_tensorrt/dynamo/runtime/_PythonTRTEngine.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,3 +707,25 @@ def execute_engine(
707707
) -> List[torch.Tensor]:
708708
outputs = engine.execute(input_tensors)
709709
return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs)
710+
711+
@torch.library.custom_op( # type: ignore[misc]
712+
"tensorrt::no_op_placeholder_for_execute_engine", mutates_args=()
713+
)
714+
def no_op_placeholder_for_execute_engine(
715+
inputs: List[torch.Tensor],
716+
abi_version: str,
717+
name: str,
718+
serialized_device_info: str,
719+
serialized_engine: str,
720+
serialized_in_binding_names: str,
721+
serialized_out_binding_names: str,
722+
serialized_hardware_compatible: str,
723+
serialized_metadata: str,
724+
serialized_target_platform: str,
725+
serialized_require_output_allocator: str,
726+
serialized_resource_allocation_strategy: str,
727+
) -> List[torch.Tensor]:
728+
raise RuntimeError(
729+
"TensorRT engine placeholder reached eager execution; load this artifact with "
730+
"torch_tensorrt.load() so placeholders are lowered to execute_engine."
731+
)

py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,9 @@ def no_op_placeholder_for_execute_engine(
354354
serialized_metadata: str,
355355
serialized_target_platform: str,
356356
serialized_require_output_allocator: str,
357+
serialized_resource_allocation_strategy: str,
357358
) -> List[torch.Tensor]:
358359
raise RuntimeError(
359-
"The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api."
360+
"TensorRT engine placeholder reached eager execution; load this artifact with "
361+
"torch_tensorrt.load() so placeholders are lowered to execute_engine."
360362
)

0 commit comments

Comments
 (0)