From 88142f304b07ca7e2c4a8cb38aab961929dd4bad Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Thu, 19 Mar 2026 00:06:32 -0700
Subject: [PATCH] [ET Device Support] Parse device info from serialized tensor
 in tensor_parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Parse device info (device_type, device_index) from the serialized ExtraTensorInfo in .pte files into TensorImpl at runtime.
When a tensor's extra_tensor_info contains device annotations (e.g., CUDA), the tensor parser now reads and propagates them to the TensorImpl constructor. Tensors without extra_tensor_info default to CPU/0 for backward compatibility with older PTE files.、

Differential Revision: [D97199497](https://our.internmc.facebook.com/intern/diff/D97199497/)

[ghstack-poisoned]
---
 runtime/executor/tensor_parser_portable.cpp   |  16 +-
 runtime/executor/test/targets.bzl             |  16 ++
 .../test/tensor_parser_device_test.cpp        | 171 ++++++++++++++++++
 .../models/export_program_with_device_info.py | 142 +++++++++++++++
 test/models/targets.bzl                       |  33 ++++
 5 files changed, 377 insertions(+), 1 deletion(-)
 create mode 100644 runtime/executor/test/tensor_parser_device_test.cpp
 create mode 100644 test/models/export_program_with_device_info.py
diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp
index 02cb019a1da..2fc9a2dc140 100644
--- a/runtime/executor/tensor_parser_portable.cpp
+++ b/runtime/executor/tensor_parser_portable.cpp
@@ -147,6 +147,18 @@ Result<Tensor> parseTensor(
       Internal,
       "dim_order_to_stride returned invalid status");
 
+  // Extract device info from serialized tensor metadata.
+  // Defaults to CPU/0 for backward compatibility when extra_tensor_info is
+  // absent (e.g., older PTE files without device annotations).
+  auto device_type = executorch::runtime::etensor::DeviceType::CPU;
+  executorch::runtime::etensor::DeviceIndex device_index = 0;
+  if (s_tensor->extra_tensor_info() != nullptr) {
+    device_type = static_cast<executorch::runtime::etensor::DeviceType>(
+        s_tensor->extra_tensor_info()->device_type());
+    device_index = static_cast<executorch::runtime::etensor::DeviceIndex>(
+        s_tensor->extra_tensor_info()->device_index());
+  }
+
   auto* tensor_impl = method_allocator->allocateInstance<TensorImpl>();
   if (tensor_impl == nullptr) {
     return Error::MemoryAllocationFailed;
@@ -161,7 +173,9 @@ Result<Tensor> parseTensor(
       /*data=*/nullptr,
       dim_order,
       strides,
-      dynamism);
+      dynamism,
+      device_type,
+      device_index);
 
   // Now that we know how big the tensor is, find and assign its memory.
   Result<void*> data_ptr = getTensorDataPtr(
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index d78b36351d8..f4534aefdea 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -312,3 +312,19 @@ def define_common_targets(is_fbcode = False):
             ],
             env = modules_env,
         )
+
+        runtime.cxx_test(
+            name = "tensor_parser_device_test",
+            srcs = [
+                "tensor_parser_device_test.cpp",
+            ],
+            deps = [
+                ":managed_memory_manager",
+                "//executorch/runtime/executor:program",
+                "//executorch/extension/data_loader:file_data_loader",
+                "//executorch/schema:program",
+            ],
+            env = {
+                "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])",
+            },
+        )
diff --git a/runtime/executor/test/tensor_parser_device_test.cpp b/runtime/executor/test/tensor_parser_device_test.cpp
new file mode 100644
index 00000000000..46488eacd0b
--- /dev/null
+++ b/runtime/executor/test/tensor_parser_device_test.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Tests that device info (device_type) is correctly parsed from serialized
+ * tensors in .pte files into TensorImpl at runtime.
+ *
+ * Uses a .pte exported with DeviceAwarePartitioner (CUDA device annotation)
+ * so that delegate output tensors carry device_type=CUDA in ExtraTensorInfo.
+ */
+
+#include <executorch/runtime/executor/tensor_parser.h>
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/executor/test/managed_memory_manager.h>
+#include <executorch/schema/program_generated.h>
+
+#include <gtest/gtest.h>
+
+using executorch::aten::Tensor;
+using executorch::runtime::Error;
+using executorch::runtime::Program;
+using executorch::runtime::Result;
+using executorch::runtime::deserialization::parseTensor;
+using executorch::runtime::testing::ManagedMemoryManager;
+using torch::executor::util::FileDataLoader;
+
+constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U;
+constexpr size_t kDefaultRuntimeMemBytes = 32 * 1024U;
+
+namespace executorch {
+namespace runtime {
+namespace testing {
+class ProgramTestFriend final {
+ public:
+  const static executorch_flatbuffer::Program* GetInternalProgram(
+      const Program* program) {
+    return program->internal_program_;
+  }
+};
+} // namespace testing
+} // namespace runtime
+} // namespace executorch
+
+using executorch::runtime::testing::ProgramTestFriend;
+
+class TensorParserDeviceTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+    ASSERT_NE(path, nullptr)
+        << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
+    Result<FileDataLoader> loader = FileDataLoader::from(path);
+    ASSERT_EQ(loader.error(), Error::Ok);
+    loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));
+  }
+
+  std::unique_ptr<FileDataLoader> loader_;
+};
+
+TEST_F(TensorParserDeviceTest, CUDADeviceParsedFromPteFile) {
+  Result<Program> program =
+      Program::load(loader_.get(), Program::Verification::Minimal);
+  ASSERT_EQ(program.error(), Error::Ok);
+
+  ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);
+
+  const executorch_flatbuffer::Program* internal_program =
+      ProgramTestFriend::GetInternalProgram(&program.get());
+  auto* execution_plan =
+      internal_program->execution_plan()->GetMutableObject(0);
+  auto* flatbuffer_values = execution_plan->values();
+
+  int cuda_tensor_count = 0;
+  int cpu_tensor_count = 0;
+  int total_tensor_count = 0;
+
+  for (size_t i = 0; i < flatbuffer_values->size(); ++i) {
+    auto* serialization_value = flatbuffer_values->Get(i);
+    if (serialization_value->val_type() !=
+        executorch_flatbuffer::KernelTypes::Tensor) {
+      continue;
+    }
+    total_tensor_count++;
+
+    auto* s_tensor = serialization_value->val_as_Tensor();
+
+    Result<Tensor> tensor = parseTensor(&program.get(), &mmm.get(), s_tensor);
+    if (!tensor.ok()) {
+      bool has_cuda = s_tensor->extra_tensor_info() != nullptr &&
+          s_tensor->extra_tensor_info()->device_type() ==
+              executorch_flatbuffer::DeviceType::CUDA;
+      if (has_cuda) {
+        cuda_tensor_count++;
+      }
+      continue;
+    }
+
+    Tensor t = tensor.get();
+    auto device_type = t.unsafeGetTensorImpl()->device_type();
+
+    if (device_type == executorch::runtime::etensor::DeviceType::CUDA) {
+      cuda_tensor_count++;
+      EXPECT_EQ(t.unsafeGetTensorImpl()->device_index(), 0)
+          << "CUDA tensor should have device_index=0";
+    } else {
+      EXPECT_EQ(device_type, executorch::runtime::etensor::DeviceType::CPU);
+      EXPECT_EQ(t.unsafeGetTensorImpl()->device_index(), 0)
+          << "CPU tensor should have device_index=0";
+      cpu_tensor_count++;
+    }
+  }
+
+  EXPECT_GT(total_tensor_count, 0) << "Should have at least one tensor";
+  // The model has add(a, b) delegated to CUDA — 2 inputs + 1 output = 3 CUDA
+  EXPECT_EQ(cuda_tensor_count, 3)
+      << "Expected 3 CUDA tensors (2 delegate inputs + 1 delegate output)";
+}
+
+TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) {
+  Result<Program> program =
+      Program::load(loader_.get(), Program::Verification::Minimal);
+  ASSERT_EQ(program.error(), Error::Ok);
+
+  ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);
+
+  const executorch_flatbuffer::Program* internal_program =
+      ProgramTestFriend::GetInternalProgram(&program.get());
+  auto* execution_plan =
+      internal_program->execution_plan()->GetMutableObject(0);
+  auto* flatbuffer_values = execution_plan->values();
+
+  for (size_t i = 0; i < flatbuffer_values->size(); ++i) {
+    auto* serialization_value = flatbuffer_values->Get(i);
+    if (serialization_value->val_type() !=
+        executorch_flatbuffer::KernelTypes::Tensor) {
+      continue;
+    }
+
+    auto* s_tensor = serialization_value->val_as_Tensor();
+    bool has_cuda_device = s_tensor->extra_tensor_info() != nullptr &&
+        s_tensor->extra_tensor_info()->device_type() ==
+            executorch_flatbuffer::DeviceType::CUDA;
+
+    // Only check tensors that are NOT annotated as CUDA
+    if (has_cuda_device) {
+      continue;
+    }
+
+    Result<Tensor> tensor = parseTensor(&program.get(), &mmm.get(), s_tensor);
+    if (!tensor.ok()) {
+      continue;
+    }
+
+    Tensor t = tensor.get();
+    EXPECT_EQ(
+        t.unsafeGetTensorImpl()->device_type(),
+        executorch::runtime::etensor::DeviceType::CPU)
+        << "Tensor at index " << i
+        << " without CUDA annotation should default to CPU";
+    EXPECT_EQ(t.unsafeGetTensorImpl()->device_index(), 0)
+        << "Tensor at index " << i
+        << " without device annotation should have device_index=0";
+  }
+}
diff --git a/test/models/export_program_with_device_info.py b/test/models/export_program_with_device_info.py
new file mode 100644
index 00000000000..1abf73bfb73
--- /dev/null
+++ b/test/models/export_program_with_device_info.py
@@ -0,0 +1,142 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+"""Exports a simple model with device-annotated tensors for C++ testing.
+
+Uses DeviceAwarePartitioner (BackendWithCompilerDemo + target_device=cuda:0)
+so that delegate output tensors are annotated with CUDA device in the .pte.
+"""
+
+import argparse
+import os
+from typing import Dict, final
+
+import torch
+from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge
+from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+    generate_pattern_op_partitions,
+)
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from executorch.exir.backend.partitioner import (
+    DelegationSpec,
+    Partitioner,
+    PartitionResult,
+)
+from executorch.exir.backend.test.backend_with_compiler_demo import (
+    BackendWithCompilerDemo,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.passes.propagate_device_pass import TARGET_DEVICE_COMPILE_SPEC_KEY
+from torch import nn
+from torch.export import export
+from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+
+class _AddOperatorSupport(OperatorSupportBase):
+    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+        return node.op == "call_function" and node.target in [
+            exir_ops.edge.aten.add.Tensor,
+        ]
+
+
+@final
+class _DeviceAwarePartitioner(Partitioner):
+    """Partitioner that tags add ops for delegation with target_device=cuda:0."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.delegation_spec = DelegationSpec(
+            BackendWithCompilerDemo.__name__,
+            [
+                CompileSpec("max_value", bytes([4])),
+                CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+            ],
+        )
+
+    def partition(self, exported_program) -> PartitionResult:
+        partition_tags: Dict[str, DelegationSpec] = {}
+        partition_list = generate_pattern_op_partitions(
+            exported_program.graph_module,
+            op_support=any_chain(_AddOperatorSupport()),
+        )
+        for partition in partition_list:
+            for node in partition.nodes:
+                tag = f"tag{partition.id}"
+                node.meta["delegation_tag"] = tag
+                partition_tags[tag] = self.delegation_spec
+        return PartitionResult(
+            tagged_exported_program=exported_program,
+            partition_tags=partition_tags,
+        )
+
+
+class ModuleAddWithDevice(nn.Module):
+    """Simple add model — the add op will be delegated with CUDA device annotation."""
+
+    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+        return torch.add(a, b)
+
+    def get_random_inputs(self):
+        return (torch.randn(2, 2), torch.randn(2, 2))
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--outdir", type=str, required=True)
+    args = parser.parse_args()
+
+    torch.manual_seed(0)
+    model = ModuleAddWithDevice()
+    inputs = model.get_random_inputs()
+
+    edge = to_edge(
+        export(model, inputs),
+        compile_config=EdgeCompileConfig(_check_ir_validity=False),
+    )
+    lowered = edge.to_backend(_DeviceAwarePartitioner())
+    et_prog = lowered.to_executorch(ExecutorchBackendConfig(emit_stacktrace=False))
+
+    os.makedirs(args.outdir, exist_ok=True)
+    outfile = os.path.join(args.outdir, "ModuleAddWithDevice.pte")
+
+    # Verify device annotations are present in the serialized program
+    from executorch.exir.schema import DeviceType, Tensor as SchemaTensor
+
+    program = et_prog._emitter_output.program
+    plan = program.execution_plan[0]
+    print(f"Delegates: {len(plan.delegates)}")
+    cuda_count = 0
+    for i, v in enumerate(plan.values):
+        if isinstance(v.val, SchemaTensor):
+            t = v.val
+            eti = t.extra_tensor_info
+            dev = eti.device_type if eti else "no_eti"
+            print(f"  Tensor[{i}]: sizes={list(t.sizes)}, device={dev}")
+            if eti and eti.device_type == DeviceType.CUDA:
+                cuda_count += 1
+    print(f"CUDA tensors: {cuda_count}")
+
+    # Also check graph module specs
+    from executorch.exir.delegate import executorch_call_delegate
+    from executorch.exir.tensor import TensorSpec
+
+    gm = et_prog.exported_program().graph_module
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and node.target == executorch_call_delegate:
+            specs = node.meta.get("spec")
+            print(
+                f"  Delegate node '{node.name}' spec.device = {specs.device if isinstance(specs, TensorSpec) else [s.device for s in specs if isinstance(s, TensorSpec)]}"
+            )
+
+    with open(outfile, "wb") as fp:
+        fp.write(et_prog.buffer)
+    print(f"Exported ModuleAddWithDevice to {outfile}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/models/targets.bzl b/test/models/targets.bzl
index 506d0a801a5..c9fb67b7d31 100644
--- a/test/models/targets.bzl
+++ b/test/models/targets.bzl
@@ -141,6 +141,27 @@ def define_common_targets():
         visibility = [],  # Private
     )
 
+    runtime.python_library(
+        name = "export_program_with_device_info_lib",
+        srcs = ["export_program_with_device_info.py"],
+        deps = [
+            "//caffe2:torch",
+            "//executorch/exir/backend/test:backend_with_compiler_demo",
+            "//executorch/exir:lib",
+        ],
+        visibility = [],  # Private
+    )
+
+    runtime.python_binary(
+        name = "export_program_with_device_info",
+        main_module = "executorch.test.models.export_program_with_device_info",
+        par_style = "xar",
+        deps = [
+            ":export_program_with_device_info_lib",
+        ],
+        visibility = [],  # Private
+    )
+
     runtime.python_binary(
         name = "export_delegated_program",
         main_module = "executorch.test.models.export_delegated_program",
@@ -196,6 +217,18 @@ def define_common_targets():
         ],
     )
 
+    runtime.genrule(
+        name = "exported_program_with_device_info",
+        cmd = "$(exe :export_program_with_device_info) --outdir $OUT",
+        outs = {
+            "ModuleAddWithDevice.pte": ["ModuleAddWithDevice.pte"],
+        },
+        default_outs = ["."],
+        visibility = [
+            "//executorch/runtime/executor/test/...",
+        ],
+    )
+
     runtime.genrule(
         name = "exported_xnnp_delegated_programs",
         cmd = "$(exe :export_delegated_program)" +