Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions .ci/scripts/test_backend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ echo "Running backend test job for suite $SUITE, flow $FLOW."
echo "Saving job artifacts to $ARTIFACT_DIR."

eval "$(conda shell.bash hook)"
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
CONDA_ENV=$(conda env list --json | python -c "import sys, json; print(json.load(sys.stdin)['envs'][-1])")
conda activate "${CONDA_ENV}"

if [[ "$(uname)" == "Darwin" ]]; then
Expand Down Expand Up @@ -56,6 +56,32 @@ if [[ "$FLOW" == *vulkan* ]]; then
EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
fi

if [[ "$FLOW" == *cuda* ]]; then
  # When running with the PyTorch test-infra Docker image (which has nvcc),
  # install executorch directly — it will auto-detect CUDA and install
  # CUDA-enabled PyTorch. Skip setup-linux.sh which expects the custom
  # Docker image with pre-built pinned-commit torch.
  echo "Installing ExecuTorch with CUDA support..."
  ./install_executorch.sh --editable

  # Verify PyTorch was installed with CUDA support; fail fast with a clear
  # message instead of letting later test steps fail obscurely.
  python -c "import torch; assert torch.cuda.is_available(), 'PyTorch CUDA not available after reinstall'; print(f'PyTorch {torch.__version__} with CUDA {torch.version.cuda}')" || {
    echo "ERROR: PyTorch was not installed with CUDA support"
    exit 1
  }

  # Fix libstdc++ GLIBCXX version for CUDA backend.
  # The embedded .so files in the CUDA blob require GLIBCXX_3.4.30
  # which the default conda libstdc++ doesn't have.
  echo "Installing newer libstdc++ for CUDA backend..."
  conda install -y -c conda-forge 'libstdcxx-ng>=12'
  # Make the conda-provided libstdc++ win over the system one at load time.
  export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH:-}"

  source .ci/scripts/utils.sh
  CMAKE_ARGS="$EXTRA_BUILD_ARGS" build_executorch_runner cmake Release
  # Signal the generic setup section below that install+build already happened.
  CUDA_SETUP_DONE=1
fi

if [[ "$FLOW" == *arm* ]]; then

# Setup ARM deps.
Expand All @@ -78,12 +104,14 @@ if [[ "$FLOW" == *arm* ]]; then
fi
fi

# Run the generic per-OS setup unless the CUDA branch above already installed
# and built everything (setup-linux.sh assumes the pinned-torch Docker image,
# which the CUDA flow does not use).
if [[ "${CUDA_SETUP_DONE:-0}" != "1" ]]; then
  if [[ $IS_MACOS -eq 1 ]]; then
    SETUP_SCRIPT=.ci/scripts/setup-macos.sh
  else
    SETUP_SCRIPT=.ci/scripts/setup-linux.sh
  fi
  CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
fi

GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts"
export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}"
Expand Down
41 changes: 41 additions & 0 deletions .github/workflows/test-backend-cuda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name: Test CUDA Backend

on:
  # Nightly run at 02:00 UTC.
  schedule:
    - cron: '0 2 * * *'
  push:
    branches:
      - release/*
    tags:
      - ciflow/nightly/*
  # On PRs, only run when this workflow or the shared backend script changes.
  pull_request:
    paths:
      - .github/workflows/test-backend-cuda.yml
      - .ci/scripts/test_backend.sh
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:
  test-cuda:
    strategy:
      fail-fast: false
      matrix:
        suite: [models, operators]

    # Reusable GPU job from pytorch/test-infra.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: '12.6'
      use-custom-docker-registry: false
      submodules: recursive
      timeout: 120
      upload-artifact: test-report-cuda-${{ matrix.suite }}
      script: |
        set -eux

        source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "cuda" "${RUNNER_ARTIFACT_DIR}"
71 changes: 71 additions & 0 deletions backends/cuda/test/tester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Any, List, Optional, Tuple

import executorch
import executorch.backends.test.harness.stages as BaseStages
import torch
from executorch.backends.cuda.cuda_backend import CudaBackend
from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
from executorch.backends.test.harness import Tester as TesterBase
from executorch.backends.test.harness.stages import StageType
from executorch.exir import EdgeCompileConfig
from executorch.exir.backend.partitioner import Partitioner


def _create_default_partitioner() -> CudaPartitioner:
    """Build a CudaPartitioner whose compile specs target the "forward" method."""
    method_spec = CudaBackend.generate_method_name_compile_spec("forward")
    return CudaPartitioner([method_spec])


class ToEdgeTransformAndLower(BaseStages.ToEdgeTransformAndLower):
    """CUDA-specific ToEdgeTransformAndLower stage."""

    def __init__(
        self,
        partitioners: Optional[List[Partitioner]] = None,
        edge_compile_config: Optional[EdgeCompileConfig] = None,
    ):
        # When the caller supplies no partitioners, fall back to a single
        # CUDA partitioner targeting the "forward" method.
        if partitioners is None:
            active_partitioners: List[Partitioner] = [_create_default_partitioner()]
        else:
            active_partitioners = partitioners

        # IR validity checking is disabled by default for CUDA lowering.
        config = edge_compile_config or EdgeCompileConfig(_check_ir_validity=False)

        super().__init__(
            default_partitioner_cls=_create_default_partitioner,
            partitioners=active_partitioners,
            edge_compile_config=config,
        )


class CudaTester(TesterBase):
    """
    Tester subclass for CUDA backend.
    This tester defines the recipe for lowering models to the CUDA backend
    using AOTInductor compilation.
    """

    def __init__(
        self,
        module: torch.nn.Module,
        example_inputs: Tuple[torch.Tensor, ...],
        dynamic_shapes: Optional[Tuple[Any, ...]] = None,
    ):
        # Start from the harness's default stage recipe and swap in the
        # CUDA-specific to_edge_transform_and_lower stage defined above.
        stage_classes = (
            executorch.backends.test.harness.Tester.default_stage_classes()
            | {
                StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower,
            }
        )

        super().__init__(
            module=module,
            stage_classes=stage_classes,
            example_inputs=example_inputs,
            dynamic_shapes=dynamic_shapes,
        )
30 changes: 26 additions & 4 deletions backends/test/harness/stages/serialize.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import copy
import logging

from typing import Optional
from typing import Dict, Optional

from executorch.backends.test.harness.stages.stage import Stage, StageType
from executorch.exir import ExecutorchProgramManager

from torch.utils._pytree import tree_flatten

logger = logging.getLogger(__name__)
Expand All @@ -23,12 +21,15 @@
class Serialize(Stage):
def __init__(self):
    """Initialize an empty Serialize stage; state is populated by run()."""
    # Serialized program bytes, set from the artifact in run().
    self.buffer = None
    # External data blobs keyed by name (e.g., .ptd files), set in run().
    self.data_files: Dict[str, bytes] = {}

def stage_type(self) -> StageType:
    """Identify this stage as the SERIALIZE stage."""
    return StageType.SERIALIZE

def run(self, artifact: ExecutorchProgramManager, inputs=None) -> None:
    """Record the serialized program and its external data from *artifact*.

    Args:
        artifact: Program manager whose serialized buffer is captured.
        inputs: Unused; accepted for Stage interface compatibility.
    """
    self.buffer = artifact.buffer
    # Capture external data files (e.g., .ptd files for CUDA backend)
    self.data_files = artifact.data_files

@property
def artifact(self) -> bytes:
Expand All @@ -40,8 +41,29 @@ def graph_module(self) -> None:

def run_artifact(self, inputs):
    """Load the serialized program from memory and run its "forward" method.

    Flattens *inputs* via pytree, passes any captured external data file as
    the runtime's data map buffer, and executes the loaded module.
    """
    inputs_flattened, _ = tree_flatten(inputs)

    # Pass external data (e.g., .ptd files for CUDA backend) to the runtime.
    # Only a single external data file is supported here; when several were
    # captured, use the first and warn — proper multi-file handling would
    # need runtime support.
    data_map_buffer = None
    if self.data_files:
        if len(self.data_files) > 1:
            logger.warning(
                f"Multiple external data files found ({list(self.data_files.keys())}). "
                f"Using the first one. This may not work correctly for all backends."
            )
        data_map_buffer = next(iter(self.data_files.values()))

    executorch_module = _load_for_executorch_from_buffer(
        self.buffer,
        data_map_buffer=data_map_buffer,
        program_verification=Verification.Minimal,
    )
executorch_output = copy.deepcopy(
executorch_module.run_method("forward", tuple(inputs_flattened))
Expand Down
10 changes: 8 additions & 2 deletions backends/test/suite/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import pytest
import torch

from executorch.backends.test.suite.flow import all_flows, TestFlow
from executorch.backends.test.suite.reporting import _sum_op_counts
from executorch.backends.test.suite.runner import run_test
Expand Down Expand Up @@ -103,7 +102,14 @@ def lower_and_run_model(
ids=str,
)
def test_runner(request):
    """Parametrized fixture yielding a TestRunner for the requested flow.

    Skips the test up front when the test name matches one of the flow's
    skip patterns, so skipped cases never reach lowering/execution.
    """
    flow = request.param
    test_name = request.node.name

    # Check if this test should be skipped based on the flow's skip_patterns
    if flow.should_skip_test(test_name):
        pytest.skip(f"Test '{test_name}' matches skip pattern for flow '{flow.name}'")

    return TestRunner(flow, test_name, request.node.originalname)


@pytest.hookimpl(optionalhook=True)
Expand Down
Loading
Loading