diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
index 86d53c996bc..bfdae4f733e 100755
--- a/.ci/scripts/test_backend.sh
+++ b/.ci/scripts/test_backend.sh
@@ -17,7 +17,7 @@ echo "Running backend test job for suite $SUITE, flow $FLOW."
 echo "Saving job artifacts to $ARTIFACT_DIR."
 
 eval "$(conda shell.bash hook)"
-CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+CONDA_ENV=$(conda env list --json | python -c "import sys, json; print(json.load(sys.stdin)['envs'][-1])")
 conda activate "${CONDA_ENV}"
 
 if [[ "$(uname)" == "Darwin" ]]; then
@@ -56,6 +56,32 @@ if [[ "$FLOW" == *vulkan* ]]; then
     EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
 fi
 
+if [[ "$FLOW" == *cuda* ]]; then
+    # When running with the PyTorch test-infra Docker image (which has nvcc),
+    # install executorch directly — it will auto-detect CUDA and install
+    # CUDA-enabled PyTorch.  Skip setup-linux.sh which expects the custom
+    # Docker image with pre-built pinned-commit torch.
+    echo "Installing ExecuTorch with CUDA support..."
+    ./install_executorch.sh --editable
+
+    # Verify PyTorch was installed with CUDA support
+    python -c "import torch; assert torch.cuda.is_available(), 'PyTorch CUDA not available after reinstall'; print(f'PyTorch {torch.__version__} with CUDA {torch.version.cuda}')" || {
+        echo "ERROR: PyTorch was not installed with CUDA support"
+        exit 1
+    }
+
+    # Fix libstdc++ GLIBCXX version for CUDA backend: the embedded .so files in
+    # the CUDA blob require GLIBCXX_3.4.30, which the default conda libstdc++ lacks.
+    # The old LD_LIBRARY_PATH is appended only if non-empty; an empty entry means CWD.
+    echo "Installing newer libstdc++ for CUDA backend..."
+    conda install -y -c conda-forge 'libstdcxx-ng>=12'
+    export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+
+    source .ci/scripts/utils.sh
+    CMAKE_ARGS="$EXTRA_BUILD_ARGS" build_executorch_runner cmake Release
+    CUDA_SETUP_DONE=1
+fi
+
 if [[ "$FLOW" == *arm* ]]; then
 
     # Setup ARM deps.
@@ -78,12 +104,14 @@ if [[ "$FLOW" == *arm* ]]; then
     fi
 fi
 
-if [[ $IS_MACOS -eq 1 ]]; then
-    SETUP_SCRIPT=.ci/scripts/setup-macos.sh
-else
-    SETUP_SCRIPT=.ci/scripts/setup-linux.sh
+if [[ "${CUDA_SETUP_DONE:-0}" != "1" ]]; then
+    if [[ $IS_MACOS -eq 1 ]]; then
+        SETUP_SCRIPT=.ci/scripts/setup-macos.sh
+    else
+        SETUP_SCRIPT=.ci/scripts/setup-linux.sh
+    fi
+    CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
 fi
-CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
 
 GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts"
 export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}"
diff --git a/.github/workflows/test-backend-cuda.yml b/.github/workflows/test-backend-cuda.yml
new file mode 100644
index 00000000000..220bbc3a673
--- /dev/null
+++ b/.github/workflows/test-backend-cuda.yml
@@ -0,0 +1,41 @@
+name: Test CUDA Backend
+
+on:
+  schedule:
+    - cron: 0 2 * * *
+  push:
+    branches:
+      - release/*
+    tags:
+      - ciflow/nightly/*
+  pull_request:
+    paths:
+      - .github/workflows/test-backend-cuda.yml
+      - .ci/scripts/test_backend.sh
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  test-cuda:
+    strategy:
+      fail-fast: false
+      matrix:
+        suite: [models, operators]
+
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: '12.6'
+      use-custom-docker-registry: false
+      submodules: recursive
+      timeout: 120
+      upload-artifact: test-report-cuda-${{ matrix.suite }}
+      script: |
+        set -eux
+
+        source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "cuda" "${RUNNER_ARTIFACT_DIR}"
diff --git a/backends/cuda/test/tester.py b/backends/cuda/test/tester.py
new file mode 100644
index 00000000000..e4ac2b366d4
--- /dev/null
+++ b/backends/cuda/test/tester.py
@@ -0,0 +1,71 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, List, Optional, Tuple
+
+import executorch
+import executorch.backends.test.harness.stages as BaseStages
+import torch
+from executorch.backends.cuda.cuda_backend import CudaBackend
+from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
+from executorch.backends.test.harness import Tester as TesterBase
+from executorch.backends.test.harness.stages import StageType
+from executorch.exir import EdgeCompileConfig
+from executorch.exir.backend.partitioner import Partitioner
+
+
+def _create_default_partitioner() -> CudaPartitioner:
+    """Build a CudaPartitioner whose only compile spec names the "forward" method."""
+    forward_spec = CudaBackend.generate_method_name_compile_spec("forward")
+    return CudaPartitioner([forward_spec])
+
+
+class ToEdgeTransformAndLower(BaseStages.ToEdgeTransformAndLower):
+    """ToEdgeTransformAndLower stage preconfigured for the CUDA backend."""
+
+    def __init__(
+        self,
+        partitioners: Optional[List[Partitioner]] = None,
+        edge_compile_config: Optional[EdgeCompileConfig] = None,
+    ):
+        # Substitute the CUDA defaults for whichever arguments were omitted.
+        if partitioners is None:
+            partitioners = [_create_default_partitioner()]
+        config = edge_compile_config or EdgeCompileConfig(_check_ir_validity=False)
+        super().__init__(
+            default_partitioner_cls=_create_default_partitioner,
+            partitioners=partitioners,
+            edge_compile_config=config,
+        )
+
+
+class CudaTester(TesterBase):
+    """
+    Tester recipe that lowers a module to the CUDA backend using
+    AOTInductor compilation.
+
+    Reuses the harness's default stage classes and swaps in the
+    CUDA-specific ToEdgeTransformAndLower stage defined in this module.
+    """
+
+    def __init__(
+        self,
+        module: torch.nn.Module,
+        example_inputs: Tuple[torch.Tensor],
+        dynamic_shapes: Optional[Tuple[Any]] = None,
+    ):
+        # Start from the harness defaults, then override only the lowering stage.
+        default_stages = executorch.backends.test.harness.Tester.default_stage_classes()
+        cuda_overrides = {
+            StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower,
+        }
+
+        super().__init__(
+            module=module,
+            stage_classes=default_stages | cuda_overrides,
+            example_inputs=example_inputs,
+            dynamic_shapes=dynamic_shapes,
+        )
diff --git a/backends/test/harness/stages/serialize.py b/backends/test/harness/stages/serialize.py
index a5be1631d98..2cdcfb1b5a5 100644
--- a/backends/test/harness/stages/serialize.py
+++ b/backends/test/harness/stages/serialize.py
@@ -1,11 +1,9 @@
 import copy
 import logging
-
-from typing import Optional
+from typing import Dict, Optional
 
 from executorch.backends.test.harness.stages.stage import Stage, StageType
 from executorch.exir import ExecutorchProgramManager
-
 from torch.utils._pytree import tree_flatten
 
 logger = logging.getLogger(__name__)
@@ -23,12 +21,15 @@
 class Serialize(Stage):
     def __init__(self):
         self.buffer = None
+        self.data_files: Dict[str, bytes] = {}
 
     def stage_type(self) -> StageType:
         return StageType.SERIALIZE
 
     def run(self, artifact: ExecutorchProgramManager, inputs=None) -> None:
         self.buffer = artifact.buffer
+        # Capture external data files (e.g., .ptd files for CUDA backend)
+        self.data_files = artifact.data_files
 
     @property
     def artifact(self) -> bytes:
@@ -40,8 +41,29 @@ def graph_module(self) -> None:
 
     def run_artifact(self, inputs):
         inputs_flattened, _ = tree_flatten(inputs)
+
+        # Pick the external data file to pass to the runtime as data_map_buffer.
+        # Most backends have at most one external data file; when there are
+        # several, only the first is used (files are NOT concatenated) and we warn.
+        data_map_buffer = None
+        if self.data_files:
+            # If there's exactly one data file, use it directly
+            # Otherwise, log a warning - multiple external files may need special handling
+            if len(self.data_files) == 1:
+                data_map_buffer = list(self.data_files.values())[0]
+            else:
+                # For multiple files, we use the first one and warn
+                # This is a limitation - proper handling would need runtime support
+                logger.warning(
+                    f"Multiple external data files found ({list(self.data_files.keys())}). "
+                    f"Using the first one. This may not work correctly for all backends."
+                )
+                data_map_buffer = list(self.data_files.values())[0]
+
         executorch_module = _load_for_executorch_from_buffer(
-            self.buffer, program_verification=Verification.Minimal
+            self.buffer,
+            data_map_buffer=data_map_buffer,
+            program_verification=Verification.Minimal,
         )
         executorch_output = copy.deepcopy(
             executorch_module.run_method("forward", tuple(inputs_flattened))
diff --git a/backends/test/suite/conftest.py b/backends/test/suite/conftest.py
index 6de1e59a6b1..340e6c9ae2b 100644
--- a/backends/test/suite/conftest.py
+++ b/backends/test/suite/conftest.py
@@ -3,7 +3,6 @@
 
 import pytest
 import torch
-
 from executorch.backends.test.suite.flow import all_flows, TestFlow
 from executorch.backends.test.suite.reporting import _sum_op_counts
 from executorch.backends.test.suite.runner import run_test
@@ -103,7 +102,14 @@ def lower_and_run_model(
     ids=str,
 )
 def test_runner(request):
-    return TestRunner(request.param, request.node.name, request.node.originalname)
+    flow = request.param
+    test_name = request.node.name
+
+    # Check if this test should be skipped based on the flow's skip_patterns
+    if flow.should_skip_test(test_name):
+        pytest.skip(f"Test '{test_name}' matches skip pattern for flow '{flow.name}'")
+
+    return TestRunner(flow, test_name, request.node.originalname)
 
 
 @pytest.hookimpl(optionalhook=True)
diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py
index f3c9ee75083..5c071d025c1 100644
--- a/backends/test/suite/flow.py
+++ b/backends/test/suite/flow.py
@@ -4,7 +4,6 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-
 from dataclasses import dataclass, field
 from typing import Callable
 
@@ -53,98 +52,98 @@ def __str__(self):
         return self.name
 
 
-def all_flows() -> dict[str, TestFlow]:
-    flows = []
-
-    from executorch.backends.test.suite.flows.portable import PORTABLE_TEST_FLOW
+def _try_import_flows(
+    module_path: str, flow_names: list[str], backend_name: str
+) -> list[TestFlow]:
+    """
+    Attempt to import test flows from a module.
 
-    flows += [
-        PORTABLE_TEST_FLOW,
-    ]
+    Args:
+        module_path: The full module path to import from.
+        flow_names: List of flow variable names to import from the module.
+        backend_name: Human-readable name for logging on failure.
 
+    Returns:
+        List of imported TestFlow objects, or empty list if import fails.
+    """
     try:
-        from executorch.backends.test.suite.flows.xnnpack import (
-            XNNPACK_DYNAMIC_INT8_PER_CHANNEL_TEST_FLOW,
-            XNNPACK_STATIC_INT8_PER_CHANNEL_TEST_FLOW,
-            XNNPACK_STATIC_INT8_PER_TENSOR_TEST_FLOW,
-            XNNPACK_TEST_FLOW,
-        )
-
-        flows += [
-            XNNPACK_TEST_FLOW,
-            XNNPACK_DYNAMIC_INT8_PER_CHANNEL_TEST_FLOW,
-            XNNPACK_STATIC_INT8_PER_CHANNEL_TEST_FLOW,
-            XNNPACK_STATIC_INT8_PER_TENSOR_TEST_FLOW,
-        ]
-    except Exception as e:
-        logger.info(f"Skipping XNNPACK flow registration: {e}")
+        import importlib
 
-    try:
-        from executorch.backends.test.suite.flows.coreml import (
-            COREML_STATIC_INT8_TEST_FLOW,
-            COREML_TEST_FLOW,
-        )
-
-        flows += [
-            COREML_TEST_FLOW,
-            COREML_STATIC_INT8_TEST_FLOW,
-        ]
+        module = importlib.import_module(module_path)
+        return [getattr(module, name) for name in flow_names]
     except Exception as e:
-        logger.info(f"Skipping Core ML flow registration: {e}")
+        logger.info(f"Skipping {backend_name} flow registration: {e}")
+        return []
+
+
+# Registry of backend flows to import: (module_path, flow_names, backend_name)
+_FLOW_REGISTRY: list[tuple[str, list[str], str]] = [
+    (
+        "executorch.backends.test.suite.flows.xnnpack",
+        [
+            "XNNPACK_TEST_FLOW",
+            "XNNPACK_DYNAMIC_INT8_PER_CHANNEL_TEST_FLOW",
+            "XNNPACK_STATIC_INT8_PER_CHANNEL_TEST_FLOW",
+            "XNNPACK_STATIC_INT8_PER_TENSOR_TEST_FLOW",
+        ],
+        "XNNPACK",
+    ),
+    (
+        "executorch.backends.test.suite.flows.coreml",
+        [
+            "COREML_TEST_FLOW",
+            "COREML_STATIC_INT8_TEST_FLOW",
+        ],
+        "Core ML",
+    ),
+    (
+        "executorch.backends.test.suite.flows.vulkan",
+        [
+            "VULKAN_TEST_FLOW",
+            "VULKAN_STATIC_INT8_PER_CHANNEL_TEST_FLOW",
+        ],
+        "Vulkan",
+    ),
+    (
+        "executorch.backends.test.suite.flows.qualcomm",
+        [
+            "QNN_TEST_FLOW",
+            "QNN_16A16W_TEST_FLOW",
+            "QNN_16A8W_TEST_FLOW",
+            "QNN_16A4W_TEST_FLOW",
+            "QNN_16A4W_BLOCK_TEST_FLOW",
+            "QNN_8A8W_TEST_FLOW",
+        ],
+        "QNN",
+    ),
+    (
+        "executorch.backends.test.suite.flows.arm",
+        [
+            "ARM_TOSA_FP_FLOW",
+            "ARM_TOSA_INT_FLOW",
+            "ARM_ETHOS_U55_FLOW",
+            "ARM_ETHOS_U85_FLOW",
+            "ARM_VGF_FP_FLOW",
+            "ARM_VGF_INT_FLOW",
+        ],
+        "ARM",
+    ),
+    (
+        "executorch.backends.test.suite.flows.cuda",
+        [
+            "CUDA_TEST_FLOW",
+        ],
+        "CUDA",
+    ),
+]
 
-    try:
-        from executorch.backends.test.suite.flows.vulkan import (
-            VULKAN_STATIC_INT8_PER_CHANNEL_TEST_FLOW,
-            VULKAN_TEST_FLOW,
-        )
-
-        flows += [
-            VULKAN_TEST_FLOW,
-            VULKAN_STATIC_INT8_PER_CHANNEL_TEST_FLOW,
-        ]
-    except Exception as e:
-        logger.info(f"Skipping Vulkan flow registration: {e}")
 
-    try:
-        from executorch.backends.test.suite.flows.qualcomm import (
-            QNN_16A16W_TEST_FLOW,
-            QNN_16A4W_BLOCK_TEST_FLOW,
-            QNN_16A4W_TEST_FLOW,
-            QNN_16A8W_TEST_FLOW,
-            QNN_8A8W_TEST_FLOW,
-            QNN_TEST_FLOW,
-        )
-
-        flows += [
-            QNN_TEST_FLOW,
-            QNN_16A16W_TEST_FLOW,
-            QNN_16A8W_TEST_FLOW,
-            QNN_16A4W_TEST_FLOW,
-            QNN_16A4W_BLOCK_TEST_FLOW,
-            QNN_8A8W_TEST_FLOW,
-        ]
-    except Exception as e:
-        logger.info(f"Skipping QNN flow registration: {e}")
+def all_flows() -> dict[str, TestFlow]:
+    from executorch.backends.test.suite.flows.portable import PORTABLE_TEST_FLOW
 
-    try:
-        from executorch.backends.test.suite.flows.arm import (
-            ARM_ETHOS_U55_FLOW,
-            ARM_ETHOS_U85_FLOW,
-            ARM_TOSA_FP_FLOW,
-            ARM_TOSA_INT_FLOW,
-            ARM_VGF_FP_FLOW,
-            ARM_VGF_INT_FLOW,
-        )
-
-        flows += [
-            ARM_TOSA_FP_FLOW,
-            ARM_TOSA_INT_FLOW,
-            ARM_ETHOS_U55_FLOW,
-            ARM_ETHOS_U85_FLOW,
-            ARM_VGF_FP_FLOW,
-            ARM_VGF_INT_FLOW,
-        ]
-    except Exception as e:
-        logger.info(f"Skipping ARM flow registration: {e}")
+    flows = [PORTABLE_TEST_FLOW]
+
+    for module_path, flow_names, backend_name in _FLOW_REGISTRY:
+        flows.extend(_try_import_flows(module_path, flow_names, backend_name))
 
     return {f.name: f for f in flows if f is not None}
diff --git a/backends/test/suite/flows/cuda.py b/backends/test/suite/flows/cuda.py
new file mode 100644
index 00000000000..2db3eb0fa5c
--- /dev/null
+++ b/backends/test/suite/flows/cuda.py
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.cuda.test.tester import CudaTester
+from executorch.backends.test.suite.flow import TestFlow
+
+
+def _create_cuda_flow(name: str = "cuda") -> TestFlow:
+    """Build the unquantized CUDA backend TestFlow named ``name``.
+
+    CUDA artifacts keep their .so and weights blob in an external .ptd file; the
+    harness serialize stage hands that data to the runtime through the
+    data_map_buffer argument of _load_for_executorch_from_buffer.
+    """
+    flow_kwargs = {"backend": "cuda", "tester_factory": CudaTester, "quantize": False}
+    return TestFlow(name, **flow_kwargs)
+
+
+CUDA_TEST_FLOW = _create_cuda_flow("cuda")
diff --git a/exir/program/_program.py b/exir/program/_program.py
index baacd5eaec4..bcc667ed630 100644
--- a/exir/program/_program.py
+++ b/exir/program/_program.py
@@ -1974,6 +1974,22 @@ def buffer(self) -> bytes:
             self._buffer = bytes(self._pte_data)
         return self._buffer
 
+    @property
+    def data_files(self) -> Dict[str, bytes]:
+        """Return external data files as a mapping of filename to raw bytes.
+
+        External data files (e.g., .ptd files) hold tensor data stored apart from
+        the main .pte file; backends such as CUDA serialize weights this way.
+
+        Returns:
+            Dict[str, bytes]: One entry per external file, keyed by filename.
+            The dict is empty when there is no external tensor data.
+        """
+        tensor_data = self._tensor_data
+        if tensor_data is None:
+            return {}
+        return {name: bytes(blob) for name, blob in tensor_data.items()}
+
     def get_etrecord(self):
         """
         Get the generated ETRecord if etrecord generation was enabled.