From 1c86a9e9adace59810e2fdf03c317fb69e4f16ec Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 18:20:50 -0700
Subject: [PATCH 1/4] Migrate A100 CUDA CI jobs to OSDC runners

Move the A100-dependent jobs in cuda.yml (export-model-cuda-artifact,
test-model-cuda-e2e) and cuda-perf.yml (export-models, benchmark-cuda)
from pytorch/test-infra linux_job_v2 (AWS) to linux_job_v3 (OSDC/ARC).
Runner labels are remapped per pytorch/.github/arc.yaml:
linux.aws.a100 -> mt-l-x86iavx512-11-125-a100 and the A10G fallback
linux.g5.4xlarge.nvidia.gpu -> mt-l-x86aavx2-29-113-a10g.

Jobs that never run on A100 stay on linux_job_v2 / linux.g5.4xlarge.nvidia.gpu.

Authored with Claude Code.
---
 .github/workflows/cuda-perf.yml | 8 ++++----
 .github/workflows/cuda.yml      | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
index 1bb9b62be65..2f080028cf7 100644
--- a/.github/workflows/cuda-perf.yml
+++ b/.github/workflows/cuda-perf.yml
@@ -124,7 +124,7 @@ jobs:
   export-models:
     name: export-models
     needs: set-parameters
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main
     permissions:
       id-token: write
       contents: read
@@ -135,7 +135,7 @@ jobs:
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
-      runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }}
       gpu-arch-type: cuda
       gpu-arch-version: "13.0"
       use-custom-docker-registry: false
@@ -192,7 +192,7 @@ jobs:
         contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main
     permissions:
       id-token: write
       contents: read
@@ -201,7 +201,7 @@ jobs:
       fail-fast: false
     with:
       timeout: 90
-      runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }}
       gpu-arch-type: cuda
       gpu-arch-version: "13.0"
       use-custom-docker-registry: false
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index ada0f5983cc..88110f9635b 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -229,7 +229,7 @@ jobs:
         contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main
     permissions:
       id-token: write
       contents: read
@@ -342,7 +342,7 @@ jobs:
     with:
       timeout: 150
       secrets-env: EXECUTORCH_HF_TOKEN
-      runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }}
       gpu-arch-type: cuda
       gpu-arch-version: "13.0"
       use-custom-docker-registry: false
@@ -390,7 +390,7 @@ jobs:
         contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main
     permissions:
       id-token: write
       contents: read
@@ -494,7 +494,7 @@ jobs:
             quant: "non-quantized"
     with:
       timeout: 90
-      runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }}
       gpu-arch-type: cuda
       gpu-arch-version: "13.0"
       use-custom-docker-registry: false

From c0f779389e2f1586d44c37a82f192dd4c4efc9b3 Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 18:34:05 -0700
Subject: [PATCH 2/4] Pre-install torch deps from in-cluster cache on OSDC jobs

OSDC runners can't reach files.pythonhosted.org, the public PyPI CDN that
download.pytorch.org links to for torch's transitive deps, so the torch
install in install_requirements.py fails fetching e.g. sympy. Pre-install
torch's pure-python deps from the in-cluster pypi-cache and clear
PIP_EXTRA_INDEX_URL in the four migrated CUDA jobs, mirroring the
torchtitan/ao OSDC workaround.

Authored with Claude Code.
---
 .github/workflows/cuda-perf.yml | 12 ++++++++++++
 .github/workflows/cuda.yml      | 12 ++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
index 2f080028cf7..f0360e37122 100644
--- a/.github/workflows/cuda-perf.yml
+++ b/.github/workflows/cuda-perf.yml
@@ -145,6 +145,12 @@ jobs:
       script: |
         set -eux
         echo "::group::Setup ExecuTorch"
+        # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's
+        # transitive deps resolve to. Pre-install torch's pure-python deps from the
+        # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
+        # torch wheel is the only candidate.
+        export PIP_EXTRA_INDEX_URL=
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy
         # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
         export USE_MKL=OFF
         ./install_executorch.sh
@@ -212,6 +218,12 @@ jobs:
       script: |
         set -eux
         echo "::group::Setup environment"
+        # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's
+        # transitive deps resolve to. Pre-install torch's pure-python deps from the
+        # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
+        # torch wheel is the only candidate.
+        export PIP_EXTRA_INDEX_URL=
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy
         ./install_requirements.sh
         pip list
         echo "::endgroup::"
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 88110f9635b..f71003b47d2 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -353,6 +353,12 @@ jobs:
         set -eux
 
         echo "::group::Setup ExecuTorch"
+        # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's
+        # transitive deps resolve to. Pre-install torch's pure-python deps from the
+        # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
+        # torch wheel is the only candidate.
+        export PIP_EXTRA_INDEX_URL=
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy
         # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
         export USE_MKL=OFF
         ./install_executorch.sh
@@ -502,6 +508,12 @@ jobs:
       download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
+        # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's
+        # transitive deps resolve to. Pre-install torch's pure-python deps from the
+        # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
+        # torch wheel is the only candidate.
+        export PIP_EXTRA_INDEX_URL=
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy
         source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
 
   test-cuda-pybind:

From 7b0c89a5004e77f14b285e845dacf1b3faaee97f Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Wed, 10 Jun 2026 19:00:35 -0700
Subject: [PATCH 3/4] Pre-install pillow for torchvision on OSDC jobs

The example-deps install (torchvision==0.27.0 torchaudio==2.11.0) pulls
pillow, which still resolved from files.pythonhosted.org and failed on
OSDC. Add pillow to the list of deps pre-installed from the in-cluster
cache, matching the torchao OSDC list.

Authored with Claude Code.
---
 .github/workflows/cuda-perf.yml | 4 ++--
 .github/workflows/cuda.yml      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
index f0360e37122..a0ce0f5c6e9 100644
--- a/.github/workflows/cuda-perf.yml
+++ b/.github/workflows/cuda-perf.yml
@@ -150,7 +150,7 @@ jobs:
         # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
         # torch wheel is the only candidate.
         export PIP_EXTRA_INDEX_URL=
-        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow
         # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
         export USE_MKL=OFF
         ./install_executorch.sh
@@ -223,7 +223,7 @@ jobs:
         # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
         # torch wheel is the only candidate.
         export PIP_EXTRA_INDEX_URL=
-        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow
         ./install_requirements.sh
         pip list
         echo "::endgroup::"
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index f71003b47d2..f99ddfa771f 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -358,7 +358,7 @@ jobs:
         # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
         # torch wheel is the only candidate.
         export PIP_EXTRA_INDEX_URL=
-        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow
         # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
         export USE_MKL=OFF
         ./install_executorch.sh
@@ -513,7 +513,7 @@ jobs:
         # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
         # torch wheel is the only candidate.
         export PIP_EXTRA_INDEX_URL=
-        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow
         source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
 
   test-cuda-pybind:

From 1b440412bd7adb5a08861c88a90cc6196e12231a Mon Sep 17 00:00:00 2001
From: Huy Do <huydo@meta.com>
Date: Thu, 11 Jun 2026 09:22:28 -0700
Subject: [PATCH 4/4] Pin fsspec to datasets' constraint on OSDC jobs

The examples install pulls datasets==3.6.0, which requires
fsspec[http]<=2025.3.0. The unpinned pre-installed fsspec was newer, so
pip tried to downgrade it via download.pytorch.org's pythonhosted link,
which OSDC can't reach. Pre-install fsspec[http]<=2025.3.0 so that pip's
only-if-needed upgrade strategy leaves it in place.

Authored with Claude Code.
---
 .github/workflows/cuda-perf.yml | 8 ++++++--
 .github/workflows/cuda.yml      | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
index a0ce0f5c6e9..ff126dbef1c 100644
--- a/.github/workflows/cuda-perf.yml
+++ b/.github/workflows/cuda-perf.yml
@@ -150,7 +150,9 @@ jobs:
         # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
         # torch wheel is the only candidate.
         export PIP_EXTRA_INDEX_URL=
-        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow
+        # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later
+        # examples install doesn't try to downgrade it from the public CDN.
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow
         # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
         export USE_MKL=OFF
         ./install_executorch.sh
@@ -223,7 +225,9 @@ jobs:
         # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
         # torch wheel is the only candidate.
         export PIP_EXTRA_INDEX_URL=
-        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow
+        # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later
+        # examples install doesn't try to downgrade it from the public CDN.
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow
         ./install_requirements.sh
         pip list
         echo "::endgroup::"
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index f99ddfa771f..d0da13e5733 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -358,7 +358,9 @@ jobs:
         # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
         # torch wheel is the only candidate.
         export PIP_EXTRA_INDEX_URL=
-        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow
+        # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later
+        # examples install doesn't try to downgrade it from the public CDN.
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow
         # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
         export USE_MKL=OFF
         ./install_executorch.sh
@@ -513,7 +515,9 @@ jobs:
         # in-cluster pypi-cache and drop the default cpu extra-index so the cuda
         # torch wheel is the only candidate.
         export PIP_EXTRA_INDEX_URL=
-        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow
+        # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later
+        # examples install doesn't try to downgrade it from the public CDN.
+        pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow
         source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
 
   test-cuda-pybind: