From 1c86a9e9adace59810e2fdf03c317fb69e4f16ec Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 18:20:50 -0700 Subject: [PATCH 1/4] Migrate A100 CUDA CI jobs to OSDC runners Move the A100-dependent jobs in cuda.yml (export-model-cuda-artifact, test-model-cuda-e2e) and cuda-perf.yml (export-models, benchmark-cuda) from pytorch/test-infra linux_job_v2 (AWS) to linux_job_v3 (OSDC/ARC). Runner labels are remapped per pytorch/.github/arc.yaml: linux.aws.a100 -> mt-l-x86iavx512-11-125-a100 and the A10G fallback linux.g5.4xlarge.nvidia.gpu -> mt-l-x86aavx2-29-113-a10g. Jobs that never run on A100 stay on linux_job_v2 / linux.g5.4xlarge.nvidia.gpu. Authored with Claude Code. --- .github/workflows/cuda-perf.yml | 8 ++++---- .github/workflows/cuda.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml index 1bb9b62be65..2f080028cf7 100644 --- a/.github/workflows/cuda-perf.yml +++ b/.github/workflows/cuda-perf.yml @@ -124,7 +124,7 @@ jobs: export-models: name: export-models needs: set-parameters - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main permissions: id-token: write contents: read @@ -135,7 +135,7 @@ jobs: with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN - runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }} gpu-arch-type: cuda gpu-arch-version: "13.0" use-custom-docker-registry: false @@ -192,7 +192,7 @@ jobs: contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') || needs.run-decision.outputs.is-full-run == 'true' ) - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main permissions: id-token: write 
contents: read @@ -201,7 +201,7 @@ jobs: fail-fast: false with: timeout: 90 - runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }} gpu-arch-type: cuda gpu-arch-version: "13.0" use-custom-docker-registry: false diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index ada0f5983cc..88110f9635b 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -229,7 +229,7 @@ jobs: contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') || needs.run-decision.outputs.is-full-run == 'true' ) - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main permissions: id-token: write contents: read @@ -342,7 +342,7 @@ jobs: with: timeout: 150 secrets-env: EXECUTORCH_HF_TOKEN - runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }} gpu-arch-type: cuda gpu-arch-version: "13.0" use-custom-docker-registry: false @@ -390,7 +390,7 @@ jobs: contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') || needs.run-decision.outputs.is-full-run == 'true' ) - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v3.yml@main permissions: id-token: write contents: read @@ -494,7 +494,7 @@ jobs: quant: "non-quantized" with: timeout: 90 - runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + 
runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'mt-l-x86iavx512-11-125-a100' || 'mt-l-x86aavx2-29-113-a10g' }} gpu-arch-type: cuda gpu-arch-version: "13.0" use-custom-docker-registry: false From c0f779389e2f1586d44c37a82f192dd4c4efc9b3 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 18:34:05 -0700 Subject: [PATCH 2/4] Pre-install torch deps from in-cluster cache on OSDC jobs OSDC runners can't reach the public PyPI CDN that download.pytorch.org's transitive deps resolve to, so the torch install in install_requirements.py fails fetching e.g. sympy from files.pythonhosted.org. Pre-install torch's pure-python deps from the in-cluster pypi-cache and clear PIP_EXTRA_INDEX_URL in the four migrated CUDA jobs, mirroring the torchtitan/ao OSDC workaround. Authored with Claude Code. --- .github/workflows/cuda-perf.yml | 12 ++++++++++++ .github/workflows/cuda.yml | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml index 2f080028cf7..f0360e37122 100644 --- a/.github/workflows/cuda-perf.yml +++ b/.github/workflows/cuda-perf.yml @@ -145,6 +145,12 @@ jobs: script: | set -eux echo "::group::Setup ExecuTorch" + # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's + # transitive deps resolve to. Pre-install torch's pure-python deps from the + # in-cluster pypi-cache and drop the default cpu extra-index so the cuda + # torch wheel is the only candidate. + export PIP_EXTRA_INDEX_URL= + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy # Disable MKL to avoid duplicate target error when conda has multiple MKL installations export USE_MKL=OFF ./install_executorch.sh @@ -212,6 +218,12 @@ jobs: script: | set -eux echo "::group::Setup environment" + # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's + # transitive deps resolve to. 
Pre-install torch's pure-python deps from the + # in-cluster pypi-cache and drop the default cpu extra-index so the cuda + # torch wheel is the only candidate. + export PIP_EXTRA_INDEX_URL= + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy ./install_requirements.sh pip list echo "::endgroup::" diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 88110f9635b..f71003b47d2 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -353,6 +353,12 @@ jobs: set -eux echo "::group::Setup ExecuTorch" + # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's + # transitive deps resolve to. Pre-install torch's pure-python deps from the + # in-cluster pypi-cache and drop the default cpu extra-index so the cuda + # torch wheel is the only candidate. + export PIP_EXTRA_INDEX_URL= + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy # Disable MKL to avoid duplicate target error when conda has multiple MKL installations export USE_MKL=OFF ./install_executorch.sh @@ -502,6 +508,12 @@ jobs: download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }} ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | + # OSDC runners can't reach the public PyPI CDN that download.pytorch.org's + # transitive deps resolve to. Pre-install torch's pure-python deps from the + # in-cluster pypi-cache and drop the default cpu extra-index so the cuda + # torch wheel is the only candidate. 
+ export PIP_EXTRA_INDEX_URL= + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" test-cuda-pybind: From 7b0c89a5004e77f14b285e845dacf1b3faaee97f Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Jun 2026 19:00:35 -0700 Subject: [PATCH 3/4] Pre-install pillow for torchvision on OSDC jobs The example-deps install (torchvision==0.27.0 torchaudio==2.11.0) pulls pillow, which still resolved from files.pythonhosted.org and failed on OSDC. Add pillow to the pre-installed deps, matching the torchao OSDC list. Authored with Claude Code. --- .github/workflows/cuda-perf.yml | 4 ++-- .github/workflows/cuda.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml index f0360e37122..a0ce0f5c6e9 100644 --- a/.github/workflows/cuda-perf.yml +++ b/.github/workflows/cuda-perf.yml @@ -150,7 +150,7 @@ jobs: # in-cluster pypi-cache and drop the default cpu extra-index so the cuda # torch wheel is the only candidate. export PIP_EXTRA_INDEX_URL= - pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow # Disable MKL to avoid duplicate target error when conda has multiple MKL installations export USE_MKL=OFF ./install_executorch.sh @@ -223,7 +223,7 @@ jobs: # in-cluster pypi-cache and drop the default cpu extra-index so the cuda # torch wheel is the only candidate.
export PIP_EXTRA_INDEX_URL= - pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow ./install_requirements.sh pip list echo "::endgroup::" diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index f71003b47d2..f99ddfa771f 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -358,7 +358,7 @@ jobs: # in-cluster pypi-cache and drop the default cpu extra-index so the cuda # torch wheel is the only candidate. export PIP_EXTRA_INDEX_URL= - pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow # Disable MKL to avoid duplicate target error when conda has multiple MKL installations export USE_MKL=OFF ./install_executorch.sh @@ -513,7 +513,7 @@ jobs: # in-cluster pypi-cache and drop the default cpu extra-index so the cuda # torch wheel is the only candidate. export PIP_EXTRA_INDEX_URL= - pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" test-cuda-pybind: From 1b440412bd7adb5a08861c88a90cc6196e12231a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 11 Jun 2026 09:22:28 -0700 Subject: [PATCH 4/4] Pin fsspec to datasets' constraint on OSDC jobs The examples install pulls datasets==3.6.0, which caps fsspec[http] at <=2025.3.0. The unpinned pre-installed fsspec was newer, so pip tried to downgrade it via download.pytorch.org's pythonhosted link, which OSDC can't reach. Pre-install fsspec at <=2025.3.0 so pip's only-if-needed upgrade strategy leaves it in place. Authored with Claude Code.
--- .github/workflows/cuda-perf.yml | 8 ++++++-- .github/workflows/cuda.yml | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml index a0ce0f5c6e9..ff126dbef1c 100644 --- a/.github/workflows/cuda-perf.yml +++ b/.github/workflows/cuda-perf.yml @@ -150,7 +150,9 @@ jobs: # in-cluster pypi-cache and drop the default cpu extra-index so the cuda # torch wheel is the only candidate. export PIP_EXTRA_INDEX_URL= - pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow + # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later + # examples install doesn't try to downgrade it from the public CDN. + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow # Disable MKL to avoid duplicate target error when conda has multiple MKL installations export USE_MKL=OFF ./install_executorch.sh @@ -223,7 +225,9 @@ jobs: # in-cluster pypi-cache and drop the default cpu extra-index so the cuda # torch wheel is the only candidate. export PIP_EXTRA_INDEX_URL= - pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow + # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later + # examples install doesn't try to downgrade it from the public CDN. + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow ./install_requirements.sh pip list echo "::endgroup::" diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index f99ddfa771f..d0da13e5733 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -358,7 +358,9 @@ jobs: # in-cluster pypi-cache and drop the default cpu extra-index so the cuda # torch wheel is the only candidate. 
export PIP_EXTRA_INDEX_URL= - pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow + # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later + # examples install doesn't try to downgrade it from the public CDN. + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow # Disable MKL to avoid duplicate target error when conda has multiple MKL installations export USE_MKL=OFF ./install_executorch.sh @@ -513,7 +515,9 @@ jobs: # in-cluster pypi-cache and drop the default cpu extra-index so the cuda # torch wheel is the only candidate. export PIP_EXTRA_INDEX_URL= - pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 fsspec numpy pillow + # fsspec is pinned to satisfy datasets' fsspec[http]<=2025.3.0 so the later + # examples install doesn't try to downgrade it from the public CDN. + pip install filelock typing-extensions "setuptools<82" sympy networkx jinja2 "fsspec[http]<=2025.3.0" numpy pillow source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" test-cuda-pybind: