Merged

30 commits
bcc35bf
test: gpu based ARC from cncf instead of standalone vm
jaiakash Jan 5, 2026
f392c21
fix: removed label for gpu e2e test
jaiakash Jan 6, 2026
2959582
fix: remove delete cluster action
jaiakash Jan 6, 2026
d20c893
fix: rm separate script, rm path and nvidia smi command for testing
jaiakash Jan 7, 2026
71b83ff
fix: using nvkind with sudo
jaiakash Jan 7, 2026
1fb948c
fix: move nvkind from noexec to local/bin
jaiakash Jan 7, 2026
ec90b13
tmp: install nvkind, as the one from ARC is not working
jaiakash Jan 7, 2026
1c9cf3a
fix: run the commands as sudo (check https://github.com/NVIDIA/nvkind…
jaiakash Jan 7, 2026
c17e0c0
fix: nvkind as sudo
jaiakash Jan 7, 2026
94a0da0
test: ignore the patch error
jaiakash Jan 13, 2026
f15ca1c
fix: downgrade the version for nvidia ctk
jaiakash Jan 16, 2026
2983295
add: service restart
jaiakash Jan 16, 2026
fa261f1
fix: patch version for nctk
jaiakash Jan 16, 2026
5fee385
fix: command
jaiakash Jan 16, 2026
62cd2fb
refactor: split into different script
jaiakash Jan 16, 2026
04edc60
chore: transfer to legacy mode
jaiakash Jan 16, 2026
d93b29d
fix: legacy mode for nctk
jaiakash Jan 16, 2026
a2106ef
chore: downgrade nctk
jaiakash Jan 16, 2026
c300fa7
fix: non root kubectl
jaiakash Jan 16, 2026
0f81f11
fix: helm dirs
jaiakash Jan 16, 2026
db14744
chore: refactored the code
jaiakash Jan 16, 2026
27eedb9
chore: rm separate script, migrated to prod gpu arc, and other misc
jaiakash Jan 22, 2026
facda40
chore: rm sudo for kind
jaiakash Jan 22, 2026
cd59f0c
add: wait for qwen to complete
jaiakash Jan 22, 2026
36ec784
chore: fix qwen nb and fix gpu operator to use host driver
jaiakash Jan 23, 2026
e27657c
test: use default value for ntk toolkit
jaiakash Feb 8, 2026
ef72d9e
hotfix: patch CRDs to run on GPU nodes (Check #3067)
jaiakash Feb 8, 2026
f43c8a0
fix: the patching to model and initializers
jaiakash Feb 8, 2026
f742201
fix: single node for gpu e2e test
jaiakash Feb 8, 2026
45af997
revert: jax eg
jaiakash Feb 8, 2026
38 changes: 6 additions & 32 deletions .github/workflows/test-e2e-gpu.yaml
@@ -11,7 +11,9 @@ permissions:
jobs:
gpu-e2e-test:
name: GPU E2E Test
runs-on: oracle-vm-16cpu-a10gpu-240gb
runs-on:
labels: oracle-vm-gpu-a10-1
group: GPUs

env:
GOPATH: ${{ github.workspace }}/go
@@ -26,54 +28,36 @@ jobs:
kubernetes-version: ["1.33.1"]

steps:
- name: Check GPU label
id: check-label
run: |
if [[ "${{ join(github.event.pull_request.labels.*.name, ',') }}" != *"ok-to-test-gpu-runner"* ]]; then
echo "✅ Skipping GPU E2E tests (label not present)."
echo "skip=true" >> $GITHUB_OUTPUT
exit 0
else
echo "Label found. Requesting environment approval to run GPU tests."
echo "skip=false" >> $GITHUB_OUTPUT
fi

- name: Check out code
if: steps.check-label.outputs.skip == 'false'
uses: actions/checkout@v6
with:
ref: ${{ github.event.pull_request.head.sha }}
path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer

- name: Setup Go
if: steps.check-label.outputs.skip == 'false'
uses: actions/setup-go@v6
with:
go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod

- name: Setup Python
if: steps.check-label.outputs.skip == 'false'
uses: actions/setup-python@v6
with:
python-version: 3.11

- name: Install dependencies
if: steps.check-label.outputs.skip == 'false'
run: |
pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
pip install git+https://github.com/kubeflow/sdk.git@main

- name: Setup cluster with GPU support using nvidia/kind
if: steps.check-label.outputs.skip == 'false'
- name: Setup GPU cluster with nvkind
run: |
make test-e2e-setup-gpu-cluster K8S_VERSION=${{ matrix.kubernetes-version }}

- name: Run e2e test on GPU cluster
if: steps.check-label.outputs.skip == 'false'
run: |
mkdir -p artifacts/notebooks
make test-e2e-notebook NOTEBOOK_INPUT=./examples/torchtune/qwen2_5/qwen2.5-1.5B-with-alpaca.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_qwen2_5_with_alpaca-trainjob-yaml.ipynb TIMEOUT=900
make test-e2e-notebook NOTEBOOK_INPUT=./examples/jax/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_jax_mnist.ipynb PAPERMILL_PARAMS="-p num_cpu 3 -p num_gpu 1" TIMEOUT=600
make test-e2e-notebook NOTEBOOK_INPUT=./examples/torchtune/qwen2_5/qwen2.5-1.5B-with-alpaca.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_qwen2_5_with_alpaca-trainjob-yaml.ipynb TIMEOUT=600
make test-e2e-notebook NOTEBOOK_INPUT=./examples/jax/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_jax_mnist.ipynb PAPERMILL_PARAMS="-p num_cpu 8 -p num_gpu 1 -p num_nodes 1" TIMEOUT=600

- name: Upload Artifacts to GitHub
if: always()
@@ -82,13 +66,3 @@ jobs:
name: ${{ matrix.kubernetes-version }}
path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/artifacts/*
retention-days: 1

delete-kind-cluster:
name: Delete kind Cluster
runs-on: oracle-vm-16cpu-a10gpu-240gb
needs: [gpu-e2e-test]
if: always()
steps:
- name: Delete any existing kind cluster
run: |
sudo kind delete cluster --name kind-gpu && echo "kind cluster has been deleted" || echo "kind cluster doesn't exist"
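The `ok-to-test-gpu-runner` gate in the workflow above can be exercised offline with a minimal shell sketch. The label list here is hypothetical, and `$labels` stands in for the workflow expression `join(github.event.pull_request.labels.*.name, ',')`:

```shell
# Offline sketch of the workflow's PR-label gate (hypothetical label list).
# "$labels" stands in for join(github.event.pull_request.labels.*.name, ',').
labels="lgtm,ok-to-test-gpu-runner"
case "$labels" in
  *ok-to-test-gpu-runner*) decision="run" ;;
  *)                       decision="skip" ;;
esac
echo "$decision"   # prints "run" for this label list
```

Substring matching on the joined label string is exactly what the workflow's `[[ ... != *"ok-to-test-gpu-runner"* ]]` test does, so this mirrors the skip/run decision without needing a live PR.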
1 change: 1 addition & 0 deletions cmd/trainers/torchtune/requirements.txt
@@ -1,3 +1,4 @@
torchao>=0.9.0
torchtune==0.6.1
bitsandbytes>=0.41.1
kagglehub>=0.4.0
@jaiakash (Member, Author) Feb 8, 2026
We need to specifically add kagglehub 0.4.0, as there is a breaking change in the library.
Ref: Kaggle/kagglehub#268

Error:

Step: node-0, Status: Running, Devices: gpu x 1

Traceback (most recent call last):
  File "/opt/conda/bin/tune", line 3, in <module>
    from torchtune._cli.tune import main
  File "/opt/conda/lib/python3.11/site-packages/torchtune/_cli/tune.py", line 12, in <module>
    from torchtune._cli.download import Download
  File "/opt/conda/lib/python3.11/site-packages/torchtune/_cli/download.py", line 22, in <module>
    from kagglehub.auth import set_kaggle_credentials
ImportError: cannot import name 'set_kaggle_credentials' from 'kagglehub.auth' (/opt/conda/lib/python3.11/site-packages/kagglehub/auth.py)
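The API drift behind this pin can be sketched with a defensive import guard (a hypothetical sketch, not part of this PR): some kagglehub releases no longer expose `kagglehub.auth.set_kaggle_credentials`, which torchtune's CLI imports unconditionally, producing the traceback above.

```python
# Hypothetical guard for the API drift this pin works around: torchtune's CLI
# imports kagglehub.auth.set_kaggle_credentials unconditionally, so a release
# without that symbol crashes at import time (see the traceback above).
try:
    from kagglehub.auth import set_kaggle_credentials  # legacy API
except ImportError:  # kagglehub absent, or the symbol was removed
    set_kaggle_credentials = None

def kaggle_auth_available() -> bool:
    """True only when the legacy credential helper can actually be called."""
    return set_kaggle_credentials is not None
```

Pinning the requirement, as this diff does, is the more robust fix, since the guard would only defer the failure to the point where Kaggle credentials are needed.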

11 changes: 6 additions & 5 deletions examples/jax/image-classification/mnist.ipynb
@@ -26,7 +26,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -315,7 +315,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": null,
"metadata": {
"editable": true,
"slideshow": {
@@ -329,12 +329,13 @@
"source": [
"#parameters\n",
"num_cpu=3\n",
"num_gpu=0"
"num_gpu=0\n",
"num_nodes=3"
]
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": null,
"metadata": {
"editable": true,
"execution": {
@@ -369,7 +370,7 @@
" trainer=CustomTrainer(\n",
" func=jax_train_mnist,\n",
" # Set how many JAX nodes you want to use for distributed training.\n",
" num_nodes=3,\n",
" num_nodes=num_nodes,\n",
" resources_per_node=resources_per_node,\n",
" ),\n",
" runtime=\"jax-distributed\",\n",
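The notebook change above moves `num_nodes` into the tagged parameters cell so papermill can override it. Conceptually, papermill takes the cell's defaults and applies each `-p name value` flag on top before executing the notebook; a dependency-free sketch of that merge (names taken from mnist.ipynb, the merge function itself is hypothetical):

```python
# Sketch of papermill's parameter injection: the notebook's tagged
# "parameters" cell supplies defaults, and each "-p name value" flag
# overrides one of them before the notebook is executed.
defaults = {"num_cpu": 3, "num_gpu": 0, "num_nodes": 3}  # from mnist.ipynb

def apply_papermill_params(defaults: dict, overrides: dict) -> dict:
    """Merge CLI-supplied overrides over the notebook's default parameters."""
    merged = dict(defaults)
    merged.update(overrides)
    return merged

# Mirrors the workflow's PAPERMILL_PARAMS="-p num_cpu 8 -p num_gpu 1 -p num_nodes 1"
params = apply_papermill_params(defaults, {"num_cpu": 8, "num_gpu": 1, "num_nodes": 1})
```

This is why hardcoding `num_nodes=3` inside `CustomTrainer` had to go: only values read from the parameters cell can be overridden per-run, which the single-GPU CI job relies on to force `num_nodes=1`.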