From b8cfa9f9f8578bfb2381357d5e3dbd5e83e0777d Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 6 Apr 2026 10:54:36 -0700 Subject: [PATCH 1/7] ci(actions): add manual gpu e2e workflow --- .github/workflows/gpu-e2e.yml | 80 +++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 .github/workflows/gpu-e2e.yml diff --git a/.github/workflows/gpu-e2e.yml b/.github/workflows/gpu-e2e.yml new file mode 100644 index 000000000..d045d26e8 --- /dev/null +++ b/.github/workflows/gpu-e2e.yml @@ -0,0 +1,80 @@ +name: GPU E2E Test + +on: + workflow_dispatch: + inputs: + gpu-runner: + description: GPU runner label to target + required: true + type: choice + default: linux-arm64-gpu-l4-latest-1 + options: + - linux-arm64-gpu-l4-latest-1 + - linux-amd64-gpu-rtxpro6000-latest-1 + +permissions: + contents: read + packages: write + +jobs: + build-gateway: + uses: ./.github/workflows/docker-build.yml + with: + component: gateway + platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }} + runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }} + + build-cluster: + uses: ./.github/workflows/docker-build.yml + with: + component: cluster + platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }} + runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }} + + e2e-gpu: + name: GPU E2E + needs: [build-gateway, build-cluster] + runs-on: ${{ inputs.gpu-runner }} + timeout-minutes: 30 + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --privileged + volumes: + - /var/run/docker.sock:/var/run/docker.sock + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_TAG: ${{ github.sha }} + OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell + OPENSHELL_REGISTRY_HOST: ghcr.io + 
OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell + OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} + OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + OPENSHELL_GATEWAY: e2e-gpu + steps: + - uses: actions/checkout@v4 + + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Pull cluster image + run: docker pull ghcr.io/nvidia/openshell/cluster:${{ github.sha }} + + - name: Install Python dependencies and generate protobuf stubs + run: uv sync --frozen && mise run --no-prepare python:proto + + - name: Bootstrap GPU cluster + env: + GATEWAY_HOST: host.docker.internal + GATEWAY_PORT: "8080" + CLUSTER_NAME: e2e-gpu + CLUSTER_GPU: "1" + SKIP_IMAGE_PUSH: "1" + SKIP_CLUSTER_IMAGE_BUILD: "1" + OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ github.sha }} + run: mise run --no-prepare --skip-deps cluster + + - name: Run GPU E2E test + run: mise run --no-prepare --skip-deps e2e:python:gpu From 4c9edbf6f417674ae77d6fda9ace2ed7ab3e86e8 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 6 Apr 2026 10:57:24 -0700 Subject: [PATCH 2/7] ci(actions): add manual gpu branch e2e path --- .github/workflows/branch-e2e.yml | 36 ++++++++++++++ .github/workflows/e2e-test.yml | 50 +++++++++++++++++--- .github/workflows/gpu-e2e.yml | 80 -------------------------------- 3 files changed, 80 insertions(+), 86 deletions(-) delete mode 100644 .github/workflows/gpu-e2e.yml diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index ad53bb635..cafdf4929 100644 --- a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -3,6 +3,16 @@ name: Branch E2E Checks on: pull_request: types: [opened, synchronize, reopened, labeled] + workflow_dispatch: + inputs: + gpu-runner: + description: GPU runner label to target + required: true + type: choice + default: linux-arm64-gpu-l4-latest-1 + options: + - linux-arm64-gpu-l4-latest-1 + - 
linux-amd64-gpu-rtxpro6000-latest-1 permissions: contents: read @@ -26,8 +36,34 @@ jobs: runner: build-arm64 e2e: + if: github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'test:e2e') needs: [build-gateway, build-cluster] uses: ./.github/workflows/e2e-test.yml with: image-tag: ${{ github.sha }} runner: build-arm64 + + build-gateway-gpu: + if: github.event_name == 'workflow_dispatch' + uses: ./.github/workflows/docker-build.yml + with: + component: gateway + platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }} + runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }} + + build-cluster-gpu: + if: github.event_name == 'workflow_dispatch' + uses: ./.github/workflows/docker-build.yml + with: + component: cluster + platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }} + runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }} + + e2e-gpu: + if: github.event_name == 'workflow_dispatch' + needs: [build-gateway-gpu, build-cluster-gpu] + uses: ./.github/workflows/e2e-test.yml + with: + image-tag: ${{ github.sha }} + runner: ${{ inputs.gpu-runner }} + suite: python-gpu diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index a89f4508f..057cef1a0 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -1,6 +1,29 @@ name: E2E Test on: + workflow_dispatch: + inputs: + image-tag: + description: "Image tag to test (typically the commit SHA)" + required: false + type: string + default: "" + runner: + description: "GitHub Actions runner label" + required: false + type: string + default: "build-amd64" + suite: + description: "E2E suite to run" + required: false + type: choice + default: all + options: + - all + - python + - rust + - gateway-resume + - python-gpu workflow_call: inputs: image-tag: @@ -12,6 +35,11 @@ 
on: required: false type: string default: "build-amd64" + suite: + description: "E2E suite to run" + required: false + type: string + default: "all" permissions: contents: read @@ -20,6 +48,7 @@ permissions: jobs: e2e: name: "E2E (${{ matrix.suite }})" + if: ${{ inputs.suite == 'all' || inputs.suite == matrix.suite }} runs-on: ${{ inputs.runner }} timeout-minutes: 30 strategy: @@ -29,15 +58,23 @@ jobs: - suite: python cluster: e2e-python port: "8080" + cluster_gpu: "0" cmd: "mise run --no-prepare --skip-deps e2e:python" - suite: rust cluster: e2e-rust port: "8081" + cluster_gpu: "0" cmd: "mise run --no-prepare --skip-deps e2e:rust" - suite: gateway-resume cluster: e2e-resume port: "8082" + cluster_gpu: "0" cmd: "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test gateway_resume" + - suite: python-gpu + cluster: e2e-gpu + port: "8083" + cluster_gpu: "1" + cmd: "mise run --no-prepare --skip-deps e2e:python:gpu" container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -48,7 +85,7 @@ jobs: - /var/run/docker.sock:/var/run/docker.sock env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IMAGE_TAG: ${{ inputs.image-tag }} + IMAGE_TAG: ${{ inputs.image-tag || github.sha }} OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell OPENSHELL_REGISTRY_HOST: ghcr.io OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell @@ -62,18 +99,18 @@ jobs: run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - name: Pull cluster image - run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} + run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} - name: Install Python dependencies and generate protobuf stubs - if: matrix.suite == 'python' + if: matrix.suite == 'python' || matrix.suite == 'python-gpu' run: uv sync --frozen && mise run --no-prepare python:proto - name: Build Rust CLI - if: matrix.suite != 'python' + if: matrix.suite == 'rust' || matrix.suite == 'gateway-resume' 
run: cargo build -p openshell-cli --features openshell-core/dev-settings - name: Install SSH client - if: matrix.suite != 'python' + if: matrix.suite == 'rust' || matrix.suite == 'gateway-resume' run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/* - name: Bootstrap cluster @@ -81,9 +118,10 @@ jobs: GATEWAY_HOST: host.docker.internal GATEWAY_PORT: ${{ matrix.port }} CLUSTER_NAME: ${{ matrix.cluster }} + CLUSTER_GPU: ${{ matrix.cluster_gpu }} SKIP_IMAGE_PUSH: "1" SKIP_CLUSTER_IMAGE_BUILD: "1" - OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} + OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} run: mise run --no-prepare --skip-deps cluster - name: Run tests diff --git a/.github/workflows/gpu-e2e.yml b/.github/workflows/gpu-e2e.yml deleted file mode 100644 index d045d26e8..000000000 --- a/.github/workflows/gpu-e2e.yml +++ /dev/null @@ -1,80 +0,0 @@ -name: GPU E2E Test - -on: - workflow_dispatch: - inputs: - gpu-runner: - description: GPU runner label to target - required: true - type: choice - default: linux-arm64-gpu-l4-latest-1 - options: - - linux-arm64-gpu-l4-latest-1 - - linux-amd64-gpu-rtxpro6000-latest-1 - -permissions: - contents: read - packages: write - -jobs: - build-gateway: - uses: ./.github/workflows/docker-build.yml - with: - component: gateway - platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }} - runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }} - - build-cluster: - uses: ./.github/workflows/docker-build.yml - with: - component: cluster - platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }} - runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }} - - e2e-gpu: - name: GPU E2E - needs: [build-gateway, build-cluster] - 
runs-on: ${{ inputs.gpu-runner }} - timeout-minutes: 30 - container: - image: ghcr.io/nvidia/openshell/ci:latest - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - options: --privileged - volumes: - - /var/run/docker.sock:/var/run/docker.sock - env: - MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IMAGE_TAG: ${{ github.sha }} - OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell - OPENSHELL_REGISTRY_HOST: ghcr.io - OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell - OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} - OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} - OPENSHELL_GATEWAY: e2e-gpu - steps: - - uses: actions/checkout@v4 - - - name: Log in to GHCR - run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - - name: Pull cluster image - run: docker pull ghcr.io/nvidia/openshell/cluster:${{ github.sha }} - - - name: Install Python dependencies and generate protobuf stubs - run: uv sync --frozen && mise run --no-prepare python:proto - - - name: Bootstrap GPU cluster - env: - GATEWAY_HOST: host.docker.internal - GATEWAY_PORT: "8080" - CLUSTER_NAME: e2e-gpu - CLUSTER_GPU: "1" - SKIP_IMAGE_PUSH: "1" - SKIP_CLUSTER_IMAGE_BUILD: "1" - OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ github.sha }} - run: mise run --no-prepare --skip-deps cluster - - - name: Run GPU E2E test - run: mise run --no-prepare --skip-deps e2e:python:gpu From ed3e12e83bcc188b65c8f26098a47bfca87d636d Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 6 Apr 2026 10:58:33 -0700 Subject: [PATCH 3/7] ci(actions): split gpu e2e into dedicated job --- .github/workflows/e2e-test.yml | 64 ++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 057cef1a0..cd3bcec36 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -20,9 +20,6 @@ on: default: all 
options: - all - - python - - rust - - gateway-resume - python-gpu workflow_call: inputs: @@ -48,7 +45,7 @@ permissions: jobs: e2e: name: "E2E (${{ matrix.suite }})" - if: ${{ inputs.suite == 'all' || inputs.suite == matrix.suite }} + if: inputs.suite != 'python-gpu' runs-on: ${{ inputs.runner }} timeout-minutes: 30 strategy: @@ -58,23 +55,15 @@ jobs: - suite: python cluster: e2e-python port: "8080" - cluster_gpu: "0" cmd: "mise run --no-prepare --skip-deps e2e:python" - suite: rust cluster: e2e-rust port: "8081" - cluster_gpu: "0" cmd: "mise run --no-prepare --skip-deps e2e:rust" - suite: gateway-resume cluster: e2e-resume port: "8082" - cluster_gpu: "0" cmd: "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test gateway_resume" - - suite: python-gpu - cluster: e2e-gpu - port: "8083" - cluster_gpu: "1" - cmd: "mise run --no-prepare --skip-deps e2e:python:gpu" container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -102,7 +91,7 @@ jobs: run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} - name: Install Python dependencies and generate protobuf stubs - if: matrix.suite == 'python' || matrix.suite == 'python-gpu' + if: matrix.suite == 'python' run: uv sync --frozen && mise run --no-prepare python:proto - name: Build Rust CLI @@ -118,7 +107,6 @@ jobs: GATEWAY_HOST: host.docker.internal GATEWAY_PORT: ${{ matrix.port }} CLUSTER_NAME: ${{ matrix.cluster }} - CLUSTER_GPU: ${{ matrix.cluster_gpu }} SKIP_IMAGE_PUSH: "1" SKIP_CLUSTER_IMAGE_BUILD: "1" OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} @@ -126,3 +114,51 @@ jobs: - name: Run tests run: ${{ matrix.cmd }} + + e2e-gpu: + name: "E2E (python-gpu)" + if: inputs.suite == 'python-gpu' + runs-on: ${{ inputs.runner }} + timeout-minutes: 30 + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --privileged + 
volumes: + - /var/run/docker.sock:/var/run/docker.sock + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_TAG: ${{ inputs.image-tag || github.sha }} + OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell + OPENSHELL_REGISTRY_HOST: ghcr.io + OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell + OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} + OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + OPENSHELL_GATEWAY: e2e-gpu + steps: + - uses: actions/checkout@v4 + + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Pull cluster image + run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} + + - name: Install Python dependencies and generate protobuf stubs + run: uv sync --frozen && mise run --no-prepare python:proto + + - name: Bootstrap GPU cluster + env: + GATEWAY_HOST: host.docker.internal + GATEWAY_PORT: "8083" + CLUSTER_NAME: e2e-gpu + CLUSTER_GPU: "1" + SKIP_IMAGE_PUSH: "1" + SKIP_CLUSTER_IMAGE_BUILD: "1" + OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} + run: mise run --no-prepare --skip-deps cluster + + - name: Run tests + run: mise run --no-prepare --skip-deps e2e:python:gpu From d819287efcf09cc3843ebdc79b4ffb078087b4e8 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 6 Apr 2026 12:10:15 -0700 Subject: [PATCH 4/7] ci(actions): split gpu checks into dedicated workflows --- .github/workflows/branch-e2e.yml | 36 ------------- .github/workflows/e2e-gpu-test.yaml | 82 ++++++++++++++++++++++++++++ .github/workflows/e2e-test.yml | 84 ++--------------------------- .github/workflows/test-gpu.yml | 31 +++++++++++ 4 files changed, 118 insertions(+), 115 deletions(-) create mode 100644 .github/workflows/e2e-gpu-test.yaml create mode 100644 .github/workflows/test-gpu.yml diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index cafdf4929..ad53bb635 100644 --- 
a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -3,16 +3,6 @@ name: Branch E2E Checks on: pull_request: types: [opened, synchronize, reopened, labeled] - workflow_dispatch: - inputs: - gpu-runner: - description: GPU runner label to target - required: true - type: choice - default: linux-arm64-gpu-l4-latest-1 - options: - - linux-arm64-gpu-l4-latest-1 - - linux-amd64-gpu-rtxpro6000-latest-1 permissions: contents: read @@ -36,34 +26,8 @@ jobs: runner: build-arm64 e2e: - if: github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'test:e2e') needs: [build-gateway, build-cluster] uses: ./.github/workflows/e2e-test.yml with: image-tag: ${{ github.sha }} runner: build-arm64 - - build-gateway-gpu: - if: github.event_name == 'workflow_dispatch' - uses: ./.github/workflows/docker-build.yml - with: - component: gateway - platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }} - runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }} - - build-cluster-gpu: - if: github.event_name == 'workflow_dispatch' - uses: ./.github/workflows/docker-build.yml - with: - component: cluster - platform: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'linux/arm64' || 'linux/amd64' }} - runner: ${{ inputs.gpu-runner == 'linux-arm64-gpu-l4-latest-1' && 'build-arm64' || 'build-amd64' }} - - e2e-gpu: - if: github.event_name == 'workflow_dispatch' - needs: [build-gateway-gpu, build-cluster-gpu] - uses: ./.github/workflows/e2e-test.yml - with: - image-tag: ${{ github.sha }} - runner: ${{ inputs.gpu-runner }} - suite: python-gpu diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml new file mode 100644 index 000000000..9feda3769 --- /dev/null +++ b/.github/workflows/e2e-gpu-test.yaml @@ -0,0 +1,82 @@ +name: GPU E2E Test + +on: + workflow_call: + inputs: + image-tag: + description: "Image tag to test 
(typically the commit SHA)" + required: true + type: string + +permissions: + contents: read + packages: read + +jobs: + e2e-gpu: + name: "E2E GPU (${{ matrix.name }})" + runs-on: ${{ matrix.runner }} + continue-on-error: ${{ matrix.experimental }} + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: + include: + - name: linux-arm64 + runner: linux-arm64-gpu-l4-latest-1 + cluster: e2e-gpu-arm64 + port: "8083" + experimental: false + - name: linux-amd64 + runner: linux-amd64-gpu-rtxpro6000-latest-1 + cluster: e2e-gpu-amd64 + port: "8084" + experimental: false + - name: wsl-amd64 + runner: wsl-amd64-gpu-rtxpro6000-latest-1 + cluster: e2e-gpu-wsl + port: "8085" + experimental: true + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --privileged + volumes: + - /var/run/docker.sock:/var/run/docker.sock + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_TAG: ${{ inputs.image-tag }} + OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell + OPENSHELL_REGISTRY_HOST: ghcr.io + OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell + OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} + OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + OPENSHELL_GATEWAY: ${{ matrix.cluster }} + steps: + - uses: actions/checkout@v4 + + - name: Log in to GHCR + run: echo "$OPENSHELL_REGISTRY_PASSWORD" | docker login "$OPENSHELL_REGISTRY_HOST" -u "$OPENSHELL_REGISTRY_USERNAME" --password-stdin # reuse the job-level env so the token/actor are never expanded into the script text + + - name: Pull cluster image + run: docker pull "${OPENSHELL_REGISTRY}/cluster:${IMAGE_TAG}" # IMAGE_TAG is inputs.image-tag via job env; avoids splicing a caller-supplied input into shell + + - name: Install Python dependencies and generate protobuf stubs + run: uv sync --frozen && mise run --no-prepare python:proto + + - name: Bootstrap GPU cluster + env: + GATEWAY_HOST: host.docker.internal + GATEWAY_PORT: ${{ matrix.port }} + CLUSTER_NAME: ${{ matrix.cluster }} + # Passes --gpu to the gateway bootstrap so the cluster comes up with GPU passthrough enabled. 
+ CLUSTER_GPU: "1" + SKIP_IMAGE_PUSH: "1" + SKIP_CLUSTER_IMAGE_BUILD: "1" + OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} + run: mise run --no-prepare --skip-deps cluster + + - name: Run tests + run: mise run --no-prepare --skip-deps e2e:python:gpu diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index cd3bcec36..a89f4508f 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -1,26 +1,6 @@ name: E2E Test on: - workflow_dispatch: - inputs: - image-tag: - description: "Image tag to test (typically the commit SHA)" - required: false - type: string - default: "" - runner: - description: "GitHub Actions runner label" - required: false - type: string - default: "build-amd64" - suite: - description: "E2E suite to run" - required: false - type: choice - default: all - options: - - all - - python-gpu workflow_call: inputs: image-tag: @@ -32,11 +12,6 @@ on: required: false type: string default: "build-amd64" - suite: - description: "E2E suite to run" - required: false - type: string - default: "all" permissions: contents: read @@ -45,7 +20,6 @@ permissions: jobs: e2e: name: "E2E (${{ matrix.suite }})" - if: inputs.suite != 'python-gpu' runs-on: ${{ inputs.runner }} timeout-minutes: 30 strategy: @@ -74,7 +48,7 @@ jobs: - /var/run/docker.sock:/var/run/docker.sock env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IMAGE_TAG: ${{ inputs.image-tag || github.sha }} + IMAGE_TAG: ${{ inputs.image-tag }} OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell OPENSHELL_REGISTRY_HOST: ghcr.io OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell @@ -88,18 +62,18 @@ jobs: run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - name: Pull cluster image - run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} + run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} - name: Install Python dependencies and 
generate protobuf stubs if: matrix.suite == 'python' run: uv sync --frozen && mise run --no-prepare python:proto - name: Build Rust CLI - if: matrix.suite == 'rust' || matrix.suite == 'gateway-resume' + if: matrix.suite != 'python' run: cargo build -p openshell-cli --features openshell-core/dev-settings - name: Install SSH client - if: matrix.suite == 'rust' || matrix.suite == 'gateway-resume' + if: matrix.suite != 'python' run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/* - name: Bootstrap cluster @@ -109,56 +83,8 @@ jobs: CLUSTER_NAME: ${{ matrix.cluster }} SKIP_IMAGE_PUSH: "1" SKIP_CLUSTER_IMAGE_BUILD: "1" - OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} + OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} run: mise run --no-prepare --skip-deps cluster - name: Run tests run: ${{ matrix.cmd }} - - e2e-gpu: - name: "E2E (python-gpu)" - if: inputs.suite == 'python-gpu' - runs-on: ${{ inputs.runner }} - timeout-minutes: 30 - container: - image: ghcr.io/nvidia/openshell/ci:latest - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - options: --privileged - volumes: - - /var/run/docker.sock:/var/run/docker.sock - env: - MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IMAGE_TAG: ${{ inputs.image-tag || github.sha }} - OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell - OPENSHELL_REGISTRY_HOST: ghcr.io - OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell - OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} - OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} - OPENSHELL_GATEWAY: e2e-gpu - steps: - - uses: actions/checkout@v4 - - - name: Log in to GHCR - run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - - name: Pull cluster image - run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} - - - name: Install 
Python dependencies and generate protobuf stubs - run: uv sync --frozen && mise run --no-prepare python:proto - - - name: Bootstrap GPU cluster - env: - GATEWAY_HOST: host.docker.internal - GATEWAY_PORT: "8083" - CLUSTER_NAME: e2e-gpu - CLUSTER_GPU: "1" - SKIP_IMAGE_PUSH: "1" - SKIP_CLUSTER_IMAGE_BUILD: "1" - OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag || github.sha }} - run: mise run --no-prepare --skip-deps cluster - - - name: Run tests - run: mise run --no-prepare --skip-deps e2e:python:gpu diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml new file mode 100644 index 000000000..a7aa77778 --- /dev/null +++ b/.github/workflows/test-gpu.yml @@ -0,0 +1,31 @@ +name: GPU Test + +on: + pull_request: + types: [opened, synchronize, reopened, labeled] + workflow_dispatch: {} + # Add `schedule:` here when we want nightly coverage from the same workflow. + +permissions: + contents: read + packages: write + +jobs: + build-gateway: + if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'test:e2e-gpu') + uses: ./.github/workflows/docker-build.yml + with: + component: gateway + + build-cluster: + if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'test:e2e-gpu') + uses: ./.github/workflows/docker-build.yml + with: + component: cluster + + e2e-gpu: + if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'test:e2e-gpu') + needs: [build-gateway, build-cluster] + uses: ./.github/workflows/e2e-gpu-test.yaml + with: + image-tag: ${{ github.sha }} From 39c72353613a3bc05533a8d8eb92c33fffe0d49c Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 6 Apr 2026 12:36:20 -0700 Subject: [PATCH 5/7] ci(actions): enable copy-pr-bot for PR testing --- .github/copy-pr-bot.yaml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .github/copy-pr-bot.yaml diff --git a/.github/copy-pr-bot.yaml 
b/.github/copy-pr-bot.yaml new file mode 100644 index 000000000..4cfbdc7f0 --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,3 @@ +enabled: true +auto_sync_draft: false +auto_sync_ready: true From 75a53f13f136431ab1c02d5f1bf99c399e911d60 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 6 Apr 2026 12:55:06 -0700 Subject: [PATCH 6/7] test: verify ssh commit signing From aa65cceadc31c5bf1b2dd3b21afc1512dd264071 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Mon, 6 Apr 2026 13:13:17 -0700 Subject: [PATCH 7/7] ci(actions): gate gpu tests on trusted PR branches --- .github/workflows/test-gpu.yml | 58 ++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index a7aa77778..df953b5d3 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -1,31 +1,77 @@ name: GPU Test on: - pull_request: - types: [opened, synchronize, reopened, labeled] + push: + branches: + - "pull-request/[0-9]+" workflow_dispatch: {} # Add `schedule:` here when we want nightly coverage from the same workflow. 
permissions: contents: read + pull-requests: read packages: write jobs: + pr_metadata: + name: Resolve PR metadata + runs-on: ubuntu-latest + outputs: + should_run: ${{ steps.gate.outputs.should_run }} + steps: + - id: get_pr_info + if: github.event_name == 'push' + continue-on-error: true + uses: nv-gha-runners/get-pr-info@main # TODO(review): pin to a full commit SHA; a mutable ref lets upstream change what this trust gate executes + + - id: gate + shell: bash + env: + EVENT_NAME: ${{ github.event_name }} + GITHUB_SHA_VALUE: ${{ github.sha }} + GET_PR_INFO_OUTCOME: ${{ steps.get_pr_info.outcome }} + PR_INFO: ${{ steps.get_pr_info.outputs.pr-info }} + run: | + if [ "$EVENT_NAME" != "push" ]; then + echo "should_run=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + + if [ "$GET_PR_INFO_OUTCOME" != "success" ]; then + echo "should_run=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + + head_sha="$(jq -r '.head.sha' <<< "$PR_INFO")" + has_gpu_label="$(jq -r '[.labels[]?.name] | index("test:e2e-gpu") != null' <<< "$PR_INFO")" # labels[]? gives "false" rather than a jq error if .labels is missing or null + + # Only trust copied pull-request/* pushes that still match the PR head SHA + # and are explicitly labeled for GPU coverage. 
+ if [ "$head_sha" = "$GITHUB_SHA_VALUE" ] && [ "$has_gpu_label" = "true" ]; then + should_run=true + else + should_run=false + fi + + echo "should_run=$should_run" >> "$GITHUB_OUTPUT" + build-gateway: - if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'test:e2e-gpu') + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' uses: ./.github/workflows/docker-build.yml with: component: gateway build-cluster: - if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'test:e2e-gpu') + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' uses: ./.github/workflows/docker-build.yml with: component: cluster e2e-gpu: - if: github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'test:e2e-gpu') - needs: [build-gateway, build-cluster] + needs: [pr_metadata, build-gateway, build-cluster] + if: needs.pr_metadata.outputs.should_run == 'true' uses: ./.github/workflows/e2e-gpu-test.yaml with: image-tag: ${{ github.sha }}