diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
new file mode 100644
index 000000000..4cfbdc7f0
--- /dev/null
+++ b/.github/copy-pr-bot.yaml
@@ -0,0 +1,3 @@
+enabled: true
+auto_sync_draft: false
+auto_sync_ready: true
diff --git a/.github/workflows/e2e-gpu-test.yaml b/.github/workflows/e2e-gpu-test.yaml
new file mode 100644
index 000000000..9feda3769
--- /dev/null
+++ b/.github/workflows/e2e-gpu-test.yaml
@@ -0,0 +1,89 @@
+name: GPU E2E Test
+
+on:
+  workflow_call:
+    inputs:
+      image-tag:
+        description: "Image tag to test (typically the commit SHA)"
+        required: true
+        type: string
+
+permissions:
+  contents: read
+  packages: read
+
+jobs:
+  e2e-gpu:
+    name: "E2E GPU (${{ matrix.name }})"
+    runs-on: ${{ matrix.runner }}
+    continue-on-error: ${{ matrix.experimental }}
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: linux-arm64
+            runner: linux-arm64-gpu-l4-latest-1
+            cluster: e2e-gpu-arm64
+            port: "8083"
+            experimental: false
+          - name: linux-amd64
+            runner: linux-amd64-gpu-rtxpro6000-latest-1
+            cluster: e2e-gpu-amd64
+            port: "8084"
+            experimental: false
+          - name: wsl-amd64
+            runner: wsl-amd64-gpu-rtxpro6000-latest-1
+            cluster: e2e-gpu-wsl
+            port: "8085"
+            experimental: true
+    container:
+      image: ghcr.io/nvidia/openshell/ci:latest
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --privileged
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    env:
+      MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      IMAGE_TAG: ${{ inputs.image-tag }}
+      OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell
+      OPENSHELL_REGISTRY_HOST: ghcr.io
+      OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
+      OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
+      OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
+      OPENSHELL_GATEWAY: ${{ matrix.cluster }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Log in to GHCR
+        # Pass credentials via env rather than interpolating ${{ }} into the
+        # script body, so the token never appears in the composed command text.
+        env:
+          GH_ACTOR: ${{ github.actor }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: echo "$GH_TOKEN" | docker login ghcr.io -u "$GH_ACTOR" --password-stdin
+
+      - name: Pull cluster image
+        run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
+
+      - name: Install Python dependencies and generate protobuf stubs
+        run: uv sync --frozen && mise run --no-prepare python:proto
+
+      - name: Bootstrap GPU cluster
+        env:
+          # NOTE(review): host.docker.internal only resolves out-of-the-box on
+          # Docker Desktop/WSL; native Linux engines need --add-host=host.docker.internal:host-gateway — confirm the runners' daemon config.
+          GATEWAY_HOST: host.docker.internal
+          GATEWAY_PORT: ${{ matrix.port }}
+          CLUSTER_NAME: ${{ matrix.cluster }}
+          # Passes --gpu to the gateway bootstrap so the cluster comes up with GPU passthrough enabled.
+          CLUSTER_GPU: "1"
+          SKIP_IMAGE_PUSH: "1"
+          SKIP_CLUSTER_IMAGE_BUILD: "1"
+          OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
+        run: mise run --no-prepare --skip-deps cluster
+
+      - name: Run tests
+        run: mise run --no-prepare --skip-deps e2e:python:gpu
diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml
new file mode 100644
index 000000000..df953b5d3
--- /dev/null
+++ b/.github/workflows/test-gpu.yml
@@ -0,0 +1,92 @@
+name: GPU Test
+
+on:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+  workflow_dispatch: {}
+  # Add `schedule:` here when we want nightly coverage from the same workflow.
+
+# GPU runners are scarce: collapse queued runs for the same ref so repeated
+# copy-pr-bot pushes to a pull-request/N branch don't stack expensive GPU jobs.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  pull-requests: read
+  packages: write
+
+jobs:
+  pr_metadata:
+    name: Resolve PR metadata
+    runs-on: ubuntu-latest
+    outputs:
+      should_run: ${{ steps.gate.outputs.should_run }}
+    steps:
+      - id: get_pr_info
+        if: github.event_name == 'push'
+        continue-on-error: true
+        # NOTE(review): @main is a mutable ref on a third-party action that
+        # feeds this security gate — pin it to a full commit SHA.
+        uses: nv-gha-runners/get-pr-info@main
+
+      - id: gate
+        shell: bash
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          GITHUB_SHA_VALUE: ${{ github.sha }}
+          GET_PR_INFO_OUTCOME: ${{ steps.get_pr_info.outcome }}
+          PR_INFO: ${{ steps.get_pr_info.outputs.pr-info }}
+        run: |
+          if [ "$EVENT_NAME" != "push" ]; then
+            echo "should_run=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          if [ "$GET_PR_INFO_OUTCOME" != "success" ]; then
+            echo "should_run=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # An empty/null payload would otherwise make jq fail and (since
+          # `shell: bash` runs with -e) turn a "skip" into a hard job failure.
+          if [ -z "$PR_INFO" ] || [ "$PR_INFO" = "null" ]; then
+            echo "should_run=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          head_sha="$(jq -r '.head.sha' <<< "$PR_INFO")"
+          has_gpu_label="$(jq -r '[.labels[].name] | index("test:e2e-gpu") != null' <<< "$PR_INFO")"
+
+          # Only trust copied pull-request/* pushes that still match the PR head SHA
+          # and are explicitly labeled for GPU coverage.
+          if [ "$head_sha" = "$GITHUB_SHA_VALUE" ] && [ "$has_gpu_label" = "true" ]; then
+            should_run=true
+          else
+            should_run=false
+          fi
+
+          echo "should_run=$should_run" >> "$GITHUB_OUTPUT"
+
+  build-gateway:
+    needs: [pr_metadata]
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    uses: ./.github/workflows/docker-build.yml
+    with:
+      component: gateway
+
+  build-cluster:
+    needs: [pr_metadata]
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    uses: ./.github/workflows/docker-build.yml
+    with:
+      component: cluster
+
+  e2e-gpu:
+    needs: [pr_metadata, build-gateway, build-cluster]
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    uses: ./.github/workflows/e2e-gpu-test.yaml
+    with:
+      image-tag: ${{ github.sha }}