From 51a0e16f6465b209543e9218a94e94f8fe0a6066 Mon Sep 17 00:00:00 2001 From: Yuriy Kohut Date: Thu, 25 Jun 2026 17:53:12 +0300 Subject: [PATCH 1/4] feat(ci): add unified "GCP: Build, Test and Publish" workflow Single workflow_dispatch chaining the three standalone GCP workflows: build (x86_64 + aarch64, gcp-build-steps) -> test (cloud-image-tests: smoke on both arches in parallel, then the per-shape matrices) -> publish each arch to almalinux-cloud (prod), one arch at a time. Inputs: all gcp-build.yml inputs, plus a new run_test gate (default true) to build+publish without testing, plus the gcp-test.yml inputs except version_major and arch (arch acts as ALL - both arches built/tested/ published). No gcp-publish.yml inputs are exposed: publish runs as a max-parallel:1 arch matrix and takes image_datetag from the build's YYYYMMDD date_stamp. Stage gating (job results): tests require both builds to succeed; publish requires the builds and, when run_test is true, all test jobs to pass - so run_test=false builds and publishes straight through. The standalone gcp-build.yml / gcp-test.yml / gcp-publish.yml are left unchanged. --- .github/workflows/gcp-build-test-publish.yml | 672 +++++++++++++++++++ 1 file changed, 672 insertions(+) create mode 100644 .github/workflows/gcp-build-test-publish.yml diff --git a/.github/workflows/gcp-build-test-publish.yml b/.github/workflows/gcp-build-test-publish.yml new file mode 100644 index 00000000..25605044 --- /dev/null +++ b/.github/workflows/gcp-build-test-publish.yml @@ -0,0 +1,672 @@ +name: "GCP: Build, Test and Publish" +run-name: >- + GCP: AlmaLinux ${{ inputs.version_major == '10-kitten' && 'Kitten 10' || inputs.version_major }} Build${{ inputs.run_test && ', Test and Publish' || ' and Publish' }} + +# Unified GCP pipeline in a single workflow_dispatch: +# 1. Build the images with Packer (x86_64 + aarch64), upload to the dev GCS +# buckets and publish a dev test image (gcp-build-steps). +# 2. 
Test every built image with Google cloud-image-tests (smoke tests on +# both arches in parallel, then the per-shape matrices). Gated by run_test. +# 3. Publish each arch to the almalinux-cloud (prod) project, one arch at a +# time. +# +# Stage gating: +# run_test=false -> tests are skipped; builds publish straight through. +# publish_images=false -> images are built (and tested) but not published. +# a test job fails -> publish is skipped (no untested image promoted). +# +# arch is fixed to ALL (both x86_64 and aarch64 are built, tested, published). +# image_datetag for publish is the build's YYYYMMDD date_stamp. + +on: + workflow_dispatch: + inputs: + + date_time_stamp: + description: 'Custom date+time stamp, YYYYMMDDhhmmss' + required: false + default: '' + + version_major: + description: 'AlmaLinux major version' + required: true + default: '10' + type: choice + options: + - 10-kitten + - 10 + - 9 + - 8 + + self-hosted: + description: "Build aarch64 image on self-hosted runner" + required: true + type: boolean + default: true + + store_as_artifact: + description: "Store images to the workflow Artifacts" + required: true + type: boolean + default: false + + upload_to_s3: + description: "Upload to S3 Bucket" + required: true + type: boolean + default: true + + run_test: + description: "Test the built images. Disable to build and publish without testing." + required: true + type: boolean + default: true + + # ---- [tests] settings (only used when run_test is true) ---- + cit_git_repo: + description: '[tests] owner/repo of a cloud-image-tests fork to build instead of the prebuilt image. Empty = gcr.io/compute-image-tools/cloud-image-tests:latest. owner/repo only (no full URL).' + required: false + default: '' + + cit_git_ref: + description: '[tests] Branch, tag, or commit SHA inside cit_git_repo. Empty = default branch. Ignored when cit_git_repo is empty.' 
+ required: false + default: '' + # ---- end [tests] settings ---- + + publish_images: + description: "Publish the built (and tested) images to the almalinux-cloud (prod) project" + required: true + type: boolean + default: true + + notify_mattermost: + description: "Send notification to Mattermost" + required: true + type: boolean + default: true + +env: + PACKER_GITHUB_API_TOKEN: ${{ secrets.GIT_HUB_TOKEN }} + # Default zone fallback list for cit-run-with-retry. Multi-region spread so a + # single-region capacity event doesn't fail the whole run. + GCP_DEFAULT_ZONES: "us-central1-a us-central1-b us-central1-c us-central1-f us-south1-a us-south1-b us-south1-c us-west1-a us-west1-b us-west1-c northamerica-northeast1-a northamerica-northeast1-b europe-west1-b europe-west1-c europe-west1-d europe-west2-a europe-west2-b europe-west2-c europe-west3-a europe-west3-b europe-west3-c europe-west4-a europe-west4-b europe-west4-c" + +jobs: + init-data: + name: Initialize common data + runs-on: ubuntu-24.04 + outputs: + time_stamp: ${{ steps.date-time-stamp.outputs.time_stamp }} + date_stamp: ${{ steps.date-time-stamp.outputs.date_stamp }} + image_path: ${{ steps.determine_image.outputs.image_path }} + steps: + - name: Date+time stamp + id: date-time-stamp + run: | + # date+time stamp, YYYYMMDDhhmmss + if [ "${{ inputs.date_time_stamp }}" != "" ]; then + date_time_stamp="${{ inputs.date_time_stamp }}" + else + date_time_stamp=$(date -u '+%Y%m%d%H%M%S') + fi + echo "time_stamp=${date_time_stamp}" >> $GITHUB_OUTPUT + + # date stamp, YYYYMMDD (used as the publish image_datetag) + date_stamp=${date_time_stamp:0:-6} + echo "date_stamp=${date_stamp}" >> "$GITHUB_OUTPUT" + + - name: Determine image to test + id: determine_image + run: | + # arch is fixed to ALL; the dev family path is arch-agnostic (the + # cit calls append -arm64 for aarch64). 
+ image_path="projects/almalinux-dev-images-469421/global/images/family/almalinux-${{ inputs.version_major }}" + echo "Determined image path: ${image_path}" + echo "image_path=${image_path}" >> $GITHUB_OUTPUT + + build-gcp-x86_64: + name: ${{ inputs.version_major }} gcp-x86_64 image + permissions: + id-token: write + contents: read + needs: [init-data] + runs-on: >- + ${{ + github.repository_owner == 'AlmaLinux' && + format('runs-on={0}/family=c7i.metal-24xl+c7a.metal-48xl+*8gd.metal*/image=ubuntu24-full-x64', github.run_id) + || + 'ubuntu-24.04' + }} + + env: + TIME_STAMP: ${{ needs.init-data.outputs.time_stamp }} + DATE_STAMP: ${{ needs.init-data.outputs.date_stamp }} + + steps: + - name: Checkout ${{ github.action_repository }} + uses: actions/checkout@v6 + + - uses: ./.github/actions/gcp-build-steps + name: ${{ inputs.version_major }} gcp-x86_64 image + with: + variant: ${{ inputs.version_major }} + arch: x86_64 + S3_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ vars.AWS_REGION }} + AWS_S3_BUCKET: ${{ vars.AWS_S3_BUCKET }} + MATTERMOST_WEBHOOK_URL: ${{ secrets.MATTERMOST_WEBHOOK_URL }} + MATTERMOST_CHANNEL: ${{ vars.MATTERMOST_CHANNEL }} + store_as_artifact: ${{ inputs.store_as_artifact }} + upload_to_s3: ${{ inputs.upload_to_s3 }} + notify_mattermost: ${{ inputs.notify_mattermost }} + runner: ${{ github.repository_owner == 'AlmaLinux' && 'aws-ec2' || 'gh_hosted' }} + env: + PACKER_GITHUB_API_TOKEN: ${{ secrets.GIT_HUB_TOKEN }} + + start-self-hosted-runner: + name: ${{ inputs.version_major }} gcp-aarch64 runner + if: ${{ inputs.self-hosted }} + runs-on: ubuntu-24.04 + needs: [init-data] + + steps: + - name: Setup and start runner + if: github.repository_owner != 'AlmaLinux' + uses: unblocked/ec2-action-builder@v1.12 + with: + github_token: ${{ secrets.GIT_HUB_TOKEN }} + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 
+ aws_region: ${{ vars.AWS_REGION }} + ec2_ami_id: ${{ secrets.EC2_AMI_ID_AL9_AARCH64 }} + + ec2_subnet_id: ${{ secrets.EC2_SUBNET_ID }} + ec2_security_group_id: ${{ secrets.EC2_SECURITY_GROUP_ID }} + ec2_instance_type: a1.metal + + ec2_root_disk_size_gb: "16" + ec2_root_disk_ebs_class: "gp3" + ec2_instance_ttl: 30 + ec2_spot_instance_strategy: None + ec2_instance_tags: > + [ + {"Key": "Project", "Value": "GitHub Actions Self-hosted Runners"} + ] + + build-gcp-aarch64: + name: ${{ inputs.version_major }} gcp-aarch64 image + permissions: + id-token: write + contents: read + if: ${{ inputs.self-hosted }} + needs: [init-data, start-self-hosted-runner] + runs-on: >- + ${{ + github.repository_owner == 'AlmaLinux' && + format('runs-on={0}/family=c7i.metal-24xl+c7a.metal-48xl+*8gd.metal*/image=ubuntu24-full-arm64', github.run_id) + || + github.run_id + }} + + env: + TIME_STAMP: ${{ needs.init-data.outputs.time_stamp }} + DATE_STAMP: ${{ needs.init-data.outputs.date_stamp }} + + steps: + - name: Checkout ${{ github.action_repository }} + uses: actions/checkout@v6 + + - uses: ./.github/actions/gcp-build-steps + name: ${{ inputs.version_major }} gcp-aarch64 image + with: + variant: ${{ inputs.version_major }} + arch: aarch64 + S3_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ vars.AWS_REGION }} + AWS_S3_BUCKET: ${{ vars.AWS_S3_BUCKET }} + MATTERMOST_WEBHOOK_URL: ${{ secrets.MATTERMOST_WEBHOOK_URL }} + MATTERMOST_CHANNEL: ${{ vars.MATTERMOST_CHANNEL }} + store_as_artifact: ${{ inputs.store_as_artifact }} + upload_to_s3: ${{ inputs.upload_to_s3 }} + notify_mattermost: ${{ inputs.notify_mattermost }} + runner: aws-ec2 + env: + PACKER_GITHUB_API_TOKEN: ${{ secrets.GIT_HUB_TOKEN }} + + # Optional: build cloud-image-tests from a user-supplied git repo+ref and + # publish a docker tarball as an artifact. Skipped when cit_git_repo is empty + # or testing is disabled. 
+ build-cit: + name: Build cloud-image-tests from source + if: ${{ inputs.run_test && inputs.cit_git_repo != '' }} + needs: [init-data] + runs-on: "${{ github.repository_owner == 'AlmaLinux' && format('runs-on={0}/family=m8azn/cpu=48/extras=tmpfs/spot=false/image=ubuntu24-full-x64', github.run_id) || 'ubuntu-24.04' }}" + steps: + - name: Show runs-on cost/performance summary + if: github.repository_owner == 'AlmaLinux' + uses: runs-on/action@v2 + + - name: Checkout CIT source + uses: actions/checkout@v6 + with: + repository: ${{ inputs.cit_git_repo }} + ref: ${{ inputs.cit_git_ref }} + path: cit-src + + - name: Build CIT image + shell: bash + run: docker build -t cit-custom:run-${{ github.run_id }} cit-src + + - name: Save image to tarball + shell: bash + run: docker save cit-custom:run-${{ github.run_id }} | zstd -T0 -3 --long=27 -o cit-image.tar.zst + + - name: Upload CIT image artifact + uses: actions/upload-artifact@v7 + with: + name: cit-custom-image + path: cit-image.tar.zst + retention-days: 1 + compression-level: 0 + + # Initial smoke tests against a high-concurrency shape, on both arches in + # parallel, against the just-built dev images. + test-gcp-initialtest: + name: AlmaLinux ${{ inputs.version_major }} ${{ matrix.arch }} Initial Tests + needs: [init-data, build-gcp-x86_64, build-gcp-aarch64, build-cit] + # Run only when testing is enabled and both image builds succeeded; accept + # build-cit as success or skipped (skipped whenever cit_git_repo is empty). 
+ if: >- + !cancelled() + && inputs.run_test + && needs.build-gcp-x86_64.result == 'success' + && needs.build-gcp-aarch64.result == 'success' + && (needs.build-cit.result == 'success' || needs.build-cit.result == 'skipped') + permissions: + id-token: write + contents: read + runs-on: ubuntu-24.04 + strategy: + fail-fast: false + matrix: + arch: + - x86_64 + - aarch64 + steps: + # we don't need the checked out files, but this is required for the google auth action to work + - uses: 'actions/checkout@v6' + + - id: 'google-auth-image-testing' + uses: 'google-github-actions/auth@v3' + with: + workload_identity_provider: 'projects/527193872801/locations/global/workloadIdentityPools/github-actions/providers/github' + service_account: 'github-actions-image-testing@almalinux-image-testing-469421.iam.gserviceaccount.com' + + - name: 'Set up Google Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v3' + + - name: Download custom CIT image + if: inputs.cit_git_repo != '' + uses: actions/download-artifact@v7 + with: + name: cit-custom-image + + - name: Load custom CIT image + if: inputs.cit_git_repo != '' + shell: bash + run: zstd -d -c cit-image.tar.zst | docker load + + - name: 'Run Google cloud-image-testing tests which are hard-coded to specific shapes' + uses: ./.github/actions/cit-run-with-retry + with: + image: "${{ needs.init-data.outputs.image_path }}${{ matrix.arch == 'aarch64' && '-arm64' || '' }}" + filter: '^(lssd|disk|vmspec)$' + shape_flag: "${{ matrix.arch == 'aarch64' && '-arm64_shape c4a-standard-8' || '-x86_shape c4-standard-8' }}" + parallel_count: '20' + parallel_stagger: '1s' + zones: ${{ env.GCP_DEFAULT_ZONES }} + creds_path: ${{ env.GOOGLE_GHA_CREDS_PATH }} + quota_log_file: ${{ runner.temp }}/quota-failures.jsonl + image_ref: ${{ inputs.cit_git_repo != '' && format('cit-custom:run-{0}', github.run_id) || 'gcr.io/compute-image-tools/cloud-image-tests:latest' }} + + - name: Upload quota-failure log + if: always() + uses: actions/upload-artifact@v7 + 
with: + name: quota-failures-initialtest-${{ matrix.arch }} + path: ${{ runner.temp }}/quota-failures.jsonl + if-no-files-found: ignore + retention-days: 7 + + test-gcp-pershape-x86_64: + name: ${{ inputs.version_major }} x86_64 ${{ matrix.shape }} + needs: [init-data, test-gcp-initialtest, build-cit] + permissions: + id-token: write + contents: read + runs-on: "${{ github.repository_owner == 'AlmaLinux' && format('runs-on={0}/runner=2cpu-linux-x64/spot=false/volume=100g/image=almalinux-10-x86_64', github.run_id) || 'ubuntu-24.04' }}" + # !cancelled() suppresses the implicit success() that a custom if adds (which + # would skip on build-cit being skipped); gate explicitly on the smoke test. + if: >- + !cancelled() + && needs.test-gcp-initialtest.result == 'success' + timeout-minutes: 720 + strategy: + fail-fast: false + matrix: + shape: + - n4-standard-4 + - n2-standard-4 + - n2d-standard-4 + - n1-standard-4 + - c4-standard-4 + - c4d-standard-4 + - c3-standard-4 + - c3d-standard-4 + - e2-standard-4 + - e2-medium + - t2d-standard-4 + - c2-standard-4 + - c2d-standard-4 + include: + - shape: n4-standard-4 + zone: '' + zones: '' + + steps: + # we don't need the checked out files, but this is required for the google auth action to work + - uses: 'actions/checkout@v6' + + - id: 'google-auth-image-testing' + uses: 'google-github-actions/auth@v3' + with: + workload_identity_provider: 'projects/527193872801/locations/global/workloadIdentityPools/github-actions/providers/github' + service_account: 'github-actions-image-testing@almalinux-image-testing-469421.iam.gserviceaccount.com' + + - name: 'Set up Google Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v3' + + - name: Runner OS specific configuration + shell: bash + run: | + # Runner OS specific configuration + if [ -e /etc/redhat-release ]; then + sudo setenforce 0 + # we don't need scriptlets, just slows down install on selinux scriptlets that we won't be using + sudo dnf -y --setopt=tsflags=noscripts install 
podman zstd + runner_user=$USER + sudo loginctl enable-linger $runner_user + docker_cmd=podman + elif lsb_release -cs > /dev/null 2>&1; then + docker_cmd=docker + else + echo "[Debug] Unknown OS" + exit 1 + fi + echo "docker_cmd=${docker_cmd}" >> $GITHUB_ENV + + - name: Download custom CIT image + if: inputs.cit_git_repo != '' + uses: actions/download-artifact@v7 + with: + name: cit-custom-image + + - name: Load custom CIT image + if: inputs.cit_git_repo != '' + shell: bash + # build-cit emits a docker-save tarball; podman 4.x+ reads it natively. + run: zstd -d -c cit-image.tar.zst | ${{ env.docker_cmd }} load + + - name: 'Run Google cloud-image-testing tests on ${{ matrix.shape }}' + uses: ./.github/actions/cit-run-with-retry + with: + runtime: ${{ env.docker_cmd }} + image: ${{ needs.init-data.outputs.image_path }} + filter: '^(cvm|livemigrate|suspendresume|loadbalancer|guestagent|hostnamevalidation|imageboot|licensevalidation|security|hotattach|packagevalidation|ssh|metadata)$' + shape_flag: "-x86_shape ${{ matrix.shape }}" + parallel_count: '2' + first_attempt_zone: ${{ matrix.zone || '' }} + zones: ${{ matrix.zones || env.GCP_DEFAULT_ZONES }} + creds_path: ${{ env.GOOGLE_GHA_CREDS_PATH }} + quota_log_file: ${{ runner.temp }}/quota-failures.jsonl + image_ref: ${{ inputs.cit_git_repo != '' && format('cit-custom:run-{0}', github.run_id) || 'gcr.io/compute-image-tools/cloud-image-tests:latest' }} + + - name: Upload quota-failure log + if: always() + uses: actions/upload-artifact@v7 + with: + name: quota-failures-pershape-x86_64-${{ matrix.shape }} + path: ${{ runner.temp }}/quota-failures.jsonl + if-no-files-found: ignore + retention-days: 7 + + test-gcp-pershape-aarch64: + name: ${{ inputs.version_major }} aarch64 ${{ matrix.shape }} + needs: [init-data, test-gcp-initialtest, build-cit] + permissions: + id-token: write + contents: read + runs-on: ubuntu-24.04 + if: >- + !cancelled() + && needs.test-gcp-initialtest.result == 'success' + timeout-minutes: 720 + 
strategy: + fail-fast: false + matrix: + shape: + - c4a-standard-4 + - n4a-standard-4 + include: + - shape: c4a-standard-4 + parallel_count: '' + zone: '' + zones: '' + steps: + # this isn't a great way to handle this, but GH actions has limitations... + - name: Skip certain jobs + id: checker + if: (matrix.shape == 'c4a-standard-96-metal' && inputs.version_major == '8') + run: | + echo "Skipping ${{ matrix.shape }} for version_major ${{ inputs.version_major}} due to incompatibility." + echo "skip=true" >> $GITHUB_OUTPUT + exit 0 + + # we don't need the checked out files, but this is required for the google auth action to work + - uses: 'actions/checkout@v6' + if: steps.checker.outputs.skip != 'true' + + - id: 'google-auth-image-testing' + uses: 'google-github-actions/auth@v3' + if: steps.checker.outputs.skip != 'true' + with: + workload_identity_provider: 'projects/527193872801/locations/global/workloadIdentityPools/github-actions/providers/github' + service_account: 'github-actions-image-testing@almalinux-image-testing-469421.iam.gserviceaccount.com' + + - name: 'Set up Google Cloud SDK' + if: steps.checker.outputs.skip != 'true' + uses: 'google-github-actions/setup-gcloud@v3' + + - name: Download custom CIT image + if: inputs.cit_git_repo != '' && steps.checker.outputs.skip != 'true' + uses: actions/download-artifact@v7 + with: + name: cit-custom-image + + - name: Load custom CIT image + if: inputs.cit_git_repo != '' && steps.checker.outputs.skip != 'true' + shell: bash + run: zstd -d -c cit-image.tar.zst | docker load + + - name: 'Run Google cloud-image-testing tests on ${{ matrix.shape }}' + if: steps.checker.outputs.skip != 'true' + uses: ./.github/actions/cit-run-with-retry + with: + image: "${{ needs.init-data.outputs.image_path }}-arm64" + filter: '^(cvm|livemigrate|suspendresume|loadbalancer|guestagent|hostnamevalidation|imageboot|licensevalidation|security|hotattach|packagevalidation|ssh|metadata)$' + shape_flag: "-arm64_shape ${{ matrix.shape }}" + 
parallel_count: ${{ matrix.parallel_count || '2' }} + first_attempt_zone: ${{ matrix.zone || '' }} + zones: ${{ matrix.zones || env.GCP_DEFAULT_ZONES }} + creds_path: ${{ env.GOOGLE_GHA_CREDS_PATH }} + quota_log_file: ${{ runner.temp }}/quota-failures.jsonl + image_ref: ${{ inputs.cit_git_repo != '' && format('cit-custom:run-{0}', github.run_id) || 'gcr.io/compute-image-tools/cloud-image-tests:latest' }} + + - name: Upload quota-failure log + if: always() && steps.checker.outputs.skip != 'true' + uses: actions/upload-artifact@v7 + with: + name: quota-failures-pershape-aarch64-${{ matrix.shape }} + path: ${{ runner.temp }}/quota-failures.jsonl + if-no-files-found: ignore + retention-days: 7 + + # Aggregates the per-job JSONL quota-failure logs into a single markdown table. + summarize-quota-failures: + name: Summarize quota failures + needs: + - test-gcp-initialtest + - test-gcp-pershape-x86_64 + - test-gcp-pershape-aarch64 + if: ${{ always() && inputs.run_test }} + runs-on: ubuntu-24.04 + steps: + - name: Download all quota-failure artifacts + uses: actions/download-artifact@v4 + with: + pattern: quota-failures-* + path: quota-failures + merge-multiple: true + + - name: Write summary + shell: bash + run: | + set -uo pipefail + shopt -s nullglob + files=(quota-failures/*.jsonl quota-failures/**/*.jsonl) + if (( ${#files[@]} == 0 )); then + { + echo "## Quota failure summary" + echo + echo "No quota failures recorded in this run." + } >> "$GITHUB_STEP_SUMMARY" + exit 0 + fi + + cat "${files[@]}" > all-quota-failures.jsonl + total=$(wc -l < all-quota-failures.jsonl) + + { + echo "## Quota failure summary" + echo + echo "**Total quota failure events:** ${total}" + echo + echo "Group key: (location, quota, shape). Counts include retries." + echo "Use this to scope quota-increase requests." 
+ echo + echo "| Location | Quota | Shape | Count |" + echo "|----------|-------|-------|-------|" + } >> "$GITHUB_STEP_SUMMARY" + + jq -s -r ' + group_by([.location, .quota, .shape]) + | map({ + location: .[0].location, + quota: .[0].quota, + shape: .[0].shape, + count: length + }) + | sort_by(-.count, .location, .quota, .shape) + | .[] + | "| \(.location // "?") | \(.quota) | \(.shape // "?") | \(.count) |" + ' all-quota-failures.jsonl >> "$GITHUB_STEP_SUMMARY" + + publish-gcp: + name: Publish ${{ inputs.version_major }} ${{ matrix.arch }} to almalinux-cloud (prod) + needs: + - init-data + - build-gcp-x86_64 + - build-gcp-aarch64 + - test-gcp-initialtest + - test-gcp-pershape-x86_64 + - test-gcp-pershape-aarch64 + # Publish after both builds succeed and, when testing is enabled, after all + # test jobs pass. run_test=false publishes straight after the builds; + # publish_images=false skips publishing entirely. + if: >- + !cancelled() + && inputs.publish_images + && needs.build-gcp-x86_64.result == 'success' + && needs.build-gcp-aarch64.result == 'success' + && ( + inputs.run_test == false + || ( + needs.test-gcp-initialtest.result == 'success' + && needs.test-gcp-pershape-x86_64.result == 'success' + && needs.test-gcp-pershape-aarch64.result == 'success' + ) + ) + permissions: + id-token: write + contents: read + runs-on: ubuntu-latest + strategy: + fail-fast: false + # Publish one arch at a time. 
+ max-parallel: 1 + matrix: + arch: + - x86_64 + - aarch64 + steps: + # we don't need the checked out files, but this is required for the google auth action to work + - uses: actions/checkout@v6 + + - name: Build image name from inputs + id: build-image-name + run: | + IMAGE_NAME="almalinux-${{ inputs.version_major }}${{ matrix.arch == 'aarch64' && '-arm64' || '' }}-v${{ needs.init-data.outputs.date_stamp }}" + echo "image_name=${IMAGE_NAME}" >> $GITHUB_OUTPUT + + - id: google-auth-image-release + uses: google-github-actions/auth@v3 + with: + workload_identity_provider: projects/1071098808632/locations/global/workloadIdentityPools/github-actions/providers/github + service_account: gh-actions-prod-release@almalinux-image-release.iam.gserviceaccount.com + + - name: Set up Google Cloud SDK + uses: google-github-actions/setup-gcloud@v3 + + - name: Copy image to almalinux-cloud project + run: | + gcloud storage cp gs://almalinux-images-dev/${{ steps.build-image-name.outputs.image_name }}/root.tar.gz gs://almalinux-images-prod/${{ steps.build-image-name.outputs.image_name }}/root.tar.gz + + - name: Get gce_image_publish tool + run: | + wget https://storage.googleapis.com/compute-image-tools/release/linux/gce_image_publish + chmod +x gce_image_publish + + - name: Create production image on GCP + shell: bash + run: | + ./gce_image_publish \ + -var:environment=prod \ + -skip_confirmation \ + -rollout_rate=60 \ + -work_project="almalinux-image-release" \ + -source_gcs_path="gs://almalinux-images-prod/" \ + -source_version="v${{ needs.init-data.outputs.date_stamp }}" \ + vm-scripts/gcp/almalinux_${{ inputs.version_major }}${{ matrix.arch == 'aarch64' && '_arm64' || '' }}.publish.json + + - name: Get ID of image just created + shell: bash + run: | + IMAGE_ID=$(gcloud compute images describe ${{ steps.build-image-name.outputs.image_name }} --project=almalinux-cloud --format='value(id)') + echo "IMAGE_ID=$IMAGE_ID" >> $GITHUB_ENV + + - name: Copy SBOM data to GCP Bucket + run: | 
+ gcloud storage cp gs://almalinux-images-dev-sbom/${{ steps.build-image-name.outputs.image_name }}.sbom.spdx.json gs://gce-image-almalinux-cloud-sbom/$IMAGE_ID.json From e3f5d45d497b9d778e5bcb439da5549111242f04 Mon Sep 17 00:00:00 2001 From: Yuriy Kohut Date: Tue, 30 Jun 2026 12:11:32 +0300 Subject: [PATCH 2/4] feat(ci): gcp unified: replace run_test with override_test_failure Tests now always run (the run_test skip gate is removed). The new override_test_failure input (default false) controls only whether a test *failure* still lets the publish stage proceed: override_test_failure=false (default) -> a failing test job skips publish override_test_failure=true -> publish even if test(s) fail build-cit, the test jobs, and the quota summary no longer reference run_test; publish gates on `override_test_failure || all tests passed` (still also requiring both builds and publish_images). The run-name always shows "Build, Test and Publish" since tests always run. --- .github/workflows/gcp-build-test-publish.yml | 39 ++++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/.github/workflows/gcp-build-test-publish.yml b/.github/workflows/gcp-build-test-publish.yml index 25605044..fca050d8 100644 --- a/.github/workflows/gcp-build-test-publish.yml +++ b/.github/workflows/gcp-build-test-publish.yml @@ -1,19 +1,20 @@ name: "GCP: Build, Test and Publish" run-name: >- - GCP: AlmaLinux ${{ inputs.version_major == '10-kitten' && 'Kitten 10' || inputs.version_major }} Build${{ inputs.run_test && ', Test and Publish' || ' and Publish' }} + GCP: AlmaLinux ${{ inputs.version_major == '10-kitten' && 'Kitten 10' || inputs.version_major }} Build, Test and Publish # Unified GCP pipeline in a single workflow_dispatch: # 1. Build the images with Packer (x86_64 + aarch64), upload to the dev GCS # buckets and publish a dev test image (gcp-build-steps). # 2. 
Test every built image with Google cloud-image-tests (smoke tests on -# both arches in parallel, then the per-shape matrices). Gated by run_test. +# both arches in parallel, then the per-shape matrices). Tests always run. # 3. Publish each arch to the almalinux-cloud (prod) project, one arch at a # time. # # Stage gating: -# run_test=false -> tests are skipped; builds publish straight through. -# publish_images=false -> images are built (and tested) but not published. -# a test job fails -> publish is skipped (no untested image promoted). +# tests always run. +# override_test_failure=true -> publish even if test(s) fail. +# publish_images=false -> images are built and tested but not published. +# a test job fails (override_test_failure=false) -> publish is skipped. # # arch is fixed to ALL (both x86_64 and aarch64 are built, tested, published). # image_datetag for publish is the build's YYYYMMDD date_stamp. @@ -56,13 +57,13 @@ on: type: boolean default: true - run_test: - description: "Test the built images. Disable to build and publish without testing." + override_test_failure: + description: "Publish the images even if test(s) fail. Tests always run regardless of this setting." required: true type: boolean - default: true + default: false - # ---- [tests] settings (only used when run_test is true) ---- + # ---- [tests] settings (optional custom cloud-image-tests build) ---- cit_git_repo: description: '[tests] owner/repo of a cloud-image-tests fork to build instead of the prebuilt image. Empty = gcr.io/compute-image-tools/cloud-image-tests:latest. owner/repo only (no full URL).' required: false @@ -237,11 +238,10 @@ jobs: PACKER_GITHUB_API_TOKEN: ${{ secrets.GIT_HUB_TOKEN }} # Optional: build cloud-image-tests from a user-supplied git repo+ref and - # publish a docker tarball as an artifact. Skipped when cit_git_repo is empty - # or testing is disabled. + # publish a docker tarball as an artifact. Skipped when cit_git_repo is empty. 
build-cit: name: Build cloud-image-tests from source - if: ${{ inputs.run_test && inputs.cit_git_repo != '' }} + if: ${{ inputs.cit_git_repo != '' }} needs: [init-data] runs-on: "${{ github.repository_owner == 'AlmaLinux' && format('runs-on={0}/family=m8azn/cpu=48/extras=tmpfs/spot=false/image=ubuntu24-full-x64', github.run_id) || 'ubuntu-24.04' }}" steps: @@ -277,11 +277,10 @@ jobs: test-gcp-initialtest: name: AlmaLinux ${{ inputs.version_major }} ${{ matrix.arch }} Initial Tests needs: [init-data, build-gcp-x86_64, build-gcp-aarch64, build-cit] - # Run only when testing is enabled and both image builds succeeded; accept - # build-cit as success or skipped (skipped whenever cit_git_repo is empty). + # Run after both image builds succeeded; accept build-cit as success or + # skipped (skipped whenever cit_git_repo is empty). Tests always run. if: >- !cancelled() - && inputs.run_test && needs.build-gcp-x86_64.result == 'success' && needs.build-gcp-aarch64.result == 'success' && (needs.build-cit.result == 'success' || needs.build-cit.result == 'skipped') @@ -531,7 +530,7 @@ jobs: - test-gcp-initialtest - test-gcp-pershape-x86_64 - test-gcp-pershape-aarch64 - if: ${{ always() && inputs.run_test }} + if: ${{ always() }} runs-on: ubuntu-24.04 steps: - name: Download all quota-failure artifacts @@ -593,16 +592,16 @@ jobs: - test-gcp-initialtest - test-gcp-pershape-x86_64 - test-gcp-pershape-aarch64 - # Publish after both builds succeed and, when testing is enabled, after all - # test jobs pass. run_test=false publishes straight after the builds; - # publish_images=false skips publishing entirely. + # Publish after both builds succeed. Tests always run; by default all test + # jobs must pass, but override_test_failure=true publishes even if they + # fail. publish_images=false skips publishing entirely. 
if: >- !cancelled() && inputs.publish_images && needs.build-gcp-x86_64.result == 'success' && needs.build-gcp-aarch64.result == 'success' && ( - inputs.run_test == false + inputs.override_test_failure || ( needs.test-gcp-initialtest.result == 'success' && needs.test-gcp-pershape-x86_64.result == 'success' From 712f25641beb1977fcfe4091bb90bf026c956cc6 Mon Sep 17 00:00:00 2001 From: Yuriy Kohut Date: Wed, 1 Jul 2026 11:19:14 +0300 Subject: [PATCH 3/4] feat(ci): gcp unified: manual approval gate to publish on test failure Replace override_test_failure with an interactive approval. Tests still always run. When all tests pass, publish runs automatically. When a test job fails, a new approve-publish gate job - behind the gcp-prod-publish GitHub Environment (required reviewers) - pauses the run so a human can inspect the failure and approve or reject publishing; publish-gcp then runs only if all tests passed OR approve-publish was approved. Requires a one-time repo setup: create the `gcp-prod-publish` Environment (Settings -> Environments) with Required reviewers. Without reviewers the environment provides no protection and the gate does not actually pause. --- .github/workflows/gcp-build-test-publish.yml | 61 ++++++++++++++------ 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/.github/workflows/gcp-build-test-publish.yml b/.github/workflows/gcp-build-test-publish.yml index fca050d8..f21a0092 100644 --- a/.github/workflows/gcp-build-test-publish.yml +++ b/.github/workflows/gcp-build-test-publish.yml @@ -12,9 +12,11 @@ run-name: >- # # Stage gating: # tests always run. -# override_test_failure=true -> publish even if test(s) fail. -# publish_images=false -> images are built and tested but not published. -# a test job fails (override_test_failure=false) -> publish is skipped. +# all tests pass -> publish runs automatically. 
+# a test fails -> the approve-publish gate (gcp-prod-publish environment, +# required reviewers) pauses for a human to approve or +# reject publishing after checking the failure. +# publish_images=false -> images are built and tested but not published. # # arch is fixed to ALL (both x86_64 and aarch64 are built, tested, published). # image_datetag for publish is the build's YYYYMMDD date_stamp. @@ -57,12 +59,6 @@ on: type: boolean default: true - override_test_failure: - description: "Publish the images even if test(s) fail. Tests always run regardless of this setting." - required: true - type: boolean - default: false - # ---- [tests] settings (optional custom cloud-image-tests build) ---- cit_git_repo: description: '[tests] owner/repo of a cloud-image-tests fork to build instead of the prebuilt image. Empty = gcr.io/compute-image-tools/cloud-image-tests:latest. owner/repo only (no full URL).' @@ -583,6 +579,34 @@ jobs: | "| \(.location // "?") | \(.quota) | \(.shape // "?") | \(.count) |" ' all-quota-failures.jsonl >> "$GITHUB_STEP_SUMMARY" + # Manual decision gate: runs only when a test job did NOT pass. Behind the + # gcp-prod-publish environment (required reviewers) so a human can inspect the + # failure and approve or reject publishing. When all tests pass it is skipped + # and publish runs automatically. + approve-publish: + name: Approve publish despite test failure + needs: + - build-gcp-x86_64 + - build-gcp-aarch64 + - test-gcp-initialtest + - test-gcp-pershape-x86_64 + - test-gcp-pershape-aarch64 + if: >- + !cancelled() + && inputs.publish_images + && needs.build-gcp-x86_64.result == 'success' + && needs.build-gcp-aarch64.result == 'success' + && !( + needs.test-gcp-initialtest.result == 'success' + && needs.test-gcp-pershape-x86_64.result == 'success' + && needs.test-gcp-pershape-aarch64.result == 'success' + ) + environment: gcp-prod-publish + runs-on: ubuntu-24.04 + steps: + - name: Approved + run: echo "Publishing approved despite test failure(s)." 
+ publish-gcp: name: Publish ${{ inputs.version_major }} ${{ matrix.arch }} to almalinux-cloud (prod) needs: @@ -592,21 +616,22 @@ jobs: - test-gcp-initialtest - test-gcp-pershape-x86_64 - test-gcp-pershape-aarch64 - # Publish after both builds succeed. Tests always run; by default all test - # jobs must pass, but override_test_failure=true publishes even if they - # fail. publish_images=false skips publishing entirely. + - approve-publish + # Publish after both builds succeed and either all tests passed (automatic) + # or the approve-publish gate was approved (manual, after a test failure). + # publish_images=false skips publishing entirely. if: >- !cancelled() && inputs.publish_images && needs.build-gcp-x86_64.result == 'success' && needs.build-gcp-aarch64.result == 'success' && ( - inputs.override_test_failure - || ( - needs.test-gcp-initialtest.result == 'success' - && needs.test-gcp-pershape-x86_64.result == 'success' - && needs.test-gcp-pershape-aarch64.result == 'success' - ) + ( + needs.test-gcp-initialtest.result == 'success' + && needs.test-gcp-pershape-x86_64.result == 'success' + && needs.test-gcp-pershape-aarch64.result == 'success' + ) + || needs.approve-publish.result == 'success' ) permissions: id-token: write From e88ea36c2ea684cfbffe956290d0efda8c85441b Mon Sep 17 00:00:00 2001 From: Yuriy Kohut Date: Wed, 1 Jul 2026 17:08:19 +0300 Subject: [PATCH 4/4] docs(gcp): add GCP_BUILD_TEST_PUBLISH.md for the unified workflow Document gcp-build-test-publish.yml (build -> cloud-image-tests -> publish): inputs, job layout, stage gating, the manual approve-publish gate behind the gcp-prod-publish environment (auto-publish on all-green, human approval on test failure), OIDC auth, and required config. Add a README index row. 
--- GCP_BUILD_TEST_PUBLISH.md | 157 ++++++++++++++++++++++++++++++++++++++ README.md | 1 + 2 files changed, 158 insertions(+) create mode 100644 GCP_BUILD_TEST_PUBLISH.md diff --git a/GCP_BUILD_TEST_PUBLISH.md b/GCP_BUILD_TEST_PUBLISH.md new file mode 100644 index 00000000..7a3e933c --- /dev/null +++ b/GCP_BUILD_TEST_PUBLISH.md @@ -0,0 +1,157 @@ +# GCP: Build, Test and Publish (unified pipeline) + +## Overview + +`.github/workflows/gcp-build-test-publish.yml` runs the whole GCP image +lifecycle in a single `workflow_dispatch`: + +1. **Build** the images with Packer (x86_64 + aarch64, via + [`gcp-build-steps`](.github/actions/gcp-build-steps/action.yml)): + upload to the dev GCS buckets and publish a dev test image. +2. **Test** every built image with Google **cloud-image-tests** (CIT): + smoke tests on both arches in parallel, then the per-shape matrices. + Tests always run. +3. **Publish** each arch to the `almalinux-cloud` (prod) project, one arch + at a time - **automatically when all tests pass**, or **after a manual + approval** when a test fails (see below). + +It reuses the jobs from the standalone GCP workflows +([`gcp-build.yml`](BUILD_GCP.md), [`gcp-test.yml`](GCP_IMAGE_TEST_PUBLISH.md), +[`gcp-publish.yml`](GCP_IMAGE_TEST_PUBLISH.md)), which remain available for +running an individual stage or recovering a partial run. + +### Manual approval on test failure (the interactive gate) + +Because publishing targets the public prod project, a test failure does +**not** silently block or force the release - it routes to a human: + +- **All tests pass** -> the `approve-publish` gate is skipped and + `publish-gcp` runs automatically. +- **A test fails** -> the `approve-publish` job runs. It is tied to the + **`gcp-prod-publish` GitHub Environment** (Required reviewers), so the + run **pauses and waits** for a reviewer to inspect the failure and + **Approve** (publish proceeds) or **Reject** (publish is skipped). While + waiting it consumes no runner minutes. 
`publish-gcp` then runs only if + all tests passed OR `approve-publish` was approved. + +There is no `run_test` / `override_test_failure` input - the human +approval is the decision on failure. + +> **Required one-time setup:** create the `gcp-prod-publish` Environment +> (repo **Settings -> Environments**) with **Required reviewers**. Without +> reviewers the environment provides no protection and the gate will not +> actually pause - a failed-test run would publish unreviewed. + +### When to use which + +| Use | Workflow | +| :--- | :--- | +| Build + test + publish in one dispatch | this unified workflow | +| Just (re)build the images | `gcp-build.yml` | +| Test an existing image | `gcp-test.yml` | +| Publish an existing dev image to prod | `gcp-publish.yml` | + +## Workflow inputs + +| Input | Default | Notes | +| :--- | :--- | :--- | +| `date_time_stamp` | auto (`date -u +%Y%m%d%H%M%S`) | Shared stamp; its `YYYYMMDD` prefix is the publish `image_datetag`. | +| `version_major` | `10` | `10-kitten`, `10`, `9`, `8`. arch is fixed to **ALL** (both x86_64 and aarch64). | +| `self-hosted` | `true` | Build the aarch64 image on a self-hosted runner. Keep true - the ALL-arch test/publish needs it. | +| `store_as_artifact` | `false` | Upload images as workflow artifacts. | +| `upload_to_s3` | `true` | Upload to S3 in parallel. | +| `cit_git_repo` | `''` | Optional: `owner/repo` of a cloud-image-tests fork to build instead of the prebuilt image. | +| `cit_git_ref` | `''` | Branch/tag/SHA in `cit_git_repo`. Ignored when `cit_git_repo` is empty. | +| `publish_images` | `true` | `false` = build and test only, no publish. | +| `notify_mattermost` | `true` | Post build notifications to Mattermost. | + +There is intentionally no `run_test` input: tests always run. (`workflow_dispatch` +allows at most 10 inputs; this workflow uses 9.) + +## Job layout + +``` +init-data (time_stamp + YYYYMMDD date_stamp + dev image_path) + |- build-gcp-x86_64 -. 
gcp-build-steps: build + dev GCS + |- start-self-hosted-runner (fork EC2) | upload + dev test image + '- build-gcp-aarch64 -' + |- build-cit (optional, cit_git_repo set) + | + test-gcp-initialtest (x86_64 + aarch64 smoke, in parallel) + | + test-gcp-pershape-x86_64 / test-gcp-pershape-aarch64 (+ summarize-quota-failures) + | + approve-publish (only if a test failed; gcp-prod-publish environment, required reviewers) + | + publish-gcp (matrix arch, max-parallel: 1 -> almalinux-cloud prod) +``` + +### Stage gating + +``` +tests always run. +all tests pass -> publish runs automatically. +a test fails -> approve-publish pauses for a reviewer; publish only on Approve. +build fails -> no publish (the approval gate covers test failures, not build failures). +publish_images=false -> build + test only, no publish. +``` + +`publish-gcp` runs when `!cancelled() && publish_images && both builds +succeeded && (all tests passed || approve-publish approved)`. + +## Runners + +- **build-gcp-x86_64** / **build-gcp-aarch64**: RunsOn metal + (`ubuntu24-full-x64` / `ubuntu24-full-arm64`) in the org; `ubuntu-24.04` + / self-hosted EC2 on forks. +- **build-cit**: RunsOn `m8azn` (Go compile) when building a CIT fork. +- **test jobs**: `ubuntu-24.04` (smoke, aarch64 per-shape) and a RunsOn + `almalinux-10-x86_64` runner (x86_64 per-shape). The tests run against real GCP + VMs via `cit-run-with-retry`. +- **publish-gcp**: `ubuntu-latest`. + +## Authentication and required configuration + +GCP access is via **Workload Identity Federation (OIDC)** - no GCP keys in +secrets. The relevant jobs declare `permissions: id-token: write`: + +- Tests authenticate as the image-testing service account + (`almalinux-image-testing` project). +- Publish authenticates as the prod-release service account + (`almalinux-image-release` project).
+ +### Secrets +| Secret | Description | +|--------|-------------| +| `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` | S3 upload (build stage) | +| `MATTERMOST_WEBHOOK_URL` | Mattermost incoming webhook URL | +| `GIT_HUB_TOKEN` | Packer plugin GitHub API token | +| `EC2_AMI_ID_AL9_AARCH64`, `EC2_SUBNET_ID`, `EC2_SECURITY_GROUP_ID` | fork-only aarch64 EC2 runner | + +### Variables (`vars.*`) +| Variable | Description | +|----------|-------------| +| `AWS_REGION`, `AWS_S3_BUCKET` | S3 upload target | +| `MATTERMOST_CHANNEL` | Mattermost channel for notifications | + +### Environment (required for the approval gate) +- **`gcp-prod-publish`** with **Required reviewers** (repo Settings -> + Environments). Referenced by the `approve-publish` job. + +## Troubleshooting + +1. **A failed-test run published without asking** - the `gcp-prod-publish` + environment has no Required reviewers, so the gate did not pause. Add + reviewers in repo Settings -> Environments. +2. **Publish skipped after a test failure** - the reviewer Rejected, or the + approval timed out (GitHub waits up to 30 days). +3. **aarch64 image missing at test/publish** - `self-hosted` was set false, + so the aarch64 build was skipped; keep it true for the ALL-arch flow. +4. **`workflow_dispatch` input-limit error from actionlint** - GitHub caps + `workflow_dispatch` at 10 inputs; this workflow uses 9, so only one more + input fits before an existing one has to be removed. + +## See also + +- [BUILD_GCP.md](BUILD_GCP.md) - the build stage (`gcp-build-steps`), SBOM, dev GCS upload, dev test image. +- [GCP_IMAGE_TEST_PUBLISH.md](GCP_IMAGE_TEST_PUBLISH.md) - the cloud-image-tests shapes/filters and the prod publish flow in detail.
diff --git a/README.md b/README.md index bf2b4344..03cab8b9 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ This repository includes GitHub Actions workflows that automate building, testin | [OPENNEBULA_BUILD_TEST.md](OPENNEBULA_BUILD_TEST.md) | `opennebula-build-test.yml`, `shared-steps/`, `opennebula-test-steps/action.yml` | Unified single-dispatch OpenNebula pipeline: build → boot-test each image in-job under QEMU/KVM on the build runner with a one-context `CONTEXT` ISO (aarch64 builds on `ubuntu24-full-arm64` so the test runs locally; no S3 round-trip) | | [OPENNEBULA_TEST.md](OPENNEBULA_TEST.md) | `opennebula-test.yml`, `opennebula-test-steps/action.yml` | Sanity-test an OpenNebula `.qcow2` image directly under QEMU/KVM on the runner with a one-context `CONTEXT` ISO, run release/arch/disk/`dnf` and one-context-specific assertions over SSH, collect the package list, shut the guest down | | [GCP_IMAGE_TEST_PUBLISH.md](GCP_IMAGE_TEST_PUBLISH.md) | `gcp-test.yml`, `gcp-publish.yml` | Test GCP images across many machine shapes using Cloud Image Tests, then publish to the `almalinux-cloud` project (build step is covered by [BUILD_GCP.md](BUILD_GCP.md)) | +| [GCP_BUILD_TEST_PUBLISH.md](GCP_BUILD_TEST_PUBLISH.md) | `gcp-build-test-publish.yml`, `gcp-build-steps/action.yml` | Unified single-dispatch GCP pipeline: build → Cloud Image Tests → publish to `almalinux-cloud`, with an automatic publish on all-green and a manual approval gate (`gcp-prod-publish` environment) when a test fails | | [VAGRANT_CLOUD.md](VAGRANT_CLOUD.md) | `vagrant-publish.yml` | Publish Vagrant boxes to HashiCorp Cloud Platform (Vagrant Cloud) for VirtualBox, libvirt, and VMware Desktop providers | | [VMWARE_OVA.md](VMWARE_OVA.md) | _(manual process)_ | Convert Vagrant VMware Desktop `.box` files to vSphere/ESXi-compatible `.ova` templates |