From 27c41f5696fcc61c279a158c02e4f0d8e318269b Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 17:22:02 +0100 Subject: [PATCH 01/17] making deploy quicker - less intervals, observability now optional --- deploy.sh | 23 +++++++++++++++++++---- docker-compose.yaml | 14 +++++++------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/deploy.sh b/deploy.sh index f25c480f..68f8b25a 100755 --- a/deploy.sh +++ b/deploy.sh @@ -59,6 +59,8 @@ show_help() { echo " --build Rebuild images" echo " --wait Wait for services to be healthy" echo " --timeout Health check timeout (default: 300)" + echo " --observability Include Grafana, Jaeger, etc." + echo " --debug Include observability + Kafdrop" echo " infra [options] Start infrastructure only (mongo, redis, kafka, etc.)" echo " --wait Wait for services to be healthy" echo " --timeout Health check timeout (default: 120)" @@ -99,6 +101,7 @@ cmd_dev() { local BUILD_FLAG="" local WAIT_FLAG="" local WAIT_TIMEOUT="300" + local PROFILE_FLAGS="" while [[ $# -gt 0 ]]; do case "$1" in @@ -113,6 +116,14 @@ cmd_dev() { shift WAIT_TIMEOUT="$1" ;; + --observability) + PROFILE_FLAGS="--profile observability" + print_info "Including observability stack (Grafana, Jaeger, etc.)" + ;; + --debug) + PROFILE_FLAGS="--profile observability --profile debug" + print_info "Including observability + debug tools (Kafdrop, etc.)" + ;; esac shift done @@ -122,7 +133,7 @@ cmd_dev() { WAIT_TIMEOUT_FLAG="--wait-timeout $WAIT_TIMEOUT" fi - docker compose --profile observability up -d $BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG + docker compose $PROFILE_FLAGS up -d $BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG echo "" print_success "Development environment started!" 
@@ -130,9 +141,13 @@ cmd_dev() { echo "Services:" echo " Backend: https://localhost:443" echo " Frontend: https://localhost:5001" - echo " Kafdrop: http://localhost:9000" - echo " Jaeger: http://localhost:16686" - echo " Grafana: http://localhost:3000" + if [[ "$PROFILE_FLAGS" == *"debug"* ]]; then + echo " Kafdrop: http://localhost:9000" + fi + if [[ "$PROFILE_FLAGS" == *"observability"* ]]; then + echo " Jaeger: http://localhost:16686" + echo " Grafana: http://localhost:3000" + fi echo "" echo "Commands:" echo " ./deploy.sh logs # View all logs" diff --git a/docker-compose.yaml b/docker-compose.yaml index bbbb40a1..8d3cd5fd 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -10,7 +10,7 @@ services: image: alpine:latest volumes: - shared_ca:/shared_ca - command: sh -c "mkdir -p /shared_ca && chmod 777 /shared_ca && echo 'Shared CA directory ready' && sleep 2" + command: sh -c "mkdir -p /shared_ca && chmod 777 /shared_ca && echo 'Shared CA directory ready'" networks: - app-network @@ -74,10 +74,10 @@ services: - app-network healthcheck: test: ["CMD", "redis-cli", "ping"] - interval: 10s + interval: 5s timeout: 5s retries: 5 - start_period: 10s + start_period: 5s backend: build: @@ -120,12 +120,11 @@ services: extra_hosts: - "host.docker.internal:host-gateway" healthcheck: - # Simpler, reliable healthcheck: curl fails non-zero for HTTP >=400 with -f test: ["CMD-SHELL", "curl -k -f -s https://localhost:443/api/v1/health/live >/dev/null || exit 1"] interval: 3s timeout: 3s retries: 50 - start_period: 10s + start_period: 5s frontend: container_name: frontend @@ -154,7 +153,7 @@ services: interval: 3s timeout: 3s retries: 30 - start_period: 10s + start_period: 5s grafana: @@ -322,7 +321,7 @@ services: interval: 5s timeout: 10s retries: 12 - start_period: 15s + start_period: 5s schema-registry: image: confluentinc/cp-schema-registry:7.8.2 @@ -348,6 +347,7 @@ services: kafdrop: image: obsidiandynamics/kafdrop:3.31.0 container_name: kafdrop + profiles: ["debug"] 
depends_on: - kafka - schema-registry From da2ef689ae0899a0c072bbebf9f062a16814967f Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 18:43:55 +0100 Subject: [PATCH 02/17] Changes Made MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stack-tests.yml — Build, Test, Push - Removed dev from branch triggers (only main now) - Added tags: ['v*'] and cert-generator/** path triggers - Build job now pushes to GHCR with immutable sha-{sha} tag (push events only) - Added missing pre-builds: event-replay, dlq-processor, zookeeper-certgen (these were being rebuilt during compose startup before) - Added frontend-prod build (from Dockerfile.prod, pushed as frontend:sha-xxx for Trivy scanning) - E2E jobs pull from GHCR on push events (parallel docker pull & + retag to compose names), fall back to artifact for PRs - All push/pull commands are spelled out explicitly (no for loops) - Added packages: write permission to build job docker.yml — Scan & Promote (rewritten) - Trigger: workflow_run on "Stack Tests" completion (+ workflow_dispatch with optional SHA input) - Only runs when Stack Tests succeed on main - Scan jobs: Trivy scans backend and frontend-prod from GHCR using SHA tag - Promote job: crane copy sha-xxx → latest for all 12 images — registry-level manifest copy, no rebuild - latest is NEVER set during build — only after all tests + scans pass Flow Push to main: stack-tests.yml: unit → build (push sha-xxx to GHCR) → E2E (pull from GHCR) docker.yml: (on success) → scan → promote sha-xxx → latest PR: stack-tests.yml: unit → build (save artifact) → E2E (load artifact) docker.yml: (skipped — only triggers on main) --- .github/workflows/docker.yml | 317 +++++++++++------------------- .github/workflows/stack-tests.yml | 218 ++++++++++++++++++-- 2 files changed, 319 insertions(+), 216 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 2cdd4f40..c6535c54 100644 --- 
a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,213 +1,52 @@ -name: Docker Build, Scan & Publish +name: Docker Scan & Promote +# Runs after Stack Tests completes on main — promotes sha-xxx → latest. +# "latest" is NEVER set during build. Only this workflow can set it, +# and only after all tests pass. If any test fails, latest stays unchanged. on: - push: - branches: [ main ] - tags: [ 'v*' ] - pull_request: - branches: [ main ] + workflow_run: + workflows: ["Stack Tests"] + types: [completed] workflow_dispatch: + inputs: + sha: + description: 'Full commit SHA to promote (defaults to latest main)' + required: false env: REGISTRY: ghcr.io jobs: - build-base: - name: Build Base - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - - outputs: - image-tag: ${{ steps.image-tag.outputs.tag }} - - steps: - - uses: actions/checkout@v6 - - - name: Set lowercase image prefix - run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/base - tags: | - type=ref,event=branch - type=ref,event=pr - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=sha- - type=raw,value=latest,enable={{is_default_branch}} - - - name: Determine image tag for dependent builds - id: image-tag - run: | - if [ "${{ github.event_name }}" = "pull_request" ]; then - echo "tag=pr-${{ github.event.number }}" >> $GITHUB_OUTPUT - else - echo "tag=latest" >> $GITHUB_OUTPUT - fi - - - name: Build and push - uses: docker/build-push-action@v6 - with: - context: ./backend - file: ./backend/Dockerfile.base - push: 
true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha,scope=base - cache-to: type=gha,mode=max,scope=base - - build-backend: - name: Build Backend - needs: build-base - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - - outputs: - image-ref: ${{ steps.image-ref.outputs.ref }} - - steps: - - uses: actions/checkout@v6 - - - name: Set lowercase image prefix - run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend - tags: | - type=ref,event=branch - type=ref,event=pr - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=sha- - type=raw,value=latest,enable={{is_default_branch}} - - - name: Set image reference for scan - id: image-ref - run: | - if [ "${{ github.event_name }}" = "pull_request" ]; then - echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend:pr-${{ github.event.number }}" >> $GITHUB_OUTPUT - else - echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend:latest" >> $GITHUB_OUTPUT - fi - - - name: Build and push - uses: docker/build-push-action@v6 - with: - context: ./backend - file: ./backend/Dockerfile - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha,scope=backend - cache-to: type=gha,mode=max,scope=backend - build-contexts: | - base=docker-image://${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/base:${{ needs.build-base.outputs.image-tag }} - - build-frontend: - name: Build Frontend - needs: build-base + scan-backend: + name: Scan 
Backend + if: > + github.event_name == 'workflow_dispatch' || + (github.event.workflow_run.conclusion == 'success' && + github.event.workflow_run.head_branch == 'main') runs-on: ubuntu-latest permissions: contents: read - packages: write - - outputs: - image-ref: ${{ steps.image-ref.outputs.ref }} - + security-events: write + packages: read steps: - uses: actions/checkout@v6 - - name: Set lowercase image prefix - run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend - tags: | - type=ref,event=branch - type=ref,event=pr - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=sha- - type=raw,value=latest,enable={{is_default_branch}} - - - name: Set image reference for scan - id: image-ref + - name: Compute image ref + id: ref run: | - if [ "${{ github.event_name }}" = "pull_request" ]; then - echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend:pr-${{ github.event.number }}" >> $GITHUB_OUTPUT + PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode" + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + SHA="${{ github.event.inputs.sha || github.sha }}" else - echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend:latest" >> $GITHUB_OUTPUT + SHA="${{ github.event.workflow_run.head_sha }}" fi - - - name: Build and push - uses: docker/build-push-action@v6 - with: - context: ./frontend - file: ./frontend/Dockerfile.prod - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha,scope=frontend - cache-to: 
type=gha,mode=max,scope=frontend - - scan-backend: - name: Scan Backend - needs: build-backend - runs-on: ubuntu-latest - permissions: - contents: read - security-events: write - - steps: - - uses: actions/checkout@v6 + TAG="sha-${SHA::7}" + echo "image=${{ env.REGISTRY }}/$PREFIX/backend:$TAG" >> $GITHUB_OUTPUT - name: Run Trivy vulnerability scanner uses: aquasecurity/trivy-action@0.33.1 with: - image-ref: ${{ needs.build-backend.outputs.image-ref }} + image-ref: ${{ steps.ref.outputs.image }} format: 'sarif' output: 'trivy-backend-results.sarif' ignore-unfixed: true @@ -225,17 +64,32 @@ jobs: scan-frontend: name: Scan Frontend - needs: build-frontend + if: > + github.event_name == 'workflow_dispatch' || + (github.event.workflow_run.conclusion == 'success' && + github.event.workflow_run.head_branch == 'main') runs-on: ubuntu-latest permissions: contents: read security-events: write - + packages: read steps: + - name: Compute image ref + id: ref + run: | + PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode" + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + SHA="${{ github.event.inputs.sha || github.sha }}" + else + SHA="${{ github.event.workflow_run.head_sha }}" + fi + TAG="sha-${SHA::7}" + echo "image=${{ env.REGISTRY }}/$PREFIX/frontend:$TAG" >> $GITHUB_OUTPUT + - name: Run Trivy vulnerability scanner uses: aquasecurity/trivy-action@0.33.1 with: - image-ref: ${{ needs.build-frontend.outputs.image-ref }} + image-ref: ${{ steps.ref.outputs.image }} format: 'sarif' output: 'trivy-frontend-results.sarif' ignore-unfixed: true @@ -250,26 +104,79 @@ jobs: sarif_file: 'trivy-frontend-results.sarif' category: 'trivy-frontend' + # Promote SHA tag → latest using crane (registry-level manifest copy, no rebuild) + promote: + name: Promote to Latest + needs: [scan-backend, scan-frontend] + if: > + github.event_name == 'workflow_dispatch' || + (github.event.workflow_run.conclusion == 'success' && + github.event.workflow_run.head_branch == 'main') + runs-on: 
ubuntu-latest + permissions: + packages: write + steps: + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Install crane + uses: imjasonh/setup-crane@v0.4 + + - name: Promote images (SHA → latest) + run: | + PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode" + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + SHA="${{ github.event.inputs.sha || github.sha }}" + else + SHA="${{ github.event.workflow_run.head_sha }}" + fi + TAG="sha-${SHA::7}" + + echo "Promoting tag: $TAG → latest" + echo "" + + crane copy "$REGISTRY/$PREFIX/base:$TAG" "$REGISTRY/$PREFIX/base:latest" + crane copy "$REGISTRY/$PREFIX/backend:$TAG" "$REGISTRY/$PREFIX/backend:latest" + crane copy "$REGISTRY/$PREFIX/frontend:$TAG" "$REGISTRY/$PREFIX/frontend:latest" + crane copy "$REGISTRY/$PREFIX/coordinator:$TAG" "$REGISTRY/$PREFIX/coordinator:latest" + crane copy "$REGISTRY/$PREFIX/k8s-worker:$TAG" "$REGISTRY/$PREFIX/k8s-worker:latest" + crane copy "$REGISTRY/$PREFIX/pod-monitor:$TAG" "$REGISTRY/$PREFIX/pod-monitor:latest" + crane copy "$REGISTRY/$PREFIX/result-processor:$TAG" "$REGISTRY/$PREFIX/result-processor:latest" + crane copy "$REGISTRY/$PREFIX/saga-orchestrator:$TAG" "$REGISTRY/$PREFIX/saga-orchestrator:latest" + crane copy "$REGISTRY/$PREFIX/event-replay:$TAG" "$REGISTRY/$PREFIX/event-replay:latest" + crane copy "$REGISTRY/$PREFIX/dlq-processor:$TAG" "$REGISTRY/$PREFIX/dlq-processor:latest" + crane copy "$REGISTRY/$PREFIX/cert-generator:$TAG" "$REGISTRY/$PREFIX/cert-generator:latest" + crane copy "$REGISTRY/$PREFIX/zookeeper-certgen:$TAG" "$REGISTRY/$PREFIX/zookeeper-certgen:latest" + summary: name: Summary - if: github.event_name != 'pull_request' - needs: [build-base, build-backend, build-frontend, scan-backend, scan-frontend] + needs: [promote] runs-on: ubuntu-latest - steps: - - name: Set lowercase image prefix - run: echo 
"IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV - - name: Generate summary run: | - echo "## Docker Images Published" >> $GITHUB_STEP_SUMMARY + PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode" + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + SHA="${{ github.event.inputs.sha || github.sha }}" + else + SHA="${{ github.event.workflow_run.head_sha }}" + fi + TAG="sha-${SHA::7}" + + echo "## Docker Images Promoted to Latest" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "All Stack Tests passed. Images promoted from \`$TAG\` to \`latest\`." >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "| Image | Pull Command |" >> $GITHUB_STEP_SUMMARY echo "|-------|--------------|" >> $GITHUB_STEP_SUMMARY - echo "| Base | \`docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/base:latest\` |" >> $GITHUB_STEP_SUMMARY - echo "| Backend | \`docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend:latest\` |" >> $GITHUB_STEP_SUMMARY - echo "| Frontend | \`docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend:latest\` |" >> $GITHUB_STEP_SUMMARY + echo "| Base | \`docker pull $REGISTRY/$PREFIX/base:latest\` |" >> $GITHUB_STEP_SUMMARY + echo "| Backend | \`docker pull $REGISTRY/$PREFIX/backend:latest\` |" >> $GITHUB_STEP_SUMMARY + echo "| Frontend | \`docker pull $REGISTRY/$PREFIX/frontend:latest\` |" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "### Scan Results" >> $GITHUB_STEP_SUMMARY - echo "- Backend scan: ✅ Passed" >> $GITHUB_STEP_SUMMARY - echo "- Frontend scan: ✅ Passed" >> $GITHUB_STEP_SUMMARY + echo "### Security Scans" >> $GITHUB_STEP_SUMMARY + echo "- Backend: Passed" >> $GITHUB_STEP_SUMMARY + echo "- Frontend: Passed" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index c2804f73..f5257e4b 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -2,19 +2,22 @@ name: Stack 
Tests on: push: - branches: [main, dev] + branches: [main] + tags: ['v*'] paths: - 'backend/**' - 'frontend/**' + - 'cert-generator/**' - 'docker-compose.yaml' - 'deploy.sh' - '.github/workflows/stack-tests.yml' - '.github/actions/**' pull_request: - branches: [main, dev] + branches: [main] paths: - 'backend/**' - 'frontend/**' + - 'cert-generator/**' - 'docker-compose.yaml' - 'deploy.sh' - '.github/workflows/stack-tests.yml' @@ -22,6 +25,7 @@ on: workflow_dispatch: env: + REGISTRY: ghcr.io MONGO_IMAGE: mongo:8.0 REDIS_IMAGE: redis:7-alpine KAFKA_IMAGE: confluentinc/cp-kafka:7.8.2 @@ -102,17 +106,41 @@ jobs: fail_ci_if_error: false verbose: true - # Build all images once, cache for test jobs + # Build all images, push to GHCR with immutable SHA tag (push events only). + # PRs fall back to artifact transfer (can't push to GHCR from forks). build-images: - name: Build Images + name: Build & Push Images needs: [backend-unit, frontend-unit] runs-on: ubuntu-latest + permissions: + contents: read + packages: write + outputs: + sha-tag: ${{ steps.tags.outputs.sha-tag }} + image-prefix: ${{ steps.tags.outputs.image-prefix }} steps: - uses: actions/checkout@v6 - name: Setup Docker Buildx uses: docker/setup-buildx-action@v3 + - name: Log in to GHCR + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Compute image tags + id: tags + run: | + PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode" + SHA_TAG="sha-${GITHUB_SHA::7}" + echo "sha-tag=$SHA_TAG" >> $GITHUB_OUTPUT + echo "image-prefix=$PREFIX" >> $GITHUB_OUTPUT + + # ── Base image (cached separately — rarely changes) ────────────── - name: Cache base image uses: actions/cache@v5 id: base-cache @@ -139,15 +167,50 @@ jobs: if: steps.base-cache.outputs.cache-hit != 'true' run: docker save integr8scode-base:latest | zstd -T0 -3 > /tmp/base-image.tar.zst + - name: Push base to GHCR + 
if: github.event_name != 'pull_request' + run: | + docker tag integr8scode-base:latest \ + ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }} + docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }} + + # ── Backend + workers (depend on local base image) ─────────────── - name: Build all images run: | docker build -t integr8scode-backend:latest --build-context base=docker-image://integr8scode-base:latest -f ./backend/Dockerfile ./backend - docker build -t integr8scode-coordinator:latest -f backend/workers/Dockerfile.coordinator --build-context base=docker-image://integr8scode-base:latest ./backend - docker build -t integr8scode-k8s-worker:latest -f backend/workers/Dockerfile.k8s_worker --build-context base=docker-image://integr8scode-base:latest ./backend - docker build -t integr8scode-pod-monitor:latest -f backend/workers/Dockerfile.pod_monitor --build-context base=docker-image://integr8scode-base:latest ./backend - docker build -t integr8scode-result-processor:latest -f backend/workers/Dockerfile.result_processor --build-context base=docker-image://integr8scode-base:latest ./backend - docker build -t integr8scode-saga-orchestrator:latest -f backend/workers/Dockerfile.saga_orchestrator --build-context base=docker-image://integr8scode-base:latest ./backend - + docker build -t integr8scode-coordinator:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.coordinator ./backend + docker build -t integr8scode-k8s-worker:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.k8s_worker ./backend + docker build -t integr8scode-pod-monitor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.pod_monitor ./backend + docker build -t integr8scode-result-processor:latest --build-context base=docker-image://integr8scode-base:latest -f 
backend/workers/Dockerfile.result_processor ./backend + docker build -t integr8scode-saga-orchestrator:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.saga_orchestrator ./backend + docker build -t integr8scode-event-replay:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.event_replay ./backend + docker build -t integr8scode-dlq-processor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.dlq_processor ./backend + + - name: Push backend and workers to GHCR + if: github.event_name != 'pull_request' + env: + TAG: ${{ steps.tags.outputs.sha-tag }} + IMG: ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }} + run: | + docker tag integr8scode-backend:latest "$IMG/backend:$TAG" + docker tag integr8scode-coordinator:latest "$IMG/coordinator:$TAG" + docker tag integr8scode-k8s-worker:latest "$IMG/k8s-worker:$TAG" + docker tag integr8scode-pod-monitor:latest "$IMG/pod-monitor:$TAG" + docker tag integr8scode-result-processor:latest "$IMG/result-processor:$TAG" + docker tag integr8scode-saga-orchestrator:latest "$IMG/saga-orchestrator:$TAG" + docker tag integr8scode-event-replay:latest "$IMG/event-replay:$TAG" + docker tag integr8scode-dlq-processor:latest "$IMG/dlq-processor:$TAG" + + docker push "$IMG/backend:$TAG" + docker push "$IMG/coordinator:$TAG" + docker push "$IMG/k8s-worker:$TAG" + docker push "$IMG/pod-monitor:$TAG" + docker push "$IMG/result-processor:$TAG" + docker push "$IMG/saga-orchestrator:$TAG" + docker push "$IMG/event-replay:$TAG" + docker push "$IMG/dlq-processor:$TAG" + + # ── Utility images (GHA-cached, independent of base) ──────────── - name: Build cert-generator image uses: docker/build-push-action@v6 with: @@ -158,6 +221,31 @@ jobs: cache-from: type=gha,scope=cert-generator cache-to: type=gha,mode=max,scope=cert-generator + - name: Push cert-generator to GHCR + if: github.event_name != 'pull_request' 
+ run: | + docker tag integr8scode-cert-generator:latest \ + ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ steps.tags.outputs.sha-tag }} + docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ steps.tags.outputs.sha-tag }} + + - name: Build zookeeper-certgen image + uses: docker/build-push-action@v6 + with: + context: ./backend/zookeeper + file: ./backend/zookeeper/Dockerfile.certgen + load: true + tags: integr8scode-zookeeper-certgen:latest + cache-from: type=gha,scope=zookeeper-certgen + cache-to: type=gha,mode=max,scope=zookeeper-certgen + + - name: Push zookeeper-certgen to GHCR + if: github.event_name != 'pull_request' + run: | + docker tag integr8scode-zookeeper-certgen:latest \ + ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }} + docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }} + + # ── Frontend (dev for E2E, prod for scanning/deployment) ───────── - name: Build frontend image uses: docker/build-push-action@v6 with: @@ -168,7 +256,33 @@ jobs: cache-from: type=gha,scope=frontend cache-to: type=gha,mode=max,scope=frontend + - name: Push frontend-dev to GHCR + if: github.event_name != 'pull_request' + run: | + docker tag integr8scode-frontend:latest \ + ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }} + docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }} + + - name: Build frontend-prod image + uses: docker/build-push-action@v6 + with: + context: ./frontend + file: ./frontend/Dockerfile.prod + load: true + tags: integr8scode-frontend-prod:latest + cache-from: type=gha,scope=frontend-prod + cache-to: type=gha,mode=max,scope=frontend-prod + + - name: Push frontend-prod to GHCR + if: github.event_name != 'pull_request' + run: | + docker tag 
integr8scode-frontend-prod:latest \ + ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }} + docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }} + + # ── Save images for PR builds (artifact fallback) ──────────────── - name: Save all images + if: github.event_name == 'pull_request' run: | docker save \ integr8scode-backend:latest \ @@ -177,18 +291,22 @@ jobs: integr8scode-pod-monitor:latest \ integr8scode-result-processor:latest \ integr8scode-saga-orchestrator:latest \ + integr8scode-event-replay:latest \ + integr8scode-dlq-processor:latest \ integr8scode-cert-generator:latest \ + integr8scode-zookeeper-certgen:latest \ integr8scode-frontend:latest \ | zstd -T0 -3 > /tmp/all-images.tar.zst - name: Upload images artifact + if: github.event_name == 'pull_request' uses: actions/upload-artifact@v6 with: name: docker-images path: /tmp/all-images.tar.zst retention-days: 1 - # Parallel test jobs (backend-e2e, frontend-e2e) + # Parallel E2E test jobs backend-e2e: name: Backend E2E Tests needs: [build-images] @@ -201,13 +319,52 @@ jobs: with: images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} + # Push events: pull pre-built images from GHCR + - name: Pull images from GHCR + if: github.event_name != 'pull_request' + env: + TAG: ${{ needs.build-images.outputs.sha-tag }} + IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }} + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin + + docker pull "$IMG/base:$TAG" & + docker pull "$IMG/backend:$TAG" & + docker pull "$IMG/frontend-dev:$TAG" & + docker pull "$IMG/coordinator:$TAG" & + docker pull "$IMG/k8s-worker:$TAG" & + docker pull "$IMG/pod-monitor:$TAG" & + docker pull "$IMG/result-processor:$TAG" & + docker pull "$IMG/saga-orchestrator:$TAG" & + docker pull 
"$IMG/event-replay:$TAG" & + docker pull "$IMG/dlq-processor:$TAG" & + docker pull "$IMG/cert-generator:$TAG" & + docker pull "$IMG/zookeeper-certgen:$TAG" & + wait + + docker tag "$IMG/base:$TAG" integr8scode-base:latest + docker tag "$IMG/backend:$TAG" integr8scode-backend:latest + docker tag "$IMG/frontend-dev:$TAG" integr8scode-frontend:latest + docker tag "$IMG/coordinator:$TAG" integr8scode-coordinator:latest + docker tag "$IMG/k8s-worker:$TAG" integr8scode-k8s-worker:latest + docker tag "$IMG/pod-monitor:$TAG" integr8scode-pod-monitor:latest + docker tag "$IMG/result-processor:$TAG" integr8scode-result-processor:latest + docker tag "$IMG/saga-orchestrator:$TAG" integr8scode-saga-orchestrator:latest + docker tag "$IMG/event-replay:$TAG" integr8scode-event-replay:latest + docker tag "$IMG/dlq-processor:$TAG" integr8scode-dlq-processor:latest + docker tag "$IMG/cert-generator:$TAG" integr8scode-cert-generator:latest + docker tag "$IMG/zookeeper-certgen:$TAG" integr8scode-zookeeper-certgen:latest + + # PR events: load from artifact - name: Download built images + if: github.event_name == 'pull_request' uses: actions/download-artifact@v7 with: name: docker-images path: /tmp - name: Load built images + if: github.event_name == 'pull_request' run: zstd -d -c /tmp/all-images.tar.zst | docker load - name: Setup k3s @@ -314,13 +471,52 @@ jobs: with: images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} + # Push events: pull pre-built images from GHCR + - name: Pull images from GHCR + if: github.event_name != 'pull_request' + env: + TAG: ${{ needs.build-images.outputs.sha-tag }} + IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }} + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin + + docker pull "$IMG/base:$TAG" & + docker pull "$IMG/backend:$TAG" & + docker pull "$IMG/frontend-dev:$TAG" & + docker pull 
"$IMG/coordinator:$TAG" & + docker pull "$IMG/k8s-worker:$TAG" & + docker pull "$IMG/pod-monitor:$TAG" & + docker pull "$IMG/result-processor:$TAG" & + docker pull "$IMG/saga-orchestrator:$TAG" & + docker pull "$IMG/event-replay:$TAG" & + docker pull "$IMG/dlq-processor:$TAG" & + docker pull "$IMG/cert-generator:$TAG" & + docker pull "$IMG/zookeeper-certgen:$TAG" & + wait + + docker tag "$IMG/base:$TAG" integr8scode-base:latest + docker tag "$IMG/backend:$TAG" integr8scode-backend:latest + docker tag "$IMG/frontend-dev:$TAG" integr8scode-frontend:latest + docker tag "$IMG/coordinator:$TAG" integr8scode-coordinator:latest + docker tag "$IMG/k8s-worker:$TAG" integr8scode-k8s-worker:latest + docker tag "$IMG/pod-monitor:$TAG" integr8scode-pod-monitor:latest + docker tag "$IMG/result-processor:$TAG" integr8scode-result-processor:latest + docker tag "$IMG/saga-orchestrator:$TAG" integr8scode-saga-orchestrator:latest + docker tag "$IMG/event-replay:$TAG" integr8scode-event-replay:latest + docker tag "$IMG/dlq-processor:$TAG" integr8scode-dlq-processor:latest + docker tag "$IMG/cert-generator:$TAG" integr8scode-cert-generator:latest + docker tag "$IMG/zookeeper-certgen:$TAG" integr8scode-zookeeper-certgen:latest + + # PR events: load from artifact - name: Download built images + if: github.event_name == 'pull_request' uses: actions/download-artifact@v7 with: name: docker-images path: /tmp - name: Load built images + if: github.event_name == 'pull_request' run: zstd -d -c /tmp/all-images.tar.zst | docker load - name: Setup k3s From feeebe73ff9aa60dbe6c6f5de188e5fdec9357dc Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 18:58:07 +0100 Subject: [PATCH 03/17] Replaced two separate scan jobs (scan-backend, scan-frontend) with a single matrix job (scan) that scans all 12 images in parallel: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fail-fast: false — one image's vulnerability findings don't cancel the other scans 
- Each matrix entry runs as its own parallel job on a separate runner - SARIF results uploaded per-image with unique categories (trivy-base, trivy-backend, etc.) - trivyignores: 'backend/.trivyignore' applied to all images (CVE exemptions are image-agnostic) - checkout@v6 included so the .trivyignore file is available Updated promote.needs from [scan-backend, scan-frontend] to [scan] — waits for all 12 matrix entries to pass before promoting anything to latest. Updated the summary security section to reflect that all 12 images are scanned. --- .github/workflows/docker.yml | 81 +++++++++++-------------------- .github/workflows/stack-tests.yml | 20 ++++++-- 2 files changed, 45 insertions(+), 56 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index c6535c54..9778e7a3 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -17,8 +17,8 @@ env: REGISTRY: ghcr.io jobs: - scan-backend: - name: Scan Backend + scan: + name: Scan ${{ matrix.image }} if: > github.event_name == 'workflow_dispatch' || (github.event.workflow_run.conclusion == 'success' && @@ -28,6 +28,22 @@ jobs: contents: read security-events: write packages: read + strategy: + fail-fast: false + matrix: + image: + - base + - backend + - frontend + - coordinator + - k8s-worker + - pod-monitor + - result-processor + - saga-orchestrator + - event-replay + - dlq-processor + - cert-generator + - zookeeper-certgen steps: - uses: actions/checkout@v6 @@ -41,14 +57,14 @@ jobs: SHA="${{ github.event.workflow_run.head_sha }}" fi TAG="sha-${SHA::7}" - echo "image=${{ env.REGISTRY }}/$PREFIX/backend:$TAG" >> $GITHUB_OUTPUT + echo "image=${{ env.REGISTRY }}/$PREFIX/${{ matrix.image }}:$TAG" >> $GITHUB_OUTPUT - name: Run Trivy vulnerability scanner uses: aquasecurity/trivy-action@0.33.1 with: image-ref: ${{ steps.ref.outputs.image }} format: 'sarif' - output: 'trivy-backend-results.sarif' + output: 'trivy-${{ matrix.image }}-results.sarif' ignore-unfixed: true 
severity: 'CRITICAL,HIGH' timeout: '5m0s' @@ -59,55 +75,13 @@ jobs: if: always() uses: github/codeql-action/upload-sarif@v4 with: - sarif_file: 'trivy-backend-results.sarif' - category: 'trivy-backend' - - scan-frontend: - name: Scan Frontend - if: > - github.event_name == 'workflow_dispatch' || - (github.event.workflow_run.conclusion == 'success' && - github.event.workflow_run.head_branch == 'main') - runs-on: ubuntu-latest - permissions: - contents: read - security-events: write - packages: read - steps: - - name: Compute image ref - id: ref - run: | - PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode" - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - SHA="${{ github.event.inputs.sha || github.sha }}" - else - SHA="${{ github.event.workflow_run.head_sha }}" - fi - TAG="sha-${SHA::7}" - echo "image=${{ env.REGISTRY }}/$PREFIX/frontend:$TAG" >> $GITHUB_OUTPUT - - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@0.33.1 - with: - image-ref: ${{ steps.ref.outputs.image }} - format: 'sarif' - output: 'trivy-frontend-results.sarif' - ignore-unfixed: true - severity: 'CRITICAL,HIGH' - timeout: '5m0s' - version: 'v0.68.2' - - - name: Upload Trivy scan results - if: always() - uses: github/codeql-action/upload-sarif@v4 - with: - sarif_file: 'trivy-frontend-results.sarif' - category: 'trivy-frontend' + sarif_file: 'trivy-${{ matrix.image }}-results.sarif' + category: 'trivy-${{ matrix.image }}' # Promote SHA tag → latest using crane (registry-level manifest copy, no rebuild) promote: name: Promote to Latest - needs: [scan-backend, scan-frontend] + needs: [scan] if: > github.event_name == 'workflow_dispatch' || (github.event.workflow_run.conclusion == 'success' && @@ -169,7 +143,11 @@ jobs: echo "## Docker Images Promoted to Latest" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "All Stack Tests passed. Images promoted from \`$TAG\` to \`latest\`." 
>> $GITHUB_STEP_SUMMARY + if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then + echo "Images promoted manually from \`$TAG\` to \`latest\` — Stack Tests may not have run." >> $GITHUB_STEP_SUMMARY + else + echo "All Stack Tests passed. Images promoted from \`$TAG\` to \`latest\`." >> $GITHUB_STEP_SUMMARY + fi echo "" >> $GITHUB_STEP_SUMMARY echo "| Image | Pull Command |" >> $GITHUB_STEP_SUMMARY echo "|-------|--------------|" >> $GITHUB_STEP_SUMMARY @@ -178,5 +156,4 @@ jobs: echo "| Frontend | \`docker pull $REGISTRY/$PREFIX/frontend:latest\` |" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "### Security Scans" >> $GITHUB_STEP_SUMMARY - echo "- Backend: Passed" >> $GITHUB_STEP_SUMMARY - echo "- Frontend: Passed" >> $GITHUB_STEP_SUMMARY + echo "All 12 images scanned with Trivy (CRITICAL + HIGH, unfixed ignored)." >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index f5257e4b..08651bb8 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -320,14 +320,20 @@ jobs: images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} # Push events: pull pre-built images from GHCR + - name: Log in to GHCR + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Pull images from GHCR if: github.event_name != 'pull_request' env: TAG: ${{ needs.build-images.outputs.sha-tag }} IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }} run: | - echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin - docker pull "$IMG/base:$TAG" & docker pull "$IMG/backend:$TAG" & docker pull "$IMG/frontend-dev:$TAG" & @@ -472,14 +478,20 @@ jobs: images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} 
${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} # Push events: pull pre-built images from GHCR + - name: Log in to GHCR + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Pull images from GHCR if: github.event_name != 'pull_request' env: TAG: ${{ needs.build-images.outputs.sha-tag }} IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }} run: | - echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin - docker pull "$IMG/base:$TAG" & docker pull "$IMG/backend:$TAG" & docker pull "$IMG/frontend-dev:$TAG" & From 3dccec636705b2bfb66350a0a1d0a1eef1f872cd Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 19:28:32 +0100 Subject: [PATCH 04/17] What changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit docker-compose.yaml (+15 lines): Every buildable service now has an image: field pointing to ghcr.io/hardmax71/integr8scode/{service}:${IMAGE_TAG:-latest}. kafka-init and user-seed share the backend image. Compose now knows where to pull pre-built images from. deploy.sh (+10 lines): Added --no-build flag to cmd_dev(). Passes --no-build to compose, preventing any build fallback. stack-tests.yml (-149 lines): - Build job: push condition changed from event_name != 'pull_request' to !github.event.pull_request.head.repo.fork (same-repo PRs can push to GHCR). Artifact save/upload removed entirely. - Both E2E jobs: Deleted all GHCR login, parallel pull, retag, artifact download, and load steps. Replaced with a single IMAGE_TAG env var on the "Start stack" step. Compose pulls SHA-tagged images from GHCR automatically using the image: fields. - Both E2E jobs have if: !fork guard — fork PRs skip E2E (unit tests still run). 
How it works | Scenario | What happens | |---------------------------------------------------------|--------------------------------------------------| | ./deploy.sh dev (local, first time) | Compose pulls latest from GHCR — no build needed | | ./deploy.sh dev --build (local, with changes) | Builds locally, tags with GHCR name | | CI: IMAGE_TAG=sha-xxx ./deploy.sh dev --no-build --wait | Compose pulls sha-tagged images from GHCR | | ./deploy.sh prod | Helm uses GHCR images (unchanged) | --- .github/workflows/stack-tests.yml | 164 ++++-------------------------- deploy.sh | 10 +- docker-compose.yaml | 15 ++- 3 files changed, 40 insertions(+), 149 deletions(-) diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index 08651bb8..48f604fc 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -106,8 +106,8 @@ jobs: fail_ci_if_error: false verbose: true - # Build all images, push to GHCR with immutable SHA tag (push events only). - # PRs fall back to artifact transfer (can't push to GHCR from forks). + # Build all images, push to GHCR with immutable SHA tag. + # Fork PRs skip GHCR push (no write access) — E2E tests require pushed images. 
build-images: name: Build & Push Images needs: [backend-unit, frontend-unit] @@ -125,7 +125,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Log in to GHCR - if: github.event_name != 'pull_request' + if: ${{ !github.event.pull_request.head.repo.fork }} uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} @@ -168,7 +168,7 @@ jobs: run: docker save integr8scode-base:latest | zstd -T0 -3 > /tmp/base-image.tar.zst - name: Push base to GHCR - if: github.event_name != 'pull_request' + if: ${{ !github.event.pull_request.head.repo.fork }} run: | docker tag integr8scode-base:latest \ ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }} @@ -187,7 +187,7 @@ jobs: docker build -t integr8scode-dlq-processor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.dlq_processor ./backend - name: Push backend and workers to GHCR - if: github.event_name != 'pull_request' + if: ${{ !github.event.pull_request.head.repo.fork }} env: TAG: ${{ steps.tags.outputs.sha-tag }} IMG: ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }} @@ -222,7 +222,7 @@ jobs: cache-to: type=gha,mode=max,scope=cert-generator - name: Push cert-generator to GHCR - if: github.event_name != 'pull_request' + if: ${{ !github.event.pull_request.head.repo.fork }} run: | docker tag integr8scode-cert-generator:latest \ ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ steps.tags.outputs.sha-tag }} @@ -239,7 +239,7 @@ jobs: cache-to: type=gha,mode=max,scope=zookeeper-certgen - name: Push zookeeper-certgen to GHCR - if: github.event_name != 'pull_request' + if: ${{ !github.event.pull_request.head.repo.fork }} run: | docker tag integr8scode-zookeeper-certgen:latest \ ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }} @@ -257,7 +257,7 @@ jobs: cache-to: type=gha,mode=max,scope=frontend - name: Push frontend-dev to GHCR - if: 
github.event_name != 'pull_request' + if: ${{ !github.event.pull_request.head.repo.fork }} run: | docker tag integr8scode-frontend:latest \ ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }} @@ -274,42 +274,17 @@ jobs: cache-to: type=gha,mode=max,scope=frontend-prod - name: Push frontend-prod to GHCR - if: github.event_name != 'pull_request' + if: ${{ !github.event.pull_request.head.repo.fork }} run: | docker tag integr8scode-frontend-prod:latest \ ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }} docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }} - # ── Save images for PR builds (artifact fallback) ──────────────── - - name: Save all images - if: github.event_name == 'pull_request' - run: | - docker save \ - integr8scode-backend:latest \ - integr8scode-coordinator:latest \ - integr8scode-k8s-worker:latest \ - integr8scode-pod-monitor:latest \ - integr8scode-result-processor:latest \ - integr8scode-saga-orchestrator:latest \ - integr8scode-event-replay:latest \ - integr8scode-dlq-processor:latest \ - integr8scode-cert-generator:latest \ - integr8scode-zookeeper-certgen:latest \ - integr8scode-frontend:latest \ - | zstd -T0 -3 > /tmp/all-images.tar.zst - - - name: Upload images artifact - if: github.event_name == 'pull_request' - uses: actions/upload-artifact@v6 - with: - name: docker-images - path: /tmp/all-images.tar.zst - retention-days: 1 - - # Parallel E2E test jobs + # Parallel E2E test jobs — compose pulls from GHCR using IMAGE_TAG backend-e2e: name: Backend E2E Tests needs: [build-images] + if: ${{ !github.event.pull_request.head.repo.fork }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 @@ -319,60 +294,6 @@ jobs: with: images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} - # Push events: pull 
pre-built images from GHCR - - name: Log in to GHCR - if: github.event_name != 'pull_request' - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Pull images from GHCR - if: github.event_name != 'pull_request' - env: - TAG: ${{ needs.build-images.outputs.sha-tag }} - IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }} - run: | - docker pull "$IMG/base:$TAG" & - docker pull "$IMG/backend:$TAG" & - docker pull "$IMG/frontend-dev:$TAG" & - docker pull "$IMG/coordinator:$TAG" & - docker pull "$IMG/k8s-worker:$TAG" & - docker pull "$IMG/pod-monitor:$TAG" & - docker pull "$IMG/result-processor:$TAG" & - docker pull "$IMG/saga-orchestrator:$TAG" & - docker pull "$IMG/event-replay:$TAG" & - docker pull "$IMG/dlq-processor:$TAG" & - docker pull "$IMG/cert-generator:$TAG" & - docker pull "$IMG/zookeeper-certgen:$TAG" & - wait - - docker tag "$IMG/base:$TAG" integr8scode-base:latest - docker tag "$IMG/backend:$TAG" integr8scode-backend:latest - docker tag "$IMG/frontend-dev:$TAG" integr8scode-frontend:latest - docker tag "$IMG/coordinator:$TAG" integr8scode-coordinator:latest - docker tag "$IMG/k8s-worker:$TAG" integr8scode-k8s-worker:latest - docker tag "$IMG/pod-monitor:$TAG" integr8scode-pod-monitor:latest - docker tag "$IMG/result-processor:$TAG" integr8scode-result-processor:latest - docker tag "$IMG/saga-orchestrator:$TAG" integr8scode-saga-orchestrator:latest - docker tag "$IMG/event-replay:$TAG" integr8scode-event-replay:latest - docker tag "$IMG/dlq-processor:$TAG" integr8scode-dlq-processor:latest - docker tag "$IMG/cert-generator:$TAG" integr8scode-cert-generator:latest - docker tag "$IMG/zookeeper-certgen:$TAG" integr8scode-zookeeper-certgen:latest - - # PR events: load from artifact - - name: Download built images - if: github.event_name == 'pull_request' - uses: actions/download-artifact@v7 - with: - name: docker-images - path: /tmp - - - 
name: Load built images - if: github.event_name == 'pull_request' - run: zstd -d -c /tmp/all-images.tar.zst | docker load - - name: Setup k3s uses: ./.github/actions/k3s-setup @@ -382,7 +303,9 @@ jobs: cp backend/secrets.example.toml backend/secrets.toml - name: Start stack - run: ./deploy.sh dev --wait + env: + IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + run: ./deploy.sh dev --no-build --wait - name: Seed test users run: docker compose exec -T backend uv run python scripts/seed_users.py @@ -441,6 +364,7 @@ jobs: frontend-e2e: name: Frontend E2E Tests needs: [build-images] + if: ${{ !github.event.pull_request.head.repo.fork }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 @@ -477,60 +401,6 @@ jobs: with: images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} - # Push events: pull pre-built images from GHCR - - name: Log in to GHCR - if: github.event_name != 'pull_request' - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Pull images from GHCR - if: github.event_name != 'pull_request' - env: - TAG: ${{ needs.build-images.outputs.sha-tag }} - IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }} - run: | - docker pull "$IMG/base:$TAG" & - docker pull "$IMG/backend:$TAG" & - docker pull "$IMG/frontend-dev:$TAG" & - docker pull "$IMG/coordinator:$TAG" & - docker pull "$IMG/k8s-worker:$TAG" & - docker pull "$IMG/pod-monitor:$TAG" & - docker pull "$IMG/result-processor:$TAG" & - docker pull "$IMG/saga-orchestrator:$TAG" & - docker pull "$IMG/event-replay:$TAG" & - docker pull "$IMG/dlq-processor:$TAG" & - docker pull "$IMG/cert-generator:$TAG" & - docker pull "$IMG/zookeeper-certgen:$TAG" & - wait - - docker tag "$IMG/base:$TAG" integr8scode-base:latest - docker tag "$IMG/backend:$TAG" integr8scode-backend:latest - docker tag 
"$IMG/frontend-dev:$TAG" integr8scode-frontend:latest - docker tag "$IMG/coordinator:$TAG" integr8scode-coordinator:latest - docker tag "$IMG/k8s-worker:$TAG" integr8scode-k8s-worker:latest - docker tag "$IMG/pod-monitor:$TAG" integr8scode-pod-monitor:latest - docker tag "$IMG/result-processor:$TAG" integr8scode-result-processor:latest - docker tag "$IMG/saga-orchestrator:$TAG" integr8scode-saga-orchestrator:latest - docker tag "$IMG/event-replay:$TAG" integr8scode-event-replay:latest - docker tag "$IMG/dlq-processor:$TAG" integr8scode-dlq-processor:latest - docker tag "$IMG/cert-generator:$TAG" integr8scode-cert-generator:latest - docker tag "$IMG/zookeeper-certgen:$TAG" integr8scode-zookeeper-certgen:latest - - # PR events: load from artifact - - name: Download built images - if: github.event_name == 'pull_request' - uses: actions/download-artifact@v7 - with: - name: docker-images - path: /tmp - - - name: Load built images - if: github.event_name == 'pull_request' - run: zstd -d -c /tmp/all-images.tar.zst | docker load - - name: Setup k3s uses: ./.github/actions/k3s-setup @@ -540,7 +410,9 @@ jobs: cp backend/secrets.example.toml backend/secrets.toml - name: Start stack - run: ./deploy.sh dev --wait + env: + IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + run: ./deploy.sh dev --no-build --wait - name: Seed test users run: docker compose exec -T backend uv run python scripts/seed_users.py diff --git a/deploy.sh b/deploy.sh index 68f8b25a..a7dc8bec 100755 --- a/deploy.sh +++ b/deploy.sh @@ -56,7 +56,8 @@ show_help() { echo "" echo "Commands:" echo " dev [options] Start full stack (docker-compose)" - echo " --build Rebuild images" + echo " --build Rebuild images locally" + echo " --no-build Use pre-built images only (no build fallback)" echo " --wait Wait for services to be healthy" echo " --timeout Health check timeout (default: 300)" echo " --observability Include Grafana, Jaeger, etc." 
@@ -99,6 +100,7 @@ cmd_dev() { print_header "Starting Local Development Environment" local BUILD_FLAG="" + local NO_BUILD_FLAG="" local WAIT_FLAG="" local WAIT_TIMEOUT="300" local PROFILE_FLAGS="" @@ -109,6 +111,10 @@ cmd_dev() { BUILD_FLAG="--build" print_info "Rebuilding images..." ;; + --no-build) + NO_BUILD_FLAG="--no-build" + print_info "Using pre-built images (skipping build)..." + ;; --wait) WAIT_FLAG="--wait" ;; @@ -133,7 +139,7 @@ cmd_dev() { WAIT_TIMEOUT_FLAG="--wait-timeout $WAIT_TIMEOUT" fi - docker compose $PROFILE_FLAGS up -d $BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG + docker compose $PROFILE_FLAGS up -d $BUILD_FLAG $NO_BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG echo "" print_success "Development environment started!" diff --git a/docker-compose.yaml b/docker-compose.yaml index 8d3cd5fd..060a955c 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,10 +1,10 @@ services: # Shared base image for all Python backend services base: + image: ghcr.io/hardmax71/integr8scode/base:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: Dockerfile.base - image: integr8scode-base:latest shared-ca: image: alpine:latest @@ -15,6 +15,7 @@ services: - app-network cert-generator: + image: ghcr.io/hardmax71/integr8scode/cert-generator:${IMAGE_TAG:-latest} build: context: ./cert-generator dockerfile: Dockerfile @@ -80,6 +81,7 @@ services: start_period: 5s backend: + image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: Dockerfile @@ -127,6 +129,7 @@ services: start_period: 5s frontend: + image: ghcr.io/hardmax71/integr8scode/frontend-dev:${IMAGE_TAG:-latest} container_name: frontend build: context: ./frontend @@ -175,6 +178,7 @@ services: # Kafka Infrastructure for Event-Driven Design # Certificate generator for Zookeeper/Kafka SSL zookeeper-certgen: + image: ghcr.io/hardmax71/integr8scode/zookeeper-certgen:${IMAGE_TAG:-latest} build: context: ./backend/zookeeper dockerfile: Dockerfile.certgen @@ -362,6 
+366,7 @@ services: # Kafka topic initialization kafka-init: + image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: Dockerfile @@ -388,6 +393,7 @@ services: # Seed default users (runs once after mongo is ready) user-seed: + image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: Dockerfile @@ -412,6 +418,7 @@ services: # Event-driven workers coordinator: + image: ghcr.io/hardmax71/integr8scode/coordinator:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: workers/Dockerfile.coordinator @@ -436,6 +443,7 @@ services: restart: unless-stopped k8s-worker: + image: ghcr.io/hardmax71/integr8scode/k8s-worker:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: workers/Dockerfile.k8s_worker @@ -463,6 +471,7 @@ services: restart: unless-stopped pod-monitor: + image: ghcr.io/hardmax71/integr8scode/pod-monitor:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: workers/Dockerfile.pod_monitor @@ -488,6 +497,7 @@ services: restart: unless-stopped result-processor: + image: ghcr.io/hardmax71/integr8scode/result-processor:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: workers/Dockerfile.result_processor @@ -515,6 +525,7 @@ services: restart: unless-stopped saga-orchestrator: + image: ghcr.io/hardmax71/integr8scode/saga-orchestrator:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: workers/Dockerfile.saga_orchestrator @@ -560,6 +571,7 @@ services: # Event replay service event-replay: + image: ghcr.io/hardmax71/integr8scode/event-replay:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: workers/Dockerfile.event_replay @@ -586,6 +598,7 @@ services: # DLQ Processor Service dlq-processor: + image: ghcr.io/hardmax71/integr8scode/dlq-processor:${IMAGE_TAG:-latest} build: context: ./backend dockerfile: workers/Dockerfile.dlq_processor From 16152ba309c70cb5730967cfdbec6390f0d7e9f7 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 
2026 20:21:21 +0100 Subject: [PATCH 05/17] 1. Playwright Sharding (frontend-e2e) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added strategy.matrix with shardIndex: [1, 2] and shardTotal: [2] - fail-fast: false so one shard failing doesn't cancel the other - Test command: npx playwright test --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} - Artifact names include shard index to avoid collisions: playwright-report-1, playwright-report-2, frontend-e2e-logs-1, etc. - Job name shows shard: Frontend E2E (1/2), Frontend E2E (2/2) 2. GHCR Pre-pull (both E2E jobs) - Immediately after checkout, docker compose pull --quiet starts in the background via nohup - While GHCR images pull, the subsequent setup steps run in parallel: - backend-e2e: Docker cache load + k3s install (~85s of overlap) - frontend-e2e: Node setup + npm ci + Playwright install + Docker cache + k3s (~150s of overlap) - A "Wait for GHCR images" step before "Start stack" ensures pull is complete - "Start stack" then finds images already local — skips pulling entirely --- .github/workflows/stack-tests.yml | 51 ++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index 48f604fc..b535dbde 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -289,6 +289,14 @@ jobs: steps: - uses: actions/checkout@v6 + - name: Pre-pull GHCR images (background) + env: + IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + run: | + nohup bash -c "IMAGE_TAG=$IMAGE_TAG docker compose pull --quiet" \ + > /tmp/ghcr-pull.log 2>&1 & + echo $! 
> /tmp/ghcr-pull.pid + - name: Cache and load Docker images uses: ./.github/actions/docker-cache with: @@ -302,6 +310,17 @@ jobs: cp backend/config.test.toml backend/config.toml cp backend/secrets.example.toml backend/secrets.toml + - name: Wait for GHCR images + run: | + if [ -f /tmp/ghcr-pull.pid ]; then + PID=$(cat /tmp/ghcr-pull.pid) + if kill -0 "$PID" 2>/dev/null; then + echo "Waiting for GHCR image pull to complete..." + tail --pid="$PID" -f /dev/null 2>/dev/null || true + fi + cat /tmp/ghcr-pull.log 2>/dev/null || true + fi + - name: Start stack env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} @@ -362,13 +381,26 @@ jobs: path: logs/ frontend-e2e: - name: Frontend E2E Tests + name: Frontend E2E (${{ matrix.shardIndex }}/${{ matrix.shardTotal }}) needs: [build-images] if: ${{ !github.event.pull_request.head.repo.fork }} runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + shardIndex: [1, 2] + shardTotal: [2] steps: - uses: actions/checkout@v6 + - name: Pre-pull GHCR images (background) + env: + IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + run: | + nohup bash -c "IMAGE_TAG=$IMAGE_TAG docker compose pull --quiet" \ + > /tmp/ghcr-pull.log 2>&1 & + echo $! > /tmp/ghcr-pull.pid + - name: Setup Node.js uses: actions/setup-node@v6 with: @@ -409,6 +441,17 @@ jobs: cp backend/config.test.toml backend/config.toml cp backend/secrets.example.toml backend/secrets.toml + - name: Wait for GHCR images + run: | + if [ -f /tmp/ghcr-pull.pid ]; then + PID=$(cat /tmp/ghcr-pull.pid) + if kill -0 "$PID" 2>/dev/null; then + echo "Waiting for GHCR image pull to complete..." 
+ tail --pid="$PID" -f /dev/null 2>/dev/null || true + fi + cat /tmp/ghcr-pull.log 2>/dev/null || true + fi + - name: Start stack env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} @@ -420,13 +463,13 @@ jobs: - name: Run Playwright tests timeout-minutes: 10 working-directory: frontend - run: CI=true npx playwright test + run: CI=true npx playwright test --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }} - name: Upload Playwright report uses: actions/upload-artifact@v6 if: always() with: - name: playwright-report + name: playwright-report-${{ matrix.shardIndex }} path: frontend/playwright-report/ - name: Collect logs on failure @@ -441,5 +484,5 @@ jobs: if: failure() uses: actions/upload-artifact@v6 with: - name: frontend-e2e-logs + name: frontend-e2e-logs-${{ matrix.shardIndex }} path: logs/ From 7dd2244093cef6a39b531091cbdc3d6921b5fb38 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 20:45:23 +0100 Subject: [PATCH 06/17] =?UTF-8?q?Here's=20what=20this=20adds=20=E2=80=94?= =?UTF-8?q?=20infrastructure=20pre-warming:?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit How it works Both E2E jobs now have this timeline: Step 2: Pre-pull GHCR images ──────────────────────────────── (background) Step 3-7: Node/Playwright/Docker cache setup ──────────────── (foreground, ~50s) Step 8: Docker-cache loads infra images ───────────────────── (~15s) Step 9: Pre-warm infrastructure ───────────────────────────── (background, starts immediately) ├── mongo + redis start (~5s to healthy) ├── shared-ca + cert-gen + zk-certgen start (~5s) ├── zookeeper starts after zk-certgen (~15s) ├── kafka starts after zookeeper healthy (~20s) └── schema-registry starts after kafka (~10s) Step 10: k3s install ──────────────────────────────────────── (~42s, OVERLAPS with infra chain) Step 12: Wait for background tasks ────────────────────────── (both should be done) Step 13: Start stack ──────────────────────────────────────── 
(infra already healthy, only app services) Expected impact on "Start stack" | Component | Before | After | |---------------------------------|---------------------------|------------------------------| | Infra initialization (zk chain) | ~50s (during Start stack) | 0s (already done during k3s) | | App image pull | ~60s | 0s (pre-pulled) | | App service startup | ~30s | ~30s | | Health check waits | ~20s | ~20s | | Total "Start stack" | ~2:20 | ~0:50 | --- .github/workflows/stack-tests.yml | 59 ++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index b535dbde..8394b17e 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -302,6 +302,15 @@ jobs: with: images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} + - name: Pre-warm infrastructure (background) + env: + IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + run: | + nohup docker compose up -d --no-build \ + mongo redis shared-ca cert-generator zookeeper-certgen zookeeper kafka schema-registry \ + > /tmp/infra-warm.log 2>&1 & + echo $! > /tmp/infra-warm.pid + - name: Setup k3s uses: ./.github/actions/k3s-setup @@ -310,16 +319,19 @@ jobs: cp backend/config.test.toml backend/config.toml cp backend/secrets.example.toml backend/secrets.toml - - name: Wait for GHCR images + - name: Wait for background tasks run: | - if [ -f /tmp/ghcr-pull.pid ]; then - PID=$(cat /tmp/ghcr-pull.pid) - if kill -0 "$PID" 2>/dev/null; then - echo "Waiting for GHCR image pull to complete..." - tail --pid="$PID" -f /dev/null 2>/dev/null || true + for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do + if [ -f "$pidfile" ]; then + PID=$(cat "$pidfile") + if kill -0 "$PID" 2>/dev/null; then + echo "Waiting for $(basename $pidfile .pid)..." 
+ tail --pid="$PID" -f /dev/null 2>/dev/null || true + fi fi - cat /tmp/ghcr-pull.log 2>/dev/null || true - fi + done + cat /tmp/ghcr-pull.log 2>/dev/null || true + cat /tmp/infra-warm.log 2>/dev/null || true - name: Start stack env: @@ -433,6 +445,18 @@ jobs: with: images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} + - name: Pre-warm infrastructure (background) + env: + IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + run: | + # Start infra services in background while k3s installs. + # Compose handles dependency ordering (zookeeper-certgen → zookeeper → kafka). + # GHCR images for cert-generator/zookeeper-certgen should be pre-pulled by now. + nohup docker compose up -d --no-build \ + mongo redis shared-ca cert-generator zookeeper-certgen zookeeper kafka schema-registry \ + > /tmp/infra-warm.log 2>&1 & + echo $! > /tmp/infra-warm.pid + - name: Setup k3s uses: ./.github/actions/k3s-setup @@ -441,16 +465,19 @@ jobs: cp backend/config.test.toml backend/config.toml cp backend/secrets.example.toml backend/secrets.toml - - name: Wait for GHCR images + - name: Wait for background tasks run: | - if [ -f /tmp/ghcr-pull.pid ]; then - PID=$(cat /tmp/ghcr-pull.pid) - if kill -0 "$PID" 2>/dev/null; then - echo "Waiting for GHCR image pull to complete..." - tail --pid="$PID" -f /dev/null 2>/dev/null || true + for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do + if [ -f "$pidfile" ]; then + PID=$(cat "$pidfile") + if kill -0 "$PID" 2>/dev/null; then + echo "Waiting for $(basename $pidfile .pid)..." 
+ tail --pid="$PID" -f /dev/null 2>/dev/null || true + fi fi - cat /tmp/ghcr-pull.log 2>/dev/null || true - fi + done + cat /tmp/ghcr-pull.log 2>/dev/null || true + cat /tmp/infra-warm.log 2>/dev/null || true - name: Start stack env: From 69ed72a1cbb8ab1fcfa3820ff143db39c40ad565 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 20:55:48 +0100 Subject: [PATCH 07/17] The root cause: cert-generator service in docker-compose.yaml mounts ~/.kube:/root/.kube. When Docker creates that bind mount source directory, it creates it as root:root. Then k3s-setup's sudo k3s kubectl config view --raw > /home/runner/.kube/config fails because the shell redirect (>) runs as the runner user who can't write to the root-owned directory. --- .github/workflows/stack-tests.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index 8394b17e..924316cc 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -306,8 +306,10 @@ jobs: env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} run: | + # Start infra services in background while k3s installs. + # cert-generator is excluded: it needs k3s and mounts ~/.kube (conflicts with k3s-setup). nohup docker compose up -d --no-build \ - mongo redis shared-ca cert-generator zookeeper-certgen zookeeper kafka schema-registry \ + mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \ > /tmp/infra-warm.log 2>&1 & echo $! > /tmp/infra-warm.pid @@ -451,9 +453,9 @@ jobs: run: | # Start infra services in background while k3s installs. # Compose handles dependency ordering (zookeeper-certgen → zookeeper → kafka). - # GHCR images for cert-generator/zookeeper-certgen should be pre-pulled by now. + # cert-generator is excluded: it needs k3s and mounts ~/.kube (conflicts with k3s-setup). 
nohup docker compose up -d --no-build \ - mongo redis shared-ca cert-generator zookeeper-certgen zookeeper kafka schema-registry \ + mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \ > /tmp/infra-warm.log 2>&1 & echo $! > /tmp/infra-warm.pid From 815d6c605deec62dd5ea6094438eecae609ffe4b Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 21:24:09 +0100 Subject: [PATCH 08/17] Backend E2E step reorder (stack-tests.yml): MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit | Before | After | |-----------------------------------------|----------------------------------------| | 1. checkout | 1. checkout | | 2. GHCR pre-pull (bg) | 2. GHCR pre-pull (bg) | | 3. docker-cache | 3. config copy (moved up) | | 4. infra pre-warm (bg) | 4. Install k3s (split from composite) | | 5. k3s-setup (composite, ~45s blocking) | 5. docker-cache (runs during k3s boot) | | 6. config copy | 6. infra pre-warm (bg) | | 7. wait for bg | 7. Finalize k3s (~25s+ after install) | | 8. start stack | 8. wait for bg | | | 9. start stack | Key gain: k3s boot (30s) now overlaps with docker-cache (10-18s) instead of blocking sequentially. The composite k3s-setup action is inlined as "Install k3s" + "Finalize k3s", same pattern as frontend-e2e. Complete optimization summary across both files: 1. docker-compose.yaml — Tightened health check intervals (5s→2-3s) and start periods (10s→3-5s) across all 7 services 2. frontend-e2e — Inlined k3s, overlaps boot with Node + npm ci + Playwright (~50s overlap) 3. backend-e2e — Inlined k3s, overlaps boot with docker-cache (~15s overlap) 4. 
Both YAML files validated --- .github/workflows/stack-tests.yml | 87 +++++++++++++++++++++---------- docker-compose.yaml | 42 +++++++-------- 2 files changed, 81 insertions(+), 48 deletions(-) diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index 924316cc..e6019318 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -289,6 +289,7 @@ jobs: steps: - uses: actions/checkout@v6 + # ── Phase 1: Start background tasks + infra ── - name: Pre-pull GHCR images (background) env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} @@ -297,6 +298,16 @@ jobs: > /tmp/ghcr-pull.log 2>&1 & echo $! > /tmp/ghcr-pull.pid + - name: Use test environment config + run: | + cp backend/config.test.toml backend/config.toml + cp backend/secrets.example.toml backend/secrets.toml + + # ── Phase 2: Install k3s, then overlap boot with docker-cache + infra ── + - name: Install k3s + run: | + curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh - + - name: Cache and load Docker images uses: ./.github/actions/docker-cache with: @@ -306,21 +317,27 @@ jobs: env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} run: | - # Start infra services in background while k3s installs. - # cert-generator is excluded: it needs k3s and mounts ~/.kube (conflicts with k3s-setup). + # Start infra while k3s finishes booting (~25s+ since install). + # cert-generator excluded: needs k3s and mounts ~/.kube. nohup docker compose up -d --no-build \ mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \ > /tmp/infra-warm.log 2>&1 & echo $! 
> /tmp/infra-warm.pid - - name: Setup k3s - uses: ./.github/actions/k3s-setup - - - name: Use test environment config + # ── Phase 3: Finalize k3s (should be ready — 25s+ since install) ── + - name: Finalize k3s run: | - cp backend/config.test.toml backend/config.toml - cp backend/secrets.example.toml backend/secrets.toml - + mkdir -p /home/runner/.kube + sudo k3s kubectl config view --raw > /home/runner/.kube/config + sudo chmod 600 /home/runner/.kube/config + export KUBECONFIG=/home/runner/.kube/config + timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done' + kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f - + sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \ + /home/runner/.kube/config > backend/kubeconfig.yaml + chmod 644 backend/kubeconfig.yaml + + # ── Phase 4: Start remaining services (infra already healthy from pre-warm) ── - name: Wait for background tasks run: | for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do @@ -407,6 +424,7 @@ jobs: steps: - uses: actions/checkout@v6 + # ── Phase 1: Start background tasks + infra (runs during all subsequent steps) ── - name: Pre-pull GHCR images (background) env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} @@ -415,6 +433,27 @@ jobs: > /tmp/ghcr-pull.log 2>&1 & echo $! > /tmp/ghcr-pull.pid + - name: Cache and load Docker images + uses: ./.github/actions/docker-cache + with: + images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} + + - name: Pre-warm infrastructure (background) + env: + IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + run: | + # Start infra while k3s installs + Playwright sets up (~60s of overlap). + # cert-generator excluded: needs k3s and mounts ~/.kube. 
+ nohup docker compose up -d --no-build \ + mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \ + > /tmp/infra-warm.log 2>&1 & + echo $! > /tmp/infra-warm.pid + + # ── Phase 2: k3s install + Node/Playwright setup (overlapped) ── + - name: Install k3s + run: | + curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh - + - name: Setup Node.js uses: actions/setup-node@v6 with: @@ -442,31 +481,25 @@ jobs: working-directory: frontend run: npx playwright install chromium - - name: Cache and load Docker images - uses: ./.github/actions/docker-cache - with: - images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} - - - name: Pre-warm infrastructure (background) - env: - IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + # ── Phase 3: Finalize k3s (should be ready — 50s+ since install) ── + - name: Finalize k3s run: | - # Start infra services in background while k3s installs. - # Compose handles dependency ordering (zookeeper-certgen → zookeeper → kafka). - # cert-generator is excluded: it needs k3s and mounts ~/.kube (conflicts with k3s-setup). - nohup docker compose up -d --no-build \ - mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \ - > /tmp/infra-warm.log 2>&1 & - echo $! 
> /tmp/infra-warm.pid - - - name: Setup k3s - uses: ./.github/actions/k3s-setup + mkdir -p /home/runner/.kube + sudo k3s kubectl config view --raw > /home/runner/.kube/config + sudo chmod 600 /home/runner/.kube/config + export KUBECONFIG=/home/runner/.kube/config + timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done' + kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f - + sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \ + /home/runner/.kube/config > backend/kubeconfig.yaml + chmod 644 backend/kubeconfig.yaml - name: Use test environment config run: | cp backend/config.test.toml backend/config.toml cp backend/secrets.example.toml backend/secrets.toml + # ── Phase 4: Start remaining services (infra already healthy from pre-warm) ── - name: Wait for background tasks run: | for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do diff --git a/docker-compose.yaml b/docker-compose.yaml index 060a955c..9f9fdd27 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -58,10 +58,10 @@ services: hard: 65536 healthcheck: test: echo 'db.runCommand("ping").ok' | mongosh localhost/integr8scode -u ${MONGO_ROOT_USER:-root} -p ${MONGO_ROOT_PASSWORD:-rootpassword} --authenticationDatabase admin --quiet - interval: 5s + interval: 3s timeout: 5s - retries: 10 - start_period: 10s + retries: 15 + start_period: 5s redis: image: redis:7-alpine @@ -75,10 +75,10 @@ services: - app-network healthcheck: test: ["CMD", "redis-cli", "ping"] - interval: 5s - timeout: 5s - retries: 5 - start_period: 5s + interval: 2s + timeout: 3s + retries: 10 + start_period: 2s backend: image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest} @@ -123,10 +123,10 @@ services: - "host.docker.internal:host-gateway" healthcheck: test: ["CMD-SHELL", "curl -k -f -s https://localhost:443/api/v1/health/live >/dev/null || exit 1"] - interval: 3s + interval: 2s timeout: 3s - retries: 50 - start_period: 5s + 
retries: 30 + start_period: 3s frontend: image: ghcr.io/hardmax71/integr8scode/frontend-dev:${IMAGE_TAG:-latest} @@ -153,10 +153,10 @@ services: - NODE_EXTRA_CA_CERTS=/shared_ca/mkcert-ca.pem healthcheck: test: ["CMD-SHELL", "curl -k -f -s https://localhost:5001 >/dev/null || exit 1"] - interval: 3s + interval: 2s timeout: 3s retries: 30 - start_period: 5s + start_period: 3s grafana: @@ -261,10 +261,10 @@ services: hard: 65536 healthcheck: test: ["CMD-SHELL", "echo ruok | nc localhost 2181 | grep imok"] - interval: 5s + interval: 3s timeout: 5s - retries: 10 - start_period: 10s + retries: 15 + start_period: 5s kafka: image: confluentinc/cp-kafka:7.8.2 @@ -322,10 +322,10 @@ services: hard: 65536 healthcheck: test: ["CMD-SHELL", "kafka-broker-api-versions --bootstrap-server localhost:9092"] - interval: 5s + interval: 3s timeout: 10s - retries: 12 - start_period: 5s + retries: 15 + start_period: 3s schema-registry: image: confluentinc/cp-schema-registry:7.8.2 @@ -343,10 +343,10 @@ services: - app-network healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8081/config"] - interval: 5s + interval: 3s timeout: 5s - retries: 10 - start_period: 10s + retries: 15 + start_period: 5s kafdrop: image: obsidiandynamics/kafdrop:3.31.0 From 073a3d7aef0d2f88da42ad7e6aa0a9a737ed9331 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 21:33:17 +0100 Subject: [PATCH 09/17] =?UTF-8?q?=20Before=20(19=20steps,=206=20sequential?= =?UTF-8?q?=20push=20steps=20=3D=20~81s=20pushing):=20=20=20Build=20base?= =?UTF-8?q?=20=E2=86=92=20Push=20base=20(13s)=20=E2=86=92=20Build=208=20wo?= =?UTF-8?q?rkers=20=E2=86=92=20Push=208=20workers=20(35s=20sequential)=20?= =?UTF-8?q?=20=20=E2=86=92=20Build=20cert-gen=20=E2=86=92=20Push=20cert-ge?= =?UTF-8?q?n=20(7s)=20=E2=86=92=20Build=20zk-certgen=20=E2=86=92=20Push=20?= =?UTF-8?q?zk-certgen=20(8s)=20=20=20=E2=86=92=20Build=20frontend=20?= =?UTF-8?q?=E2=86=92=20Push=20frontend-dev=20(12s)=20=E2=86=92=20Build=20f?= 
=?UTF-8?q?rontend-prod=20=E2=86=92=20Push=20frontend-prod=20(6s)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After (14 steps, 1 parallel push step): Build base → Build 8 workers → Build cert-gen → Build zk-certgen → Build frontend → Build frontend-prod → Push all 13 in parallel (~15-20s) Expected savings: ~60s (81s sequential → ~20s parallel). Job should drop from 2m 48s → ~1m 50s. The builds are all done first (same total time), then all 13 pushes fire concurrently. Since they share base layers, Docker deduplicates — the first push uploads shared layers and the rest skip them. --- .github/workflows/stack-tests.yml | 86 +++++++++++-------------------- 1 file changed, 29 insertions(+), 57 deletions(-) diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index e6019318..ce43f6c4 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -167,15 +167,8 @@ jobs: if: steps.base-cache.outputs.cache-hit != 'true' run: docker save integr8scode-base:latest | zstd -T0 -3 > /tmp/base-image.tar.zst - - name: Push base to GHCR - if: ${{ !github.event.pull_request.head.repo.fork }} - run: | - docker tag integr8scode-base:latest \ - ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }} - docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }} - # ── Backend + workers (depend on local base image) ─────────────── - - name: Build all images + - name: Build backend and worker images run: | docker build -t integr8scode-backend:latest --build-context base=docker-image://integr8scode-base:latest -f ./backend/Dockerfile ./backend docker build -t integr8scode-coordinator:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.coordinator ./backend @@ -186,30 +179,6 @@ jobs: docker build -t integr8scode-event-replay:latest --build-context 
base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.event_replay ./backend docker build -t integr8scode-dlq-processor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.dlq_processor ./backend - - name: Push backend and workers to GHCR - if: ${{ !github.event.pull_request.head.repo.fork }} - env: - TAG: ${{ steps.tags.outputs.sha-tag }} - IMG: ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }} - run: | - docker tag integr8scode-backend:latest "$IMG/backend:$TAG" - docker tag integr8scode-coordinator:latest "$IMG/coordinator:$TAG" - docker tag integr8scode-k8s-worker:latest "$IMG/k8s-worker:$TAG" - docker tag integr8scode-pod-monitor:latest "$IMG/pod-monitor:$TAG" - docker tag integr8scode-result-processor:latest "$IMG/result-processor:$TAG" - docker tag integr8scode-saga-orchestrator:latest "$IMG/saga-orchestrator:$TAG" - docker tag integr8scode-event-replay:latest "$IMG/event-replay:$TAG" - docker tag integr8scode-dlq-processor:latest "$IMG/dlq-processor:$TAG" - - docker push "$IMG/backend:$TAG" - docker push "$IMG/coordinator:$TAG" - docker push "$IMG/k8s-worker:$TAG" - docker push "$IMG/pod-monitor:$TAG" - docker push "$IMG/result-processor:$TAG" - docker push "$IMG/saga-orchestrator:$TAG" - docker push "$IMG/event-replay:$TAG" - docker push "$IMG/dlq-processor:$TAG" - # ── Utility images (GHA-cached, independent of base) ──────────── - name: Build cert-generator image uses: docker/build-push-action@v6 @@ -221,13 +190,6 @@ jobs: cache-from: type=gha,scope=cert-generator cache-to: type=gha,mode=max,scope=cert-generator - - name: Push cert-generator to GHCR - if: ${{ !github.event.pull_request.head.repo.fork }} - run: | - docker tag integr8scode-cert-generator:latest \ - ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ steps.tags.outputs.sha-tag }} - docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ 
steps.tags.outputs.sha-tag }} - - name: Build zookeeper-certgen image uses: docker/build-push-action@v6 with: @@ -238,13 +200,6 @@ jobs: cache-from: type=gha,scope=zookeeper-certgen cache-to: type=gha,mode=max,scope=zookeeper-certgen - - name: Push zookeeper-certgen to GHCR - if: ${{ !github.event.pull_request.head.repo.fork }} - run: | - docker tag integr8scode-zookeeper-certgen:latest \ - ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }} - docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }} - # ── Frontend (dev for E2E, prod for scanning/deployment) ───────── - name: Build frontend image uses: docker/build-push-action@v6 @@ -256,13 +211,6 @@ jobs: cache-from: type=gha,scope=frontend cache-to: type=gha,mode=max,scope=frontend - - name: Push frontend-dev to GHCR - if: ${{ !github.event.pull_request.head.repo.fork }} - run: | - docker tag integr8scode-frontend:latest \ - ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }} - docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }} - - name: Build frontend-prod image uses: docker/build-push-action@v6 with: @@ -273,12 +221,36 @@ jobs: cache-from: type=gha,scope=frontend-prod cache-to: type=gha,mode=max,scope=frontend-prod - - name: Push frontend-prod to GHCR + # ── Push all images to GHCR in parallel ──────────────────────── + - name: Push all images to GHCR if: ${{ !github.event.pull_request.head.repo.fork }} + env: + TAG: ${{ steps.tags.outputs.sha-tag }} + IMG: ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }} run: | - docker tag integr8scode-frontend-prod:latest \ - ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }} - docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ 
steps.tags.outputs.sha-tag }} + # Tag all images for GHCR + docker tag integr8scode-base:latest "$IMG/base:$TAG" + docker tag integr8scode-backend:latest "$IMG/backend:$TAG" + docker tag integr8scode-coordinator:latest "$IMG/coordinator:$TAG" + docker tag integr8scode-k8s-worker:latest "$IMG/k8s-worker:$TAG" + docker tag integr8scode-pod-monitor:latest "$IMG/pod-monitor:$TAG" + docker tag integr8scode-result-processor:latest "$IMG/result-processor:$TAG" + docker tag integr8scode-saga-orchestrator:latest "$IMG/saga-orchestrator:$TAG" + docker tag integr8scode-event-replay:latest "$IMG/event-replay:$TAG" + docker tag integr8scode-dlq-processor:latest "$IMG/dlq-processor:$TAG" + docker tag integr8scode-cert-generator:latest "$IMG/cert-generator:$TAG" + docker tag integr8scode-zookeeper-certgen:latest "$IMG/zookeeper-certgen:$TAG" + docker tag integr8scode-frontend:latest "$IMG/frontend-dev:$TAG" + docker tag integr8scode-frontend-prod:latest "$IMG/frontend:$TAG" + + # Push all 13 images in parallel + for name in base backend coordinator k8s-worker pod-monitor \ + result-processor saga-orchestrator event-replay \ + dlq-processor cert-generator zookeeper-certgen \ + frontend-dev frontend; do + docker push "$IMG/$name:$TAG" & + done + wait # Parallel E2E test jobs — compose pulls from GHCR using IMAGE_TAG backend-e2e: From 9f164c7d4184f5321b4e6eb752f7dd0a718ba0cf Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 21:55:54 +0100 Subject: [PATCH 10/17] Two changes made: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Parallel GHCR pushes (build-images job): - Merged 6 separate push steps into 1 step that pushes all 13 images in parallel via for ... do docker push & done; wait - Expected: ~81s sequential → ~15-20s parallel (saves ~60s) 2. 
Targeted health checks (both E2E jobs): - Replaced deploy.sh dev --no-build --wait (waits for ALL 15+ containers) with: - docker compose up -d --no-build (returns immediately, ~3s) - curl loop that only waits for backend (backend-e2e) or backend + frontend (frontend-e2e) - Workers start in background and become ready while tests run their initial setup - Expected: "Start stack" drops from ~2:01 to ~5s + "Wait for health" ~40-60s = ~45-65s total (saves ~60s) --- .github/workflows/stack-tests.yml | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index ce43f6c4..5fa93822 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -327,7 +327,13 @@ jobs: - name: Start stack env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} - run: ./deploy.sh dev --no-build --wait + run: docker compose up -d --no-build + + - name: Wait for backend + run: | + echo "Waiting for backend health..." + timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done' + echo "Backend ready" - name: Seed test users run: docker compose exec -T backend uv run python scripts/seed_users.py @@ -489,7 +495,16 @@ jobs: - name: Start stack env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} - run: ./deploy.sh dev --no-build --wait + run: docker compose up -d --no-build + + - name: Wait for backend and frontend + run: | + echo "Waiting for backend health..." + timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done' + echo "Backend ready" + echo "Waiting for frontend health..." 
+ timeout 60 bash -c 'until curl -ksf https://localhost:5001 2>/dev/null; do sleep 2; done' + echo "Frontend ready" - name: Seed test users run: docker compose exec -T backend uv run python scripts/seed_users.py From 89871ae7ccb69a89ae0c439a504704ee707fdb09 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 22:19:16 +0100 Subject: [PATCH 11/17] =?UTF-8?q?=20Root=20cause=20analysis:=20docker=20co?= =?UTF-8?q?mpose=20up=20-d=20--no-build=20(even=20without=20--wait)=20take?= =?UTF-8?q?s=201:23=20because=20depends=5Fon:=20condition:=20service=5Fhea?= =?UTF-8?q?lthy=20in=20docker-compose.yaml=20forces=20compose=20to=20wait?= =?UTF-8?q?=20for=20the=20entire=20dependency=20chain=20before=20=20=20cre?= =?UTF-8?q?ating=20dependent=20containers.=20Removing=20--wait=20only=20sk?= =?UTF-8?q?ipped=20the=20final=20"all=20healthy"=20check=20=E2=80=94=20the?= =?UTF-8?q?=20internal=20chain=20is=20the=20real=20bottleneck.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes made (3 optimizations): 1. Removed docker-cache step (saves ~1:08 blocking time) The docker-cache composite action was loading 5 infra images from GHA cache in ~68s of blocking foreground time. But docker compose pull (pre-pull) already fetches ALL images in background. Removed the redundant step. 2. Merged pre-pull + pre-warm into single sequential background task Instead of: pre-pull (bg) → docker-cache (blocking 1:08) → pre-warm (bg) Now: docker compose pull && docker compose up -d ... infra all in one background process. Infra starts pulling + booting immediately after checkout, overlapping with all subsequent setup steps. 3. Pre-start cert-generator after k3s finalize cert-generator is on the critical path: cert-gen(complete) → backend(healthy) → frontend. Starting it right after kubeconfig exists gives it a ~15-20s head start while we wait for pre-pull to finish. 
--- .github/workflows/stack-tests.yml | 131 ++++++++++++++---------------- 1 file changed, 61 insertions(+), 70 deletions(-) diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index 5fa93822..b07d51a7 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -261,42 +261,31 @@ jobs: steps: - uses: actions/checkout@v6 - # ── Phase 1: Start background tasks + infra ── - - name: Pre-pull GHCR images (background) + # ── Phase 1: Pull images + start infra in background (overlap with k3s) ── + - name: Pull images and pre-warm infra (background) env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} run: | - nohup bash -c "IMAGE_TAG=$IMAGE_TAG docker compose pull --quiet" \ - > /tmp/ghcr-pull.log 2>&1 & - echo $! > /tmp/ghcr-pull.pid + # Pull all images (GHCR + Docker Hub) then start infra services. + # This runs throughout k3s install/finalize (~20s of overlap). + nohup bash -c ' + IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1 + echo "--- pull done, starting infra ---" + docker compose up -d --no-build \ + mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1 + ' > /tmp/infra-pull.log 2>&1 & + echo $! 
> /tmp/infra-pull.pid - name: Use test environment config run: | cp backend/config.test.toml backend/config.toml cp backend/secrets.example.toml backend/secrets.toml - # ── Phase 2: Install k3s, then overlap boot with docker-cache + infra ── + # ── Phase 2: k3s install + finalize (overlaps with image pull + infra boot) ── - name: Install k3s run: | curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh - - - name: Cache and load Docker images - uses: ./.github/actions/docker-cache - with: - images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} - - - name: Pre-warm infrastructure (background) - env: - IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} - run: | - # Start infra while k3s finishes booting (~25s+ since install). - # cert-generator excluded: needs k3s and mounts ~/.kube. - nohup docker compose up -d --no-build \ - mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \ - > /tmp/infra-warm.log 2>&1 & - echo $! 
> /tmp/infra-warm.pid - - # ── Phase 3: Finalize k3s (should be ready — 25s+ since install) ── - name: Finalize k3s run: | mkdir -p /home/runner/.kube @@ -309,20 +298,26 @@ jobs: /home/runner/.kube/config > backend/kubeconfig.yaml chmod 644 backend/kubeconfig.yaml - # ── Phase 4: Start remaining services (infra already healthy from pre-warm) ── - - name: Wait for background tasks + # Start cert-generator now that kubeconfig exists (runs during wait step) + - name: Start cert-generator (background) + env: + IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + run: | + nohup docker compose up -d --no-build cert-generator \ + > /tmp/cert-gen.log 2>&1 & + + # ── Phase 3: Wait for pulls + start stack ── + - name: Wait for image pull and infra run: | - for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do - if [ -f "$pidfile" ]; then - PID=$(cat "$pidfile") - if kill -0 "$PID" 2>/dev/null; then - echo "Waiting for $(basename $pidfile .pid)..." - tail --pid="$PID" -f /dev/null 2>/dev/null || true - fi + if [ -f /tmp/infra-pull.pid ]; then + PID=$(cat /tmp/infra-pull.pid) + if kill -0 "$PID" 2>/dev/null; then + echo "Waiting for image pull + infra startup..." + tail --pid="$PID" -f /dev/null 2>/dev/null || true fi - done - cat /tmp/ghcr-pull.log 2>/dev/null || true - cat /tmp/infra-warm.log 2>/dev/null || true + fi + cat /tmp/infra-pull.log 2>/dev/null || true + cat /tmp/cert-gen.log 2>/dev/null || true - name: Start stack env: @@ -402,32 +397,22 @@ jobs: steps: - uses: actions/checkout@v6 - # ── Phase 1: Start background tasks + infra (runs during all subsequent steps) ── - - name: Pre-pull GHCR images (background) + # ── Phase 1: Pull images + start infra in background (runs during all subsequent steps) ── + - name: Pull images and pre-warm infra (background) env: IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} run: | - nohup bash -c "IMAGE_TAG=$IMAGE_TAG docker compose pull --quiet" \ - > /tmp/ghcr-pull.log 2>&1 & - echo $! 
> /tmp/ghcr-pull.pid - - - name: Cache and load Docker images - uses: ./.github/actions/docker-cache - with: - images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }} - - - name: Pre-warm infrastructure (background) - env: - IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} - run: | - # Start infra while k3s installs + Playwright sets up (~60s of overlap). - # cert-generator excluded: needs k3s and mounts ~/.kube. - nohup docker compose up -d --no-build \ - mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \ - > /tmp/infra-warm.log 2>&1 & - echo $! > /tmp/infra-warm.pid - - # ── Phase 2: k3s install + Node/Playwright setup (overlapped) ── + # Pull all images (GHCR + Docker Hub) then start infra services. + # This runs throughout k3s + Node + Playwright setup (~80s of overlap). + nohup bash -c ' + IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1 + echo "--- pull done, starting infra ---" + docker compose up -d --no-build \ + mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1 + ' > /tmp/infra-pull.log 2>&1 & + echo $! 
> /tmp/infra-pull.pid + + # ── Phase 2: k3s install + Node/Playwright setup (overlapped with pull + infra) ── - name: Install k3s run: | curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh - @@ -472,25 +457,31 @@ jobs: /home/runner/.kube/config > backend/kubeconfig.yaml chmod 644 backend/kubeconfig.yaml + # Start cert-generator now that kubeconfig exists (runs during wait step) + - name: Start cert-generator (background) + env: + IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} + run: | + nohup docker compose up -d --no-build cert-generator \ + > /tmp/cert-gen.log 2>&1 & + - name: Use test environment config run: | cp backend/config.test.toml backend/config.toml cp backend/secrets.example.toml backend/secrets.toml - # ── Phase 4: Start remaining services (infra already healthy from pre-warm) ── - - name: Wait for background tasks + # ── Phase 4: Wait for pulls + start stack ── + - name: Wait for image pull and infra run: | - for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do - if [ -f "$pidfile" ]; then - PID=$(cat "$pidfile") - if kill -0 "$PID" 2>/dev/null; then - echo "Waiting for $(basename $pidfile .pid)..." - tail --pid="$PID" -f /dev/null 2>/dev/null || true - fi + if [ -f /tmp/infra-pull.pid ]; then + PID=$(cat /tmp/infra-pull.pid) + if kill -0 "$PID" 2>/dev/null; then + echo "Waiting for image pull + infra startup..." 
+ tail --pid="$PID" -f /dev/null 2>/dev/null || true fi - done - cat /tmp/ghcr-pull.log 2>/dev/null || true - cat /tmp/infra-warm.log 2>/dev/null || true + fi + cat /tmp/infra-pull.log 2>/dev/null || true + cat /tmp/cert-gen.log 2>/dev/null || true - name: Start stack env: From 33e8c33280eda1d716c7526c2cbaf68e4add91ec Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 22:44:21 +0100 Subject: [PATCH 12/17] =?UTF-8?q?=20What=20changed:=20frontend.depends=5Fo?= =?UTF-8?q?n.backend:=20service=5Fhealthy=20=E2=86=92=20service=5Fstarted?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: Compose no longer waits for backend to pass its health check (~35s) before creating the frontend container. Backend and frontend now boot in parallel during docker compose up -d. For frontend-e2e: "Start stack" should drop from 1:20 to 45-50s (no backend health wait in compose), and "Wait for backend+frontend" picks up the slack but runs in parallel (45s). Net: 2:03 → ~1:30, saving ~33s → job drops to ~5:00. For backend-e2e: Smaller impact since backend tests don't need frontend. "Start stack" drops slightly (~10s) since compose returns earlier. Job should be ~5:30. 
At this point we're approaching the hard floor: - Backend E2E: 3:00 tests + 100s minimum setup = ~4:40 floor, currently ~5:30 (50s over) - Frontend E2E: 2:11 tests + 80s minimum setup = ~3:31 floor, currently ~5:00 (89s over, mostly from the depends_on chain which is inherent to docker-compose) --- docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 9f9fdd27..80c215c8 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -138,7 +138,7 @@ services: cert-generator: condition: service_completed_successfully backend: - condition: service_healthy + condition: service_started volumes: - ./frontend:/app - /app/node_modules From c9ac036d5aa82fbf70e0f88bd55e9e0312d4281f Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 23:12:32 +0100 Subject: [PATCH 13/17] fixes --- .github/actions/k3s-setup/action.yml | 6 +++++- .github/workflows/stack-tests.yml | 29 ++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/.github/actions/k3s-setup/action.yml b/.github/actions/k3s-setup/action.yml index d21c4a43..523ca0ad 100644 --- a/.github/actions/k3s-setup/action.yml +++ b/.github/actions/k3s-setup/action.yml @@ -24,7 +24,11 @@ runs: run: | # --bind-address 0.0.0.0: Listen on all interfaces so Docker containers can reach it # --tls-san host.docker.internal: Include in cert SANs for Docker container access - curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh - + K3S_VERSION="${K3S_VERSION:-v1.32.11+k3s1}" + K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g') + curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh + chmod +x /tmp/k3s-install.sh + INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh mkdir -p /home/runner/.kube sudo k3s kubectl 
config view --raw > /home/runner/.kube/config sudo chmod 600 /home/runner/.kube/config diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index b07d51a7..27f48d98 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -31,6 +31,7 @@ env: KAFKA_IMAGE: confluentinc/cp-kafka:7.8.2 ZOOKEEPER_IMAGE: confluentinc/cp-zookeeper:7.8.2 SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.8.2 + K3S_VERSION: v1.32.11+k3s1 jobs: # Fast unit tests (no infrastructure needed) @@ -137,8 +138,8 @@ jobs: run: | PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode" SHA_TAG="sha-${GITHUB_SHA::7}" - echo "sha-tag=$SHA_TAG" >> $GITHUB_OUTPUT - echo "image-prefix=$PREFIX" >> $GITHUB_OUTPUT + echo "sha-tag=$SHA_TAG" >> "$GITHUB_OUTPUT" + echo "image-prefix=$PREFIX" >> "$GITHUB_OUTPUT" # ── Base image (cached separately — rarely changes) ────────────── - name: Cache base image @@ -243,14 +244,24 @@ jobs: docker tag integr8scode-frontend:latest "$IMG/frontend-dev:$TAG" docker tag integr8scode-frontend-prod:latest "$IMG/frontend:$TAG" - # Push all 13 images in parallel + # Push all 13 images in parallel, tracking each PID + declare -A PIDS for name in base backend coordinator k8s-worker pod-monitor \ result-processor saga-orchestrator event-replay \ dlq-processor cert-generator zookeeper-certgen \ frontend-dev frontend; do docker push "$IMG/$name:$TAG" & + PIDS[$name]=$! done - wait + + FAILED=0 + for name in "${!PIDS[@]}"; do + if ! 
wait "${PIDS[$name]}"; then + echo "::error::Failed to push $name" + FAILED=1 + fi + done + [ "$FAILED" -eq 0 ] || exit 1 # Parallel E2E test jobs — compose pulls from GHCR using IMAGE_TAG backend-e2e: @@ -284,7 +295,10 @@ jobs: # ── Phase 2: k3s install + finalize (overlaps with image pull + infra boot) ── - name: Install k3s run: | - curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh - + K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g') + curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh + chmod +x /tmp/k3s-install.sh + INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh - name: Finalize k3s run: | @@ -415,7 +429,10 @@ jobs: # ── Phase 2: k3s install + Node/Playwright setup (overlapped with pull + infra) ── - name: Install k3s run: | - curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh - + K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g') + curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh + chmod +x /tmp/k3s-install.sh + INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh - name: Setup Node.js uses: actions/setup-node@v6 From 68c18b8ea40c0536b6e18ef241f3c46707de512c Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 23:41:14 +0100 Subject: [PATCH 14/17] Created 2 composite actions, deleted 2 unused ones: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit | Action | Purpose | |------------------------|-----------------------------------------------------------------------------| | e2e-boot (new) | GHCR login + pull/prewarm infra (bg) + k3s install | | e2e-ready (new) | Finalize k3s 
+ cert-gen + config + wait + start stack + health check + seed | | k3s-setup (deleted) | Was inlined previously, never referenced | | docker-cache (deleted) | Replaced by docker compose pull, never referenced | Step count reduction: - backend-e2e: 20 steps → 8 steps (checkout + 2 actions + test + coverage + logs) - frontend-e2e: 20 steps → 13 steps (checkout + e2e-boot + 5 Node/Playwright + e2e-ready + test + report + logs) Performance preserved: The split point between e2e-boot and e2e-ready is exactly where frontend-e2e interposes Node/Playwright setup, so k3s still boots in the background during that work. --- .github/actions/docker-cache/action.yml | 64 -------- .github/actions/e2e-boot/action.yml | 41 +++++ .github/actions/e2e-ready/action.yml | 78 ++++++++++ .github/actions/k3s-setup/action.yml | 61 -------- .github/workflows/stack-tests.yml | 199 ++++-------------------- 5 files changed, 152 insertions(+), 291 deletions(-) delete mode 100644 .github/actions/docker-cache/action.yml create mode 100644 .github/actions/e2e-boot/action.yml create mode 100644 .github/actions/e2e-ready/action.yml delete mode 100644 .github/actions/k3s-setup/action.yml diff --git a/.github/actions/docker-cache/action.yml b/.github/actions/docker-cache/action.yml deleted file mode 100644 index 253885e2..00000000 --- a/.github/actions/docker-cache/action.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: 'Docker Image Cache' -description: 'Cache and load Docker images for CI jobs' - -inputs: - images: - description: 'Space-separated list of Docker images to cache' - required: true - -runs: - using: 'composite' - steps: - - name: Generate cache key from images - id: cache-key - shell: bash - env: - IMAGES_INPUT: ${{ inputs.images }} - run: | - # Create a stable hash from the sorted image list - # Using env var to prevent script injection - IMAGES_HASH=$(echo "$IMAGES_INPUT" | tr ' ' '\n' | sort | md5sum | cut -d' ' -f1) - echo "key=docker-${{ runner.os }}-${IMAGES_HASH}" >> $GITHUB_OUTPUT - - - 
name: Cache Docker images - uses: actions/cache@v5 - id: docker-cache - with: - path: /tmp/docker-cache - key: ${{ steps.cache-key.outputs.key }} - - - name: Load cached Docker images - if: steps.docker-cache.outputs.cache-hit == 'true' - shell: bash - run: | - echo "Loading cached images..." - for f in /tmp/docker-cache/*.tar.zst; do - zstd -d -c "$f" | docker load & - done - wait - docker images - - - name: Pull and save Docker images - if: steps.docker-cache.outputs.cache-hit != 'true' - shell: bash - env: - IMAGES_INPUT: ${{ inputs.images }} - run: | - mkdir -p /tmp/docker-cache - - echo "Pulling images in parallel..." - for img in $IMAGES_INPUT; do - docker pull "$img" & - done - wait - - echo "Saving images with zstd compression..." - for img in $IMAGES_INPUT; do - # Create filename from image name (replace special chars) - filename=$(echo "$img" | tr '/:' '_') - docker save "$img" | zstd -T0 -3 > "/tmp/docker-cache/${filename}.tar.zst" & - done - wait - - echo "Cache size:" - du -sh /tmp/docker-cache/ diff --git a/.github/actions/e2e-boot/action.yml b/.github/actions/e2e-boot/action.yml new file mode 100644 index 00000000..be4d7bdf --- /dev/null +++ b/.github/actions/e2e-boot/action.yml @@ -0,0 +1,41 @@ +name: 'E2E Boot' +description: 'Kick off slow background tasks: GHCR auth, image pull + infra pre-warm, k3s install' + +inputs: + image-tag: + description: 'GHCR image tag (e.g., sha-abc1234)' + required: true + github-token: + description: 'GitHub token for GHCR authentication' + required: true + +runs: + using: 'composite' + steps: + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ inputs.github-token }} + + - name: Pull images and pre-warm infra (background) + shell: bash + env: + IMAGE_TAG: ${{ inputs.image-tag }} + run: | + nohup bash -c ' + IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1 + echo "--- pull done, starting infra ---" + docker compose up 
-d --no-build \ + mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1 + ' > /tmp/infra-pull.log 2>&1 & + echo $! > /tmp/infra-pull.pid + + - name: Install k3s + shell: bash + run: | + K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g') + curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh + chmod +x /tmp/k3s-install.sh + INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh diff --git a/.github/actions/e2e-ready/action.yml b/.github/actions/e2e-ready/action.yml new file mode 100644 index 00000000..c2b90d39 --- /dev/null +++ b/.github/actions/e2e-ready/action.yml @@ -0,0 +1,78 @@ +name: 'E2E Ready' +description: 'Finalize k3s, wait for infra, start compose stack, health-check, seed test users' + +inputs: + image-tag: + description: 'GHCR image tag (e.g., sha-abc1234)' + required: true + wait-for-frontend: + description: 'Also wait for frontend health check (default: false)' + required: false + default: 'false' + +runs: + using: 'composite' + steps: + - name: Finalize k3s + shell: bash + run: | + mkdir -p /home/runner/.kube + sudo k3s kubectl config view --raw > /home/runner/.kube/config + sudo chmod 600 /home/runner/.kube/config + export KUBECONFIG=/home/runner/.kube/config + timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done' + kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f - + sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \ + /home/runner/.kube/config > backend/kubeconfig.yaml + chmod 644 backend/kubeconfig.yaml + + - name: Start cert-generator (background) + shell: bash + env: + IMAGE_TAG: ${{ inputs.image-tag }} + run: | + nohup docker compose up -d --no-build cert-generator \ + > /tmp/cert-gen.log 2>&1 & + + - name: Use test environment config + shell: bash + run: | + cp backend/config.test.toml 
backend/config.toml + cp backend/secrets.example.toml backend/secrets.toml + + - name: Wait for image pull and infra + shell: bash + run: | + if [ -f /tmp/infra-pull.pid ]; then + PID=$(cat /tmp/infra-pull.pid) + if kill -0 "$PID" 2>/dev/null; then + echo "Waiting for image pull + infra startup..." + tail --pid="$PID" -f /dev/null 2>/dev/null || true + fi + fi + cat /tmp/infra-pull.log 2>/dev/null || true + cat /tmp/cert-gen.log 2>/dev/null || true + + - name: Start stack + shell: bash + env: + IMAGE_TAG: ${{ inputs.image-tag }} + run: docker compose up -d --no-build + + - name: Wait for services + shell: bash + env: + WAIT_FOR_FRONTEND: ${{ inputs.wait-for-frontend }} + run: | + echo "Waiting for backend health..." + timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done' + echo "Backend ready" + if [ "$WAIT_FOR_FRONTEND" = "true" ]; then + echo "Waiting for frontend health..." + timeout 60 bash -c 'until curl -ksf https://localhost:5001 2>/dev/null; do sleep 2; done' + echo "Frontend ready" + fi + + - name: Seed test users + shell: bash + run: docker compose exec -T backend uv run python scripts/seed_users.py diff --git a/.github/actions/k3s-setup/action.yml b/.github/actions/k3s-setup/action.yml deleted file mode 100644 index 523ca0ad..00000000 --- a/.github/actions/k3s-setup/action.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: 'K3s Setup' -description: 'Install k3s and create kubeconfig for Docker containers' - -inputs: - namespace: - description: 'Kubernetes namespace to create' - required: false - default: 'integr8scode' - kubeconfig-path: - description: 'Path to write the Docker-accessible kubeconfig' - required: false - default: 'backend/kubeconfig.yaml' - -outputs: - kubeconfig: - description: 'Path to the kubeconfig file for Docker containers' - value: ${{ inputs.kubeconfig-path }} - -runs: - using: 'composite' - steps: - - name: Install k3s - shell: bash - run: | - # --bind-address 0.0.0.0: Listen on all 
interfaces so Docker containers can reach it - # --tls-san host.docker.internal: Include in cert SANs for Docker container access - K3S_VERSION="${K3S_VERSION:-v1.32.11+k3s1}" - K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g') - curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh - chmod +x /tmp/k3s-install.sh - INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh - mkdir -p /home/runner/.kube - sudo k3s kubectl config view --raw > /home/runner/.kube/config - sudo chmod 600 /home/runner/.kube/config - - - name: Wait for k3s to be ready - shell: bash - run: | - export KUBECONFIG=/home/runner/.kube/config - timeout 90 bash -c 'until kubectl cluster-info; do sleep 5; done' - - - name: Create namespace - shell: bash - env: - NAMESPACE: ${{ inputs.namespace }} - run: | - export KUBECONFIG=/home/runner/.kube/config - kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f - - - - name: Create kubeconfig for Docker containers - shell: bash - env: - KUBECONFIG_PATH: ${{ inputs.kubeconfig-path }} - run: | - # Replace localhost/0.0.0.0 with host.docker.internal for container access - # (k3s may use 0.0.0.0 when started with --bind-address 0.0.0.0) - sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \ - /home/runner/.kube/config > "$KUBECONFIG_PATH" - chmod 644 "$KUBECONFIG_PATH" - echo "Kubeconfig written to $KUBECONFIG_PATH" - echo "Server URL: $(grep server "$KUBECONFIG_PATH" | head -1)" diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index 27f48d98..309433d6 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -269,83 +269,20 @@ jobs: needs: [build-images] if: ${{ !github.event.pull_request.head.repo.fork }} runs-on: ubuntu-latest + permissions: + contents: read + packages: read steps: - uses: 
actions/checkout@v6 - # ── Phase 1: Pull images + start infra in background (overlap with k3s) ── - - name: Pull images and pre-warm infra (background) - env: - IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} - run: | - # Pull all images (GHCR + Docker Hub) then start infra services. - # This runs throughout k3s install/finalize (~20s of overlap). - nohup bash -c ' - IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1 - echo "--- pull done, starting infra ---" - docker compose up -d --no-build \ - mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1 - ' > /tmp/infra-pull.log 2>&1 & - echo $! > /tmp/infra-pull.pid - - - name: Use test environment config - run: | - cp backend/config.test.toml backend/config.toml - cp backend/secrets.example.toml backend/secrets.toml - - # ── Phase 2: k3s install + finalize (overlaps with image pull + infra boot) ── - - name: Install k3s - run: | - K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g') - curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh - chmod +x /tmp/k3s-install.sh - INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh - - - name: Finalize k3s - run: | - mkdir -p /home/runner/.kube - sudo k3s kubectl config view --raw > /home/runner/.kube/config - sudo chmod 600 /home/runner/.kube/config - export KUBECONFIG=/home/runner/.kube/config - timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done' - kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f - - sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \ - /home/runner/.kube/config > backend/kubeconfig.yaml - chmod 644 backend/kubeconfig.yaml - - # Start cert-generator now that kubeconfig exists (runs during wait step) - - name: Start cert-generator (background) - env: - IMAGE_TAG: ${{ 
needs.build-images.outputs.sha-tag }} - run: | - nohup docker compose up -d --no-build cert-generator \ - > /tmp/cert-gen.log 2>&1 & - - # ── Phase 3: Wait for pulls + start stack ── - - name: Wait for image pull and infra - run: | - if [ -f /tmp/infra-pull.pid ]; then - PID=$(cat /tmp/infra-pull.pid) - if kill -0 "$PID" 2>/dev/null; then - echo "Waiting for image pull + infra startup..." - tail --pid="$PID" -f /dev/null 2>/dev/null || true - fi - fi - cat /tmp/infra-pull.log 2>/dev/null || true - cat /tmp/cert-gen.log 2>/dev/null || true - - - name: Start stack - env: - IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} - run: docker compose up -d --no-build - - - name: Wait for backend - run: | - echo "Waiting for backend health..." - timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done' - echo "Backend ready" + - uses: ./.github/actions/e2e-boot + with: + image-tag: ${{ needs.build-images.outputs.sha-tag }} + github-token: ${{ secrets.GITHUB_TOKEN }} - - name: Seed test users - run: docker compose exec -T backend uv run python scripts/seed_users.py + - uses: ./.github/actions/e2e-ready + with: + image-tag: ${{ needs.build-images.outputs.sha-tag }} - name: Run E2E tests timeout-minutes: 15 @@ -376,19 +313,11 @@ jobs: run: | mkdir -p logs docker compose logs --timestamps > logs/docker-compose.log 2>&1 - docker compose logs --timestamps backend > logs/backend.log 2>&1 - docker compose logs --timestamps mongo > logs/mongo.log 2>&1 || true - docker compose logs --timestamps redis > logs/redis.log 2>&1 || true - docker compose logs --timestamps kafka > logs/kafka.log 2>&1 || true - docker compose logs --timestamps zookeeper > logs/zookeeper.log 2>&1 || true - docker compose logs --timestamps schema-registry > logs/schema-registry.log 2>&1 || true - docker compose logs --timestamps coordinator > logs/coordinator.log 2>&1 || true - docker compose logs --timestamps k8s-worker > logs/k8s-worker.log 2>&1 || true - docker 
compose logs --timestamps pod-monitor > logs/pod-monitor.log 2>&1 || true - docker compose logs --timestamps result-processor > logs/result-processor.log 2>&1 || true - docker compose logs --timestamps saga-orchestrator > logs/saga-orchestrator.log 2>&1 || true - docker compose logs --timestamps event-replay > logs/event-replay.log 2>&1 || true - docker compose logs --timestamps dlq-processor > logs/dlq-processor.log 2>&1 || true + for svc in backend mongo redis kafka zookeeper schema-registry \ + coordinator k8s-worker pod-monitor result-processor \ + saga-orchestrator event-replay dlq-processor; do + docker compose logs --timestamps "$svc" > "logs/$svc.log" 2>&1 || true + done kubectl get events --sort-by='.metadata.creationTimestamp' -A > logs/k8s-events.log 2>&1 || true - name: Upload logs @@ -403,6 +332,9 @@ jobs: needs: [build-images] if: ${{ !github.event.pull_request.head.repo.fork }} runs-on: ubuntu-latest + permissions: + contents: read + packages: read strategy: fail-fast: false matrix: @@ -411,29 +343,13 @@ jobs: steps: - uses: actions/checkout@v6 - # ── Phase 1: Pull images + start infra in background (runs during all subsequent steps) ── - - name: Pull images and pre-warm infra (background) - env: - IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} - run: | - # Pull all images (GHCR + Docker Hub) then start infra services. - # This runs throughout k3s + Node + Playwright setup (~80s of overlap). - nohup bash -c ' - IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1 - echo "--- pull done, starting infra ---" - docker compose up -d --no-build \ - mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1 - ' > /tmp/infra-pull.log 2>&1 & - echo $! 
> /tmp/infra-pull.pid - - # ── Phase 2: k3s install + Node/Playwright setup (overlapped with pull + infra) ── - - name: Install k3s - run: | - K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g') - curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh - chmod +x /tmp/k3s-install.sh - INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh + # Phase 1: kick off image pull + infra + k3s in background + - uses: ./.github/actions/e2e-boot + with: + image-tag: ${{ needs.build-images.outputs.sha-tag }} + github-token: ${{ secrets.GITHUB_TOKEN }} + # Phase 2: Node + Playwright setup (overlaps with k3s boot + image pull) - name: Setup Node.js uses: actions/setup-node@v6 with: @@ -461,61 +377,11 @@ jobs: working-directory: frontend run: npx playwright install chromium - # ── Phase 3: Finalize k3s (should be ready — 50s+ since install) ── - - name: Finalize k3s - run: | - mkdir -p /home/runner/.kube - sudo k3s kubectl config view --raw > /home/runner/.kube/config - sudo chmod 600 /home/runner/.kube/config - export KUBECONFIG=/home/runner/.kube/config - timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done' - kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f - - sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \ - /home/runner/.kube/config > backend/kubeconfig.yaml - chmod 644 backend/kubeconfig.yaml - - # Start cert-generator now that kubeconfig exists (runs during wait step) - - name: Start cert-generator (background) - env: - IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} - run: | - nohup docker compose up -d --no-build cert-generator \ - > /tmp/cert-gen.log 2>&1 & - - - name: Use test environment config - run: | - cp backend/config.test.toml backend/config.toml - cp backend/secrets.example.toml backend/secrets.toml - - # ── Phase 4: Wait for 
pulls + start stack ── - - name: Wait for image pull and infra - run: | - if [ -f /tmp/infra-pull.pid ]; then - PID=$(cat /tmp/infra-pull.pid) - if kill -0 "$PID" 2>/dev/null; then - echo "Waiting for image pull + infra startup..." - tail --pid="$PID" -f /dev/null 2>/dev/null || true - fi - fi - cat /tmp/infra-pull.log 2>/dev/null || true - cat /tmp/cert-gen.log 2>/dev/null || true - - - name: Start stack - env: - IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }} - run: docker compose up -d --no-build - - - name: Wait for backend and frontend - run: | - echo "Waiting for backend health..." - timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done' - echo "Backend ready" - echo "Waiting for frontend health..." - timeout 60 bash -c 'until curl -ksf https://localhost:5001 2>/dev/null; do sleep 2; done' - echo "Frontend ready" - - - name: Seed test users - run: docker compose exec -T backend uv run python scripts/seed_users.py + # Phase 3: finalize k3s + start stack (k3s has been booting since e2e-boot) + - uses: ./.github/actions/e2e-ready + with: + image-tag: ${{ needs.build-images.outputs.sha-tag }} + wait-for-frontend: 'true' - name: Run Playwright tests timeout-minutes: 10 @@ -533,9 +399,10 @@ jobs: if: failure() run: | mkdir -p logs - docker compose logs > logs/docker-compose.log 2>&1 - docker compose logs backend > logs/backend.log 2>&1 - docker compose logs frontend > logs/frontend.log 2>&1 + docker compose logs --timestamps > logs/docker-compose.log 2>&1 + for svc in backend frontend; do + docker compose logs --timestamps "$svc" > "logs/$svc.log" 2>&1 || true + done - name: Upload logs if: failure() From 5ecb455737809e3d8be8ba18f2e139b470d78d39 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sat, 31 Jan 2026 23:56:43 +0100 Subject: [PATCH 15/17] fixes --- .github/actions/e2e-boot/action.yml | 2 ++ .github/actions/e2e-ready/action.yml | 7 +++++++ .github/workflows/stack-tests.yml | 1 + 3 files changed, 10 
insertions(+) diff --git a/.github/actions/e2e-boot/action.yml b/.github/actions/e2e-boot/action.yml index be4d7bdf..01850ea1 100644 --- a/.github/actions/e2e-boot/action.yml +++ b/.github/actions/e2e-boot/action.yml @@ -29,6 +29,7 @@ runs: echo "--- pull done, starting infra ---" docker compose up -d --no-build \ mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1 + echo $? > /tmp/infra-pull.exit ' > /tmp/infra-pull.log 2>&1 & echo $! > /tmp/infra-pull.pid @@ -37,5 +38,6 @@ runs: run: | K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g') curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh + echo "$K3S_INSTALL_SHA256 /tmp/k3s-install.sh" | sha256sum -c - chmod +x /tmp/k3s-install.sh INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh diff --git a/.github/actions/e2e-ready/action.yml b/.github/actions/e2e-ready/action.yml index c2b90d39..fb794382 100644 --- a/.github/actions/e2e-ready/action.yml +++ b/.github/actions/e2e-ready/action.yml @@ -52,6 +52,13 @@ runs: fi cat /tmp/infra-pull.log 2>/dev/null || true cat /tmp/cert-gen.log 2>/dev/null || true + if [ -f /tmp/infra-pull.exit ]; then + EXIT_CODE=$(cat /tmp/infra-pull.exit) + if [ "$EXIT_CODE" != "0" ]; then + echo "::error::Background image pull / infra pre-warm failed (exit $EXIT_CODE)" + exit 1 + fi + fi - name: Start stack shell: bash diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml index 309433d6..373565ab 100644 --- a/.github/workflows/stack-tests.yml +++ b/.github/workflows/stack-tests.yml @@ -32,6 +32,7 @@ env: ZOOKEEPER_IMAGE: confluentinc/cp-zookeeper:7.8.2 SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.8.2 K3S_VERSION: v1.32.11+k3s1 + K3S_INSTALL_SHA256: d75e014f2d2ab5d30a318efa5c326f3b0b7596f194afcff90fa7a7a91166d5f7 jobs: # Fast unit tests (no infrastructure needed) From 
b62ebbd65a877397adf65ca4e69f9d0dafe9e189 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sun, 1 Feb 2026 00:47:02 +0100 Subject: [PATCH 16/17] updated docs + branch = main for all CI workflows (removed dev) --- .github/workflows/frontend-ci.yml | 4 +- .github/workflows/mypy.yml | 4 +- .github/workflows/ruff.yml | 4 +- .github/workflows/security.yml | 4 +- docs/operations/cicd.md | 499 +++++++++++++++++++----------- 5 files changed, 324 insertions(+), 191 deletions(-) diff --git a/.github/workflows/frontend-ci.yml b/.github/workflows/frontend-ci.yml index fe29a033..e6303aa1 100644 --- a/.github/workflows/frontend-ci.yml +++ b/.github/workflows/frontend-ci.yml @@ -2,12 +2,12 @@ name: Frontend CI on: push: - branches: [main, dev] + branches: [main] paths: - 'frontend/**' - '.github/workflows/frontend-ci.yml' pull_request: - branches: [main, dev] + branches: [main] paths: - 'frontend/**' - '.github/workflows/frontend-ci.yml' diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index d4752b08..34070e65 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -2,9 +2,9 @@ name: MyPy Type Checking on: push: - branches: [ main, dev ] + branches: [ main ] pull_request: - branches: [ main, dev ] + branches: [ main ] workflow_dispatch: jobs: diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 3ddec835..c670ce34 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -2,9 +2,9 @@ name: Ruff Linting on: push: - branches: [ main, dev ] + branches: [ main ] pull_request: - branches: [ main, dev ] + branches: [ main ] workflow_dispatch: jobs: diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 4452c432..10837590 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -2,9 +2,9 @@ name: Security Scanning on: push: - branches: [ main, dev ] + branches: [ main ] pull_request: - branches: [ main, dev ] + branches: [ main ] workflow_dispatch: jobs: 
diff --git a/docs/operations/cicd.md b/docs/operations/cicd.md index 54ff0130..b19f45f9 100644 --- a/docs/operations/cicd.md +++ b/docs/operations/cicd.md @@ -1,41 +1,40 @@ # CI/CD Pipeline -The project uses GitHub Actions to automate code quality checks, security scanning, testing, and documentation -deployment. Every push to `main` or `dev` and every pull request triggers the pipeline, with workflows running in -parallel to provide fast feedback. +The project uses GitHub Actions to automate code quality checks, security scanning, testing, image publishing, and +documentation deployment. The pipeline is split across several workflow files that trigger independently based on path +filters, so only relevant checks run for each change. ## Pipeline overview ```mermaid graph LR - subgraph "Code Quality" + subgraph "Code Quality (lightweight)" Ruff["Ruff Linting"] MyPy["MyPy Type Check"] - ESLint["ESLint + TypeScript"] + ESLint["ESLint + Svelte Check"] end subgraph "Security" Bandit["Bandit SAST"] + SBOM["SBOM & Grype"] end - subgraph "Docker Build & Scan" - Base["Build Base"] - Backend["Build Backend"] - Frontend["Build Frontend"] - ScanBE["Scan Backend"] - ScanFE["Scan Frontend"] - Base --> Backend - Base --> Frontend - Backend --> ScanBE - Frontend --> ScanFE - end - - subgraph "Testing (stack-tests.yml)" + subgraph "Stack Tests" UnitBE["Backend Unit"] UnitFE["Frontend Unit"] - Stack["Stack Tests"] - UnitBE --> Stack - UnitFE --> Stack + Build["Build & Push Images"] + E2E_BE["Backend E2E"] + E2E_FE["Frontend E2E"] + UnitBE --> Build + UnitFE --> Build + Build --> E2E_BE + Build --> E2E_FE + end + + subgraph "Docker Scan & Promote" + Scan["Trivy Scan (12 images)"] + Promote["Promote SHA → latest"] + Scan --> Promote end subgraph "Documentation" @@ -43,143 +42,289 @@ graph LR Pages["GitHub Pages"] end - Push["Push / PR"] --> Ruff - Push --> MyPy - Push --> ESLint - Push --> Bandit - Push --> Base - Push --> UnitBE - Push --> UnitFE - Push --> Docs + Push["Push / PR"] 
--> Ruff & MyPy & ESLint & Bandit & SBOM & UnitBE & UnitFE & Docs + Build -->|main, all tests pass| Scan Docs -->|main only| Pages ``` -All workflows trigger on pushes to `main` and `dev` branches, pull requests against those branches, and can be triggered -manually via `workflow_dispatch`. Path filters ensure workflows only run when relevant files change. - -## Linting and type checking - -Three lightweight workflows run first since they catch obvious issues quickly. - -**Backend (Python):** -- [Ruff](https://docs.astral.sh/ruff/) checks for style violations, import ordering, and common bugs -- [mypy](https://mypy.readthedocs.io/) with strict settings catches type mismatches and missing return types - -**Frontend (TypeScript):** -- ESLint checks for code quality issues -- TypeScript compiler (`tsc --noEmit`) verifies type correctness - -Both use dependency caching to skip reinstallation when lockfiles haven't changed. - -## Security scanning - -The security workflow uses [Bandit](https://bandit.readthedocs.io/) to perform static analysis on Python source files, -flagging issues like hardcoded credentials, SQL injection patterns, and unsafe deserialization. It excludes the test -directory and reports only medium-severity and above findings. Container-level vulnerability scanning with Trivy runs -as part of the Docker workflow. +The two heavyweight workflows are **Stack Tests** (builds images, runs all tests) and **Docker Scan & Promote** +(scans images with Trivy and promotes to `latest`). They're connected: Docker Scan & Promote triggers automatically +after Stack Tests succeeds on `main`, forming a build-test-scan-promote pipeline where the `latest` tag only moves +forward when everything passes. -## Docker build and scan - -The Docker workflow is structured as multiple jobs with dependencies, enabling parallel execution and early failure -detection. If any job fails, dependent jobs are skipped immediately. 
- -```mermaid -graph TD - A[build-base] --> B[build-backend] - A --> C[build-frontend] - B --> D[scan-backend] - C --> E[scan-frontend] - D --> F[summary] - E --> F - - style A fill:#e1f5fe - style B fill:#fff3e0 - style C fill:#fff3e0 - style D fill:#ffebee - style E fill:#ffebee - style F fill:#e8f5e9 -``` +## Workflow files -| Job | Depends On | Purpose | -|------------------|------------------|------------------------------------------------------| -| `build-base` | - | Build shared base image with Python and dependencies | -| `build-backend` | `build-base` | Build backend image using base as build context | -| `build-frontend` | `build-base` | Build frontend image (runs parallel with backend) | -| `scan-backend` | `build-backend` | Trivy vulnerability scan on backend image | -| `scan-frontend` | `build-frontend` | Trivy vulnerability scan on frontend image | -| `summary` | All scans | Generate summary (main branch only) | +| Workflow | File | Trigger | Purpose | +|-------------------------|----------------------------------------------|-----------------------------------------------|--------------------------------------------| +| Stack Tests | `.github/workflows/stack-tests.yml` | Push/PR to `main`, tags `v*` | Unit tests, image build, E2E tests | +| Docker Scan & Promote | `.github/workflows/docker.yml` | After Stack Tests completes on `main` | Trivy scan + promote SHA tag to `latest` | +| SBOM & Supply Chain | `.github/workflows/sbom-compliance.yml` | Push/PR to `main`, weekly schedule | SPDX SBOM generation + Grype vulnerability scan | +| Ruff Linting | `.github/workflows/ruff.yml` | Push/PR to `main` | Python code style and import checks | +| MyPy Type Checking | `.github/workflows/mypy.yml` | Push/PR to `main` | Python static type analysis | +| Frontend CI | `.github/workflows/frontend-ci.yml` | Push/PR to `main` (frontend changes) | ESLint + Svelte type check | +| Security Scanning | `.github/workflows/security.yml` | Push/PR to `main` | Bandit SAST | 
+| Documentation | `.github/workflows/docs.yml` | Push/PR (`docs/`, `mkdocs.yml`) | MkDocs build and GitHub Pages deploy | -### Base image +## Composite actions -The base image (`Dockerfile.base`) contains Python, system dependencies, and all pip packages. It -uses [uv](https://docs.astral.sh/uv/) to install dependencies from the lockfile with `uv sync --locked --no-dev`, -ensuring reproducible builds without development tools. +Shared steps are extracted into reusable composite actions under `.github/actions/`. This eliminates duplication between +the backend and frontend E2E jobs, which both need k3s and the full docker compose stack but set it up differently. -### Security scanning +| Action | File | Purpose | +|-------------------------|----------------------------------------------|--------------------------------------------| +| E2E Boot | `.github/actions/e2e-boot/action.yml` | GHCR login, background image pull + infra pre-warm, k3s install | +| E2E Ready | `.github/actions/e2e-ready/action.yml` | Finalize k3s, start compose stack, health checks, seed users | -After each image builds, [Trivy](https://trivy.dev/) scans it for known vulnerabilities in OS packages and Python -dependencies. The scan fails if it finds any critical or high severity issues with available fixes. +The split is intentional. Frontend E2E needs to install Node.js and Playwright browsers _between_ boot and ready, +overlapping that work with k3s startup and the background image pull to save wall-clock time. Backend E2E calls them back-to-back since it has +no setup to overlap. -## Stack tests (unified testing) +## Stack Tests (the main workflow) -The `stack-tests.yml` workflow consolidates all testing that requires infrastructure into a single job, avoiding -redundant stack setup across multiple jobs. +This is the core testing workflow. It builds all 13 container images, pushes them to GHCR with immutable SHA-based +tags, then runs E2E tests on separate runners that pull images from the registry.
```mermaid graph TD - subgraph "Parallel (fast)" - A[Backend Unit Tests] - B[Frontend Unit Tests] - end - - subgraph "Build" - C[Build Images] + subgraph "Phase 1: Fast feedback" + A["Backend Unit Tests"] + B["Frontend Unit Tests"] end - subgraph "Backend E2E (own runner)" - D1[Setup k3s + Stack] - E[Backend E2E Tests] - D1 --> E + subgraph "Phase 2: Build" + C["Build & Push 13 Images to GHCR"] end - subgraph "Frontend E2E (own runner)" - D2[Setup k3s + Stack] - F[Frontend E2E Tests] - D2 --> F + subgraph "Phase 3: E2E (parallel runners)" + D["Backend E2E
<br/>(k3s + full stack)"] + E["Frontend E2E Shard 1/2<br/>(k3s + Playwright)"] + F["Frontend E2E Shard 2/2<br/>
(k3s + Playwright)"] end A --> C B --> C - C --> D1 - C --> D2 + C --> D & E & F style A fill:#e8f5e9 style B fill:#e8f5e9 style C fill:#e1f5fe - style D1 fill:#e1f5fe - style D2 fill:#e1f5fe + style D fill:#fff3e0 style E fill:#fff3e0 style F fill:#fff3e0 ``` -### Test execution order +### Phase 1: Unit tests + +Backend and frontend unit tests run in parallel. They need no infrastructure and complete quickly. If either fails, +the image build is skipped entirely. + +### Phase 2: Build and push + +All 13 images are built on a single runner and pushed to GHCR with an immutable `sha-<7chars>` tag: + +| Image | Source | +|----------------------|---------------------------------------------| +| `base` | `backend/Dockerfile.base` | +| `backend` | `backend/Dockerfile` | +| `coordinator` | `backend/workers/Dockerfile.coordinator` | +| `k8s-worker` | `backend/workers/Dockerfile.k8s_worker` | +| `pod-monitor` | `backend/workers/Dockerfile.pod_monitor` | +| `result-processor` | `backend/workers/Dockerfile.result_processor` | +| `saga-orchestrator` | `backend/workers/Dockerfile.saga_orchestrator` | +| `event-replay` | `backend/workers/Dockerfile.event_replay` | +| `dlq-processor` | `backend/workers/Dockerfile.dlq_processor` | +| `cert-generator` | `cert-generator/Dockerfile` | +| `zookeeper-certgen` | `backend/zookeeper/Dockerfile.certgen` | +| `frontend-dev` | `frontend/Dockerfile` | +| `frontend` | `frontend/Dockerfile.prod` | + +The base image is cached separately as a zstd-compressed tarball since its dependencies rarely change. Worker images +depend on it via `--build-context base=docker-image://integr8scode-base:latest`. Utility and frontend images use GHA +layer caching. + +All 13 images are pushed to GHCR in parallel, with each push tracked by PID so individual failures are reported: + +```bash +declare -A PIDS +for name in base backend coordinator k8s-worker ...; do + docker push "$IMG/$name:$TAG" & + PIDS[$name]=$!
+done +FAILED=0 +for name in "${!PIDS[@]}"; do + if ! wait "${PIDS[$name]}"; then + echo "::error::Failed to push $name" + FAILED=1 + fi +done +[ "$FAILED" -eq 0 ] || exit 1 +``` -1. **Unit tests (parallel)**: Backend and frontend unit tests run simultaneously. They require no infrastructure and - complete quickly (~1-2 min each). +Fork PRs skip the GHCR push (no write access), so E2E tests only run for non-fork PRs. -2. **Image build**: After unit tests pass, all Docker images are built with GHA layer caching. +### Phase 3: E2E tests -3. **E2E tests (parallel)**: Backend and frontend E2E tests run in parallel on separate runners, each with its own - isolated stack (k3s + docker compose): - - Backend E2E tests (pytest with k8s) - - Frontend E2E tests (Playwright) +Backend and frontend E2E tests run on separate runners. Each runner provisions its own k3s cluster and docker compose +stack, pulling pre-built images from GHCR. + +#### E2E Boot (`.github/actions/e2e-boot`) + +This action kicks off three slow tasks that can overlap: + +1. **GHCR login** using `docker/login-action@v3` +2. **Background image pull + infra pre-warm** — pulls all compose images then starts infrastructure services + (mongo, redis, kafka, zookeeper, schema-registry) in a background `nohup` process. The exit status is persisted + to `/tmp/infra-pull.exit` so the next action can check for failures. +3. **k3s install** — downloads and installs a pinned k3s version with SHA256 checksum verification (see + [supply-chain hardening](#supply-chain-hardening) below) + +#### E2E Ready (`.github/actions/e2e-ready`) + +This action finalizes the environment after boot tasks complete: + +1. **Finalize k3s** — copies kubeconfig, rewrites the API server address to `host.docker.internal` so containers + inside docker compose can reach the k3s API server, creates the `integr8scode` namespace +2. **Start cert-generator** in the background +3. 
**Copy test config** — uses `config.test.toml` and `secrets.example.toml` +4. **Wait for image pull and infra** — blocks until the background pull completes and checks the exit code from + `/tmp/infra-pull.exit`, failing fast if the background process had errors +5. **Start compose stack** with `docker compose up -d --no-build` +6. **Health checks** — waits for backend (`/api/v1/health/live`), and optionally frontend (`https://localhost:5001`) +7. **Seed test users** via `scripts/seed_users.py` + +#### Frontend E2E sharding + +Frontend E2E tests use Playwright with 2 shards running in parallel on separate runners. Between `e2e-boot` and +`e2e-ready`, each shard installs Node.js dependencies and Playwright browsers (with caching), overlapping that work +with k3s booting in the background. + +``` +e2e-boot (GHCR login + pull + k3s install) + | + ├── npm ci + playwright install (overlapped with k3s) + | +e2e-ready (finalize k3s + start stack + health check) + | + └── npx playwright test --shard=N/2 +``` ### Coverage reporting -Each test suite reports coverage to [Codecov](https://codecov.io/): -- `backend-unit` flag for unit tests -- `backend-e2e` flag for E2E tests -- `frontend-unit` flag for frontend unit tests +Each test suite reports coverage to [Codecov](https://codecov.io/) with separate flags: + +- `backend-unit` — backend unit tests +- `backend-e2e` — backend E2E tests +- `frontend-unit` — frontend unit tests (Vitest with `lcov` output) + +### Log collection on failure + +When E2E tests fail, logs are collected automatically and uploaded as artifacts: + +- All docker compose service logs with timestamps +- Individual service logs for each worker +- Kubernetes events sorted by timestamp (backend E2E only) + +## Docker Scan & Promote + +This workflow implements the promotion model: the `latest` tag is never set during the build. Only this workflow +sets it, and only after all tests pass. + +```mermaid +graph LR + ST["Stack Tests
<br/>(main, success)"] -->|workflow_run trigger| Scan + Scan["Trivy Scan<br/>(12 images in parallel)"] --> Promote["crane copy<br/>
sha-xxx → latest"] + Promote --> Summary["Step Summary"] +``` + +### Trigger + +Runs automatically when `Stack Tests` completes successfully on `main`. Can also be triggered manually via +`workflow_dispatch` with an optional SHA input to promote a specific commit. + +### Scan + +Uses [Trivy](https://trivy.dev/) (pinned at `v0.68.2`) to scan all 12 deployed images in parallel via matrix strategy. +Scans for `CRITICAL` and `HIGH` severity vulnerabilities with unfixed issues ignored. Results are uploaded as SARIF +files to GitHub's Security tab. + +### Promote + +Uses [crane](https://github.com/google/go-containerregistry/blob/main/cmd/crane/README.md) to copy manifests at the +registry level (`crane copy sha-tag latest`), avoiding any rebuild or re-push. This is a fast, atomic operation that +simply re-tags existing image manifests. + +## SBOM & Supply Chain Security + +The `sbom-compliance.yml` workflow generates [SPDX](https://spdx.dev/) Software Bills of Materials for both backend +(Python) and frontend (JavaScript) components. It runs on every push/PR to `main` and weekly on a schedule. + +For each component: + +1. **Generate SBOM** using [anchore/sbom-action](https://github.com/anchore/sbom-action) — produces an SPDX JSON file + listing all direct and transitive dependencies +2. **Scan SBOM** using [anchore/scan-action](https://github.com/anchore/scan-action) (Grype) — checks for known + vulnerabilities with a `high` severity cutoff +3. **Upload** — SBOM artifacts are retained for 5 days; vulnerability results are uploaded as SARIF to GitHub's + Security tab + +## Supply-chain hardening + +### k3s version pinning and checksum verification + +The k3s installation in CI is hardened against supply-chain attacks: + +1. **Pinned version** — `K3S_VERSION` is set as a workflow-level env var (`v1.32.11+k3s1`), not fetched dynamically +2. 
**Source pinning** — the install script is fetched from the k3s GitHub repository at the exact tagged version + (e.g., `https://raw.githubusercontent.com/k3s-io/k3s/v1.32.11%2Bk3s1/install.sh`), not from the `get.k3s.io` CDN +3. **SHA256 verification** — the install script is verified against a known checksum before execution: + +```bash +K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g') +curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh +echo "$K3S_INSTALL_SHA256 /tmp/k3s-install.sh" | sha256sum -c - +chmod +x /tmp/k3s-install.sh +INSTALL_K3S_VERSION="$K3S_VERSION" ... /tmp/k3s-install.sh +``` + +This prevents the common `curl | sh` anti-pattern where a compromised CDN or MITM could inject malicious code. + +### GHCR image tags + +Images are tagged with `sha-<7chars>` (immutable, tied to a specific commit) during build. The `latest` tag is only +applied by the Docker Scan & Promote workflow after all tests and security scans pass. This means: + +- Every E2E test runs against exactly the images built from that commit +- `latest` is never stale or untested +- Any commit's images can be pulled by their SHA tag for debugging + +### Dependency pinning + +All GitHub Actions are pinned to major versions (e.g., `actions/checkout@v6`, `docker/build-push-action@v6`). Trivy is +pinned to a specific version (`aquasecurity/trivy-action@0.33.1`) for scan reproducibility. + +## Linting and type checking + +Three lightweight workflows run independently since they catch obvious issues quickly. 
+ +**Backend (Python):** + +- [Ruff](https://docs.astral.sh/ruff/) checks for style violations, import ordering, and common bugs +- [mypy](https://mypy.readthedocs.io/) with strict settings catches type mismatches and missing return types + +**Frontend (TypeScript/Svelte):** + +- ESLint checks for code quality issues +- `svelte-check` verifies TypeScript types and Svelte component correctness + +Both use dependency caching ([uv](https://docs.astral.sh/uv/) for Python, npm for Node.js) to skip reinstallation +when lockfiles haven't changed. + +## Security scanning + +The `security.yml` workflow uses [Bandit](https://bandit.readthedocs.io/) to perform static analysis on Python source +files, flagging issues like hardcoded credentials, SQL injection patterns, and unsafe deserialization. It excludes the +test directory and reports only medium-severity and above findings. Container-level vulnerability scanning with Trivy +runs as part of the [Docker Scan & Promote](#docker-scan--promote) workflow. ## Documentation @@ -189,6 +334,47 @@ the [Material theme](https://squidfunk.github.io/mkdocs-material/). It triggers On pushes to main, the workflow deploys the built site to GitHub Pages. +## Build optimizations + +### Docker layer caching + +All image builds use [docker/build-push-action](https://github.com/docker/build-push-action) with GitHub Actions +cache. Each service has its own cache scope, preventing pollution between unrelated builds: + +```yaml +- name: Build cert-generator image + uses: docker/build-push-action@v6 + with: + context: ./cert-generator + file: ./cert-generator/Dockerfile + load: true + tags: integr8scode-cert-generator:latest + cache-from: type=gha,scope=cert-generator + cache-to: type=gha,mode=max,scope=cert-generator +``` + +### Base image caching + +The base image (Python + all pip dependencies) changes infrequently, so it's cached as a zstd-compressed tarball keyed +on `Dockerfile.base`, `pyproject.toml`, and `uv.lock`. 
On cache hit the image is loaded directly with `docker load`, +skipping the entire build. + +### Background infra pre-warm + +The `e2e-boot` action pulls all docker compose images and starts infrastructure services _in the background_ while k3s +installs. This overlaps network-bound (image pull) and CPU-bound (k3s installation) work, saving several minutes per +E2E job. + +### Frontend Playwright caching + +Playwright browsers are cached by `package-lock.json` hash. On cache hit, only system dependencies are installed +(`playwright install-deps chromium`), skipping the browser download. + +### Parallel image push + +All 13 images are pushed to GHCR concurrently using background processes with PID tracking. Each push failure is +reported individually via `::error::` annotations. + ## Running locally You can run most checks locally before pushing. @@ -197,10 +383,10 @@ You can run most checks locally before pushing. cd backend # Linting -uv run ruff check . +uv run ruff check . --config pyproject.toml # Type checking -uv run mypy . +uv run mypy --config-file pyproject.toml --strict . # Security scan uv tool run bandit -r . -x tests/ -ll @@ -216,76 +402,23 @@ cd frontend npm run lint # Type checking -npx tsc --noEmit +npm run check # Unit tests npm run test ``` -For E2E tests, use the same deployment as CI: +For E2E tests, use the deployment script to bring up the full stack: ```bash -# Start full stack (requires k8s configured locally) -./deploy.sh dev +# Start full stack with k8s configured locally +./deploy.sh dev --wait -# Run tests inside the running backend container +# Run backend E2E tests inside the running container docker compose exec -T backend uv run pytest tests/e2e -v # Run frontend E2E tests cd frontend && npx playwright test ``` -Or use `./deploy.sh test` which handles everything automatically. - -## Build optimizations - -The CI pipeline employs several caching strategies to minimize build times.
- -### Docker layer caching - -All image builds use [docker/build-push-action](https://github.com/docker/build-push-action) with GitHub Actions cache: - -```yaml -- name: Build base image - uses: docker/build-push-action@v6 - with: - context: ./backend - file: ./backend/Dockerfile.base - load: true - tags: integr8scode-base:latest - cache-from: type=gha,scope=backend-base - cache-to: type=gha,mode=max,scope=backend-base -``` - -Each service has its own cache scope (`backend-base`, `backend`, `frontend`, `cert-generator`), preventing cache -pollution between unrelated builds. - -### Infrastructure image caching - -A reusable action at `.github/actions/docker-cache` handles infrastructure images (MongoDB, Redis, Kafka, Schema -Registry). It stores pulled images as zstd-compressed tarballs in the GitHub Actions cache, saving ~30 seconds per run -and avoiding Docker Hub rate limits. - -### k3s setup action - -A reusable composite action at `.github/actions/k3s-setup` handles Kubernetes setup: -- Installs k3s with traefik disabled -- Creates the `integr8scode` namespace -- Generates a kubeconfig accessible from Docker containers (via `host.docker.internal`) - -This eliminates copy-paste across workflows and ensures consistent k8s setup. 
- -## Workflow files - -| Workflow | File | Purpose | -|--------------------|--------------------------------------|------------------------------------| -| Ruff Linting | `.github/workflows/ruff.yml` | Python code style and import checks | -| MyPy Type Checking | `.github/workflows/mypy.yml` | Python static type analysis | -| Frontend CI | `.github/workflows/frontend-ci.yml` | TypeScript lint and type check | -| Security Scanning | `.github/workflows/security.yml` | Bandit SAST | -| Docker Build & Scan| `.github/workflows/docker.yml` | Image build and Trivy scan | -| Stack Tests | `.github/workflows/stack-tests.yml` | All unit and E2E tests | -| Documentation | `.github/workflows/docs.yml` | MkDocs build and deploy | - -All workflows use [uv](https://docs.astral.sh/uv/) for Python dependency management and npm for Node.js, with caching -enabled for both. +Or use `./deploy.sh test` which handles stack setup, testing, and teardown automatically. From 595c2238c17adbd3d2cfcdf2354dbaa7ff2737ba Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Sun, 1 Feb 2026 00:54:40 +0100 Subject: [PATCH 17/17] clarified 12/13 images in docs --- docs/operations/cicd.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/operations/cicd.md b/docs/operations/cicd.md index b19f45f9..0db2fee6 100644 --- a/docs/operations/cicd.md +++ b/docs/operations/cicd.md @@ -138,6 +138,10 @@ All 13 images are built on a single runner and pushed to GHCR with an immutable | `frontend-dev` | `frontend/Dockerfile` | | `frontend` | `frontend/Dockerfile.prod` | +Of these 13 images, 12 are scanned by Trivy and promoted to `latest` in the +[Docker Scan & Promote](#docker-scan--promote) workflow. The `frontend-dev` image is excluded — it's the Vite dev +server build used only for E2E tests in CI and is never deployed to production. + The base image is cached separately as a zstd-compressed tarball since its dependencies rarely change. 
Worker images depend on it via `--build-context base=docker-image://integr8scode-base:latest`. Utility and frontend images use GHA layer caching.