From 27c41f5696fcc61c279a158c02e4f0d8e318269b Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 17:22:02 +0100
Subject: [PATCH 01/17] Make deploy quicker: shorter healthcheck intervals,
 observability now optional

---
 deploy.sh           | 23 +++++++++++++++++++----
 docker-compose.yaml | 14 +++++++-------
 2 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/deploy.sh b/deploy.sh
index f25c480f..68f8b25a 100755
--- a/deploy.sh
+++ b/deploy.sh
@@ -59,6 +59,8 @@ show_help() {
     echo "                     --build             Rebuild images"
     echo "                     --wait              Wait for services to be healthy"
     echo "                     --timeout <secs>    Health check timeout (default: 300)"
+    echo "                     --observability     Include Grafana, Jaeger, etc."
+    echo "                     --debug             Include observability + Kafdrop"
     echo "  infra [options]    Start infrastructure only (mongo, redis, kafka, etc.)"
     echo "                     --wait              Wait for services to be healthy"
     echo "                     --timeout <secs>    Health check timeout (default: 120)"
@@ -99,6 +101,7 @@ cmd_dev() {
     local BUILD_FLAG=""
     local WAIT_FLAG=""
     local WAIT_TIMEOUT="300"
+    local PROFILE_FLAGS=""
 
     while [[ $# -gt 0 ]]; do
         case "$1" in
@@ -113,6 +116,14 @@ cmd_dev() {
                 shift
                 WAIT_TIMEOUT="$1"
                 ;;
+            --observability)
+                PROFILE_FLAGS="--profile observability"
+                print_info "Including observability stack (Grafana, Jaeger, etc.)"
+                ;;
+            --debug)
+                PROFILE_FLAGS="--profile observability --profile debug"
+                print_info "Including observability + debug tools (Kafdrop, etc.)"
+                ;;
         esac
         shift
     done
@@ -122,7 +133,7 @@ cmd_dev() {
         WAIT_TIMEOUT_FLAG="--wait-timeout $WAIT_TIMEOUT"
     fi
 
-    docker compose --profile observability up -d $BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG
+    docker compose $PROFILE_FLAGS up -d $BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG
 
     echo ""
     print_success "Development environment started!"
@@ -130,9 +141,13 @@ cmd_dev() {
     echo "Services:"
     echo "  Backend:   https://localhost:443"
     echo "  Frontend:  https://localhost:5001"
-    echo "  Kafdrop:   http://localhost:9000"
-    echo "  Jaeger:    http://localhost:16686"
-    echo "  Grafana:   http://localhost:3000"
+    if [[ "$PROFILE_FLAGS" == *"debug"* ]]; then
+        echo "  Kafdrop:   http://localhost:9000"
+    fi
+    if [[ "$PROFILE_FLAGS" == *"observability"* ]]; then
+        echo "  Jaeger:    http://localhost:16686"
+        echo "  Grafana:   http://localhost:3000"
+    fi
     echo ""
     echo "Commands:"
     echo "  ./deploy.sh logs             # View all logs"
diff --git a/docker-compose.yaml b/docker-compose.yaml
index bbbb40a1..8d3cd5fd 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -10,7 +10,7 @@ services:
     image: alpine:latest
     volumes:
       - shared_ca:/shared_ca
-    command: sh -c "mkdir -p /shared_ca && chmod 777 /shared_ca && echo 'Shared CA directory ready' && sleep 2"
+    command: sh -c "mkdir -p /shared_ca && chmod 777 /shared_ca && echo 'Shared CA directory ready'"
     networks:
       - app-network
 
@@ -74,10 +74,10 @@ services:
       - app-network
     healthcheck:
       test: ["CMD", "redis-cli", "ping"]
-      interval: 10s
+      interval: 5s
       timeout: 5s
       retries: 5
-      start_period: 10s
+      start_period: 5s
 
   backend:
     build:
@@ -120,12 +120,11 @@ services:
     extra_hosts:
       - "host.docker.internal:host-gateway"
     healthcheck:
-      # Simpler, reliable healthcheck: curl fails non-zero for HTTP >=400 with -f
       test: ["CMD-SHELL", "curl -k -f -s https://localhost:443/api/v1/health/live >/dev/null || exit 1"]
       interval: 3s
       timeout: 3s
       retries: 50
-      start_period: 10s
+      start_period: 5s
 
   frontend:
     container_name: frontend
@@ -154,7 +153,7 @@ services:
       interval: 3s
       timeout: 3s
       retries: 30
-      start_period: 10s
+      start_period: 5s
 
 
   grafana:
@@ -322,7 +321,7 @@ services:
       interval: 5s
       timeout: 10s
       retries: 12
-      start_period: 15s
+      start_period: 5s
 
   schema-registry:
     image: confluentinc/cp-schema-registry:7.8.2
@@ -348,6 +347,7 @@ services:
   kafdrop:
     image: obsidiandynamics/kafdrop:3.31.0
     container_name: kafdrop
+    profiles: ["debug"]
     depends_on:
       - kafka
       - schema-registry

From da2ef689ae0899a0c072bbebf9f062a16814967f Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 18:43:55 +0100
Subject: [PATCH 02/17] CI: push SHA-tagged images to GHCR, scan and promote to latest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  stack-tests.yml — Build, Test, Push

  - Removed dev from branch triggers (only main now)
  - Added tags: ['v*'] and cert-generator/** path triggers
  - Build job now pushes to GHCR with an immutable sha-<7-char commit SHA> tag (non-PR events only, i.e. push and workflow_dispatch)
  - Added missing pre-builds: event-replay, dlq-processor, zookeeper-certgen (these were being rebuilt during compose startup before)
  - Added frontend-prod build (from Dockerfile.prod, pushed as frontend:sha-xxx for Trivy scanning)
  - E2E jobs pull from GHCR on non-PR events (backgrounded `docker pull ... &` commands run in parallel, then images are retagged to the compose names); PRs fall back to the artifact
  - All push/pull commands are spelled out explicitly (no for loops)
  - Added packages: write permission to build job

  docker.yml — Scan & Promote (rewritten)

  - Trigger: workflow_run on "Stack Tests" completion (+ workflow_dispatch with optional SHA input)
  - Only runs when Stack Tests succeed on main
  - Scan jobs: Trivy scans backend and frontend-prod from GHCR using SHA tag
  - Promote job: crane copy sha-xxx → latest for all 12 images — registry-level manifest copy, no rebuild
  - latest is NEVER set during build — only after all tests + scans pass

  Flow

  Push to main:
    stack-tests.yml: unit → build (push sha-xxx to GHCR) → E2E (pull from GHCR)
    docker.yml:      (on success) → scan → promote sha-xxx → latest

  PR:
    stack-tests.yml: unit → build (save artifact) → E2E (load artifact)
    docker.yml:      (skipped — only triggers on main)
---
 .github/workflows/docker.yml      | 317 +++++++++++-------------------
 .github/workflows/stack-tests.yml | 218 ++++++++++++++++++--
 2 files changed, 319 insertions(+), 216 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 2cdd4f40..c6535c54 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -1,213 +1,52 @@
-name: Docker Build, Scan & Publish
+name: Docker Scan & Promote
 
+# Runs after Stack Tests completes on main — promotes sha-xxx → latest.
+# "latest" is NEVER set during build. Only this workflow can set it,
+# and only after all tests pass. If any test fails, latest stays unchanged.
 on:
-  push:
-    branches: [ main ]
-    tags: [ 'v*' ]
-  pull_request:
-    branches: [ main ]
+  workflow_run:
+    workflows: ["Stack Tests"]
+    types: [completed]
   workflow_dispatch:
+    inputs:
+      sha:
+        description: 'Full commit SHA to promote (defaults to latest main)'
+        required: false
 
 env:
   REGISTRY: ghcr.io
 
 jobs:
-  build-base:
-    name: Build Base
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-
-    outputs:
-      image-tag: ${{ steps.image-tag.outputs.tag }}
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Set lowercase image prefix
-        run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/base
-          tags: |
-            type=ref,event=branch
-            type=ref,event=pr
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=sha,prefix=sha-
-            type=raw,value=latest,enable={{is_default_branch}}
-
-      - name: Determine image tag for dependent builds
-        id: image-tag
-        run: |
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            echo "tag=pr-${{ github.event.number }}" >> $GITHUB_OUTPUT
-          else
-            echo "tag=latest" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Build and push
-        uses: docker/build-push-action@v6
-        with:
-          context: ./backend
-          file: ./backend/Dockerfile.base
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,scope=base
-          cache-to: type=gha,mode=max,scope=base
-
-  build-backend:
-    name: Build Backend
-    needs: build-base
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-
-    outputs:
-      image-ref: ${{ steps.image-ref.outputs.ref }}
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Set lowercase image prefix
-        run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend
-          tags: |
-            type=ref,event=branch
-            type=ref,event=pr
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=sha,prefix=sha-
-            type=raw,value=latest,enable={{is_default_branch}}
-
-      - name: Set image reference for scan
-        id: image-ref
-        run: |
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend:pr-${{ github.event.number }}" >> $GITHUB_OUTPUT
-          else
-            echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend:latest" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Build and push
-        uses: docker/build-push-action@v6
-        with:
-          context: ./backend
-          file: ./backend/Dockerfile
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,scope=backend
-          cache-to: type=gha,mode=max,scope=backend
-          build-contexts: |
-            base=docker-image://${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/base:${{ needs.build-base.outputs.image-tag }}
-
-  build-frontend:
-    name: Build Frontend
-    needs: build-base
+  scan-backend:
+    name: Scan Backend
+    if: >
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.workflow_run.conclusion == 'success' &&
+       github.event.workflow_run.head_branch == 'main')
     runs-on: ubuntu-latest
     permissions:
       contents: read
-      packages: write
-
-    outputs:
-      image-ref: ${{ steps.image-ref.outputs.ref }}
-
+      security-events: write
+      packages: read
     steps:
       - uses: actions/checkout@v6
 
-      - name: Set lowercase image prefix
-        run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend
-          tags: |
-            type=ref,event=branch
-            type=ref,event=pr
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=sha,prefix=sha-
-            type=raw,value=latest,enable={{is_default_branch}}
-
-      - name: Set image reference for scan
-        id: image-ref
+      - name: Compute image ref
+        id: ref
         run: |
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend:pr-${{ github.event.number }}" >> $GITHUB_OUTPUT
+          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            SHA="${{ github.event.inputs.sha || github.sha }}"
           else
-            echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend:latest" >> $GITHUB_OUTPUT
+            SHA="${{ github.event.workflow_run.head_sha }}"
           fi
-
-      - name: Build and push
-        uses: docker/build-push-action@v6
-        with:
-          context: ./frontend
-          file: ./frontend/Dockerfile.prod
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,scope=frontend
-          cache-to: type=gha,mode=max,scope=frontend
-
-  scan-backend:
-    name: Scan Backend
-    needs: build-backend
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      security-events: write
-
-    steps:
-      - uses: actions/checkout@v6
+          TAG="sha-${SHA::7}"
+          echo "image=${{ env.REGISTRY }}/$PREFIX/backend:$TAG" >> $GITHUB_OUTPUT
 
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@0.33.1
         with:
-          image-ref: ${{ needs.build-backend.outputs.image-ref }}
+          image-ref: ${{ steps.ref.outputs.image }}
           format: 'sarif'
           output: 'trivy-backend-results.sarif'
           ignore-unfixed: true
@@ -225,17 +64,32 @@ jobs:
 
   scan-frontend:
     name: Scan Frontend
-    needs: build-frontend
+    if: >
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.workflow_run.conclusion == 'success' &&
+       github.event.workflow_run.head_branch == 'main')
     runs-on: ubuntu-latest
     permissions:
       contents: read
       security-events: write
-
+      packages: read
     steps:
+      - name: Compute image ref
+        id: ref
+        run: |
+          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            SHA="${{ github.event.inputs.sha || github.sha }}"
+          else
+            SHA="${{ github.event.workflow_run.head_sha }}"
+          fi
+          TAG="sha-${SHA::7}"
+          echo "image=${{ env.REGISTRY }}/$PREFIX/frontend:$TAG" >> $GITHUB_OUTPUT
+
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@0.33.1
         with:
-          image-ref: ${{ needs.build-frontend.outputs.image-ref }}
+          image-ref: ${{ steps.ref.outputs.image }}
           format: 'sarif'
           output: 'trivy-frontend-results.sarif'
           ignore-unfixed: true
@@ -250,26 +104,79 @@ jobs:
           sarif_file: 'trivy-frontend-results.sarif'
           category: 'trivy-frontend'
 
+  # Promote SHA tag → latest using crane (registry-level manifest copy, no rebuild)
+  promote:
+    name: Promote to Latest
+    needs: [scan-backend, scan-frontend]
+    if: >
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.workflow_run.conclusion == 'success' &&
+       github.event.workflow_run.head_branch == 'main')
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+    steps:
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Install crane
+        uses: imjasonh/setup-crane@v0.4
+
+      - name: Promote images (SHA → latest)
+        run: |
+          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            SHA="${{ github.event.inputs.sha || github.sha }}"
+          else
+            SHA="${{ github.event.workflow_run.head_sha }}"
+          fi
+          TAG="sha-${SHA::7}"
+
+          echo "Promoting tag: $TAG → latest"
+          echo ""
+
+          crane copy "$REGISTRY/$PREFIX/base:$TAG" "$REGISTRY/$PREFIX/base:latest"
+          crane copy "$REGISTRY/$PREFIX/backend:$TAG" "$REGISTRY/$PREFIX/backend:latest"
+          crane copy "$REGISTRY/$PREFIX/frontend:$TAG" "$REGISTRY/$PREFIX/frontend:latest"
+          crane copy "$REGISTRY/$PREFIX/coordinator:$TAG" "$REGISTRY/$PREFIX/coordinator:latest"
+          crane copy "$REGISTRY/$PREFIX/k8s-worker:$TAG" "$REGISTRY/$PREFIX/k8s-worker:latest"
+          crane copy "$REGISTRY/$PREFIX/pod-monitor:$TAG" "$REGISTRY/$PREFIX/pod-monitor:latest"
+          crane copy "$REGISTRY/$PREFIX/result-processor:$TAG" "$REGISTRY/$PREFIX/result-processor:latest"
+          crane copy "$REGISTRY/$PREFIX/saga-orchestrator:$TAG" "$REGISTRY/$PREFIX/saga-orchestrator:latest"
+          crane copy "$REGISTRY/$PREFIX/event-replay:$TAG" "$REGISTRY/$PREFIX/event-replay:latest"
+          crane copy "$REGISTRY/$PREFIX/dlq-processor:$TAG" "$REGISTRY/$PREFIX/dlq-processor:latest"
+          crane copy "$REGISTRY/$PREFIX/cert-generator:$TAG" "$REGISTRY/$PREFIX/cert-generator:latest"
+          crane copy "$REGISTRY/$PREFIX/zookeeper-certgen:$TAG" "$REGISTRY/$PREFIX/zookeeper-certgen:latest"
+
   summary:
     name: Summary
-    if: github.event_name != 'pull_request'
-    needs: [build-base, build-backend, build-frontend, scan-backend, scan-frontend]
+    needs: [promote]
     runs-on: ubuntu-latest
-
     steps:
-      - name: Set lowercase image prefix
-        run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV
-
       - name: Generate summary
         run: |
-          echo "## Docker Images Published" >> $GITHUB_STEP_SUMMARY
+          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            SHA="${{ github.event.inputs.sha || github.sha }}"
+          else
+            SHA="${{ github.event.workflow_run.head_sha }}"
+          fi
+          TAG="sha-${SHA::7}"
+
+          echo "## Docker Images Promoted to Latest" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "All Stack Tests passed. Images promoted from \`$TAG\` to \`latest\`." >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "| Image | Pull Command |" >> $GITHUB_STEP_SUMMARY
           echo "|-------|--------------|" >> $GITHUB_STEP_SUMMARY
-          echo "| Base | \`docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/base:latest\` |" >> $GITHUB_STEP_SUMMARY
-          echo "| Backend | \`docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend:latest\` |" >> $GITHUB_STEP_SUMMARY
-          echo "| Frontend | \`docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend:latest\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| Base | \`docker pull $REGISTRY/$PREFIX/base:latest\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| Backend | \`docker pull $REGISTRY/$PREFIX/backend:latest\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| Frontend | \`docker pull $REGISTRY/$PREFIX/frontend:latest\` |" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
-          echo "### Scan Results" >> $GITHUB_STEP_SUMMARY
-          echo "- Backend scan: ✅ Passed" >> $GITHUB_STEP_SUMMARY
-          echo "- Frontend scan: ✅ Passed" >> $GITHUB_STEP_SUMMARY
+          echo "### Security Scans" >> $GITHUB_STEP_SUMMARY
+          echo "- Backend: Passed" >> $GITHUB_STEP_SUMMARY
+          echo "- Frontend: Passed" >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index c2804f73..f5257e4b 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -2,19 +2,22 @@ name: Stack Tests
 
 on:
   push:
-    branches: [main, dev]
+    branches: [main]
+    tags: ['v*']
     paths:
       - 'backend/**'
       - 'frontend/**'
+      - 'cert-generator/**'
       - 'docker-compose.yaml'
       - 'deploy.sh'
       - '.github/workflows/stack-tests.yml'
       - '.github/actions/**'
   pull_request:
-    branches: [main, dev]
+    branches: [main]
     paths:
       - 'backend/**'
       - 'frontend/**'
+      - 'cert-generator/**'
       - 'docker-compose.yaml'
       - 'deploy.sh'
       - '.github/workflows/stack-tests.yml'
@@ -22,6 +25,7 @@ on:
   workflow_dispatch:
 
 env:
+  REGISTRY: ghcr.io
   MONGO_IMAGE: mongo:8.0
   REDIS_IMAGE: redis:7-alpine
   KAFKA_IMAGE: confluentinc/cp-kafka:7.8.2
@@ -102,17 +106,41 @@ jobs:
           fail_ci_if_error: false
           verbose: true
 
-  # Build all images once, cache for test jobs
+  # Build all images, push to GHCR with immutable SHA tag (push events only).
+  # PRs fall back to artifact transfer (can't push to GHCR from forks).
   build-images:
-    name: Build Images
+    name: Build & Push Images
     needs: [backend-unit, frontend-unit]
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      sha-tag: ${{ steps.tags.outputs.sha-tag }}
+      image-prefix: ${{ steps.tags.outputs.image-prefix }}
     steps:
       - uses: actions/checkout@v6
 
       - name: Setup Docker Buildx
         uses: docker/setup-buildx-action@v3
 
+      - name: Log in to GHCR
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Compute image tags
+        id: tags
+        run: |
+          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
+          SHA_TAG="sha-${GITHUB_SHA::7}"
+          echo "sha-tag=$SHA_TAG" >> $GITHUB_OUTPUT
+          echo "image-prefix=$PREFIX" >> $GITHUB_OUTPUT
+
+      # ── Base image (cached separately — rarely changes) ──────────────
       - name: Cache base image
         uses: actions/cache@v5
         id: base-cache
@@ -139,15 +167,50 @@ jobs:
         if: steps.base-cache.outputs.cache-hit != 'true'
         run: docker save integr8scode-base:latest | zstd -T0 -3 > /tmp/base-image.tar.zst
 
+      - name: Push base to GHCR
+        if: github.event_name != 'pull_request'
+        run: |
+          docker tag integr8scode-base:latest \
+            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }}
+          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }}
+
+      # ── Backend + workers (depend on local base image) ───────────────
       - name: Build all images
         run: |
           docker build -t integr8scode-backend:latest --build-context base=docker-image://integr8scode-base:latest -f ./backend/Dockerfile ./backend
-          docker build -t integr8scode-coordinator:latest -f backend/workers/Dockerfile.coordinator --build-context base=docker-image://integr8scode-base:latest ./backend
-          docker build -t integr8scode-k8s-worker:latest -f backend/workers/Dockerfile.k8s_worker --build-context base=docker-image://integr8scode-base:latest ./backend
-          docker build -t integr8scode-pod-monitor:latest -f backend/workers/Dockerfile.pod_monitor --build-context base=docker-image://integr8scode-base:latest ./backend
-          docker build -t integr8scode-result-processor:latest -f backend/workers/Dockerfile.result_processor --build-context base=docker-image://integr8scode-base:latest ./backend
-          docker build -t integr8scode-saga-orchestrator:latest -f backend/workers/Dockerfile.saga_orchestrator --build-context base=docker-image://integr8scode-base:latest ./backend
-
+          docker build -t integr8scode-coordinator:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.coordinator ./backend
+          docker build -t integr8scode-k8s-worker:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.k8s_worker ./backend
+          docker build -t integr8scode-pod-monitor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.pod_monitor ./backend
+          docker build -t integr8scode-result-processor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.result_processor ./backend
+          docker build -t integr8scode-saga-orchestrator:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.saga_orchestrator ./backend
+          docker build -t integr8scode-event-replay:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.event_replay ./backend
+          docker build -t integr8scode-dlq-processor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.dlq_processor ./backend
+
+      - name: Push backend and workers to GHCR
+        if: github.event_name != 'pull_request'
+        env:
+          TAG: ${{ steps.tags.outputs.sha-tag }}
+          IMG: ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}
+        run: |
+          docker tag integr8scode-backend:latest "$IMG/backend:$TAG"
+          docker tag integr8scode-coordinator:latest "$IMG/coordinator:$TAG"
+          docker tag integr8scode-k8s-worker:latest "$IMG/k8s-worker:$TAG"
+          docker tag integr8scode-pod-monitor:latest "$IMG/pod-monitor:$TAG"
+          docker tag integr8scode-result-processor:latest "$IMG/result-processor:$TAG"
+          docker tag integr8scode-saga-orchestrator:latest "$IMG/saga-orchestrator:$TAG"
+          docker tag integr8scode-event-replay:latest "$IMG/event-replay:$TAG"
+          docker tag integr8scode-dlq-processor:latest "$IMG/dlq-processor:$TAG"
+
+          docker push "$IMG/backend:$TAG"
+          docker push "$IMG/coordinator:$TAG"
+          docker push "$IMG/k8s-worker:$TAG"
+          docker push "$IMG/pod-monitor:$TAG"
+          docker push "$IMG/result-processor:$TAG"
+          docker push "$IMG/saga-orchestrator:$TAG"
+          docker push "$IMG/event-replay:$TAG"
+          docker push "$IMG/dlq-processor:$TAG"
+
+      # ── Utility images (GHA-cached, independent of base) ────────────
       - name: Build cert-generator image
         uses: docker/build-push-action@v6
         with:
@@ -158,6 +221,31 @@ jobs:
           cache-from: type=gha,scope=cert-generator
           cache-to: type=gha,mode=max,scope=cert-generator
 
+      - name: Push cert-generator to GHCR
+        if: github.event_name != 'pull_request'
+        run: |
+          docker tag integr8scode-cert-generator:latest \
+            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ steps.tags.outputs.sha-tag }}
+          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ steps.tags.outputs.sha-tag }}
+
+      - name: Build zookeeper-certgen image
+        uses: docker/build-push-action@v6
+        with:
+          context: ./backend/zookeeper
+          file: ./backend/zookeeper/Dockerfile.certgen
+          load: true
+          tags: integr8scode-zookeeper-certgen:latest
+          cache-from: type=gha,scope=zookeeper-certgen
+          cache-to: type=gha,mode=max,scope=zookeeper-certgen
+
+      - name: Push zookeeper-certgen to GHCR
+        if: github.event_name != 'pull_request'
+        run: |
+          docker tag integr8scode-zookeeper-certgen:latest \
+            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }}
+          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }}
+
+      # ── Frontend (dev for E2E, prod for scanning/deployment) ─────────
       - name: Build frontend image
         uses: docker/build-push-action@v6
         with:
@@ -168,7 +256,33 @@ jobs:
           cache-from: type=gha,scope=frontend
           cache-to: type=gha,mode=max,scope=frontend
 
+      - name: Push frontend-dev to GHCR
+        if: github.event_name != 'pull_request'
+        run: |
+          docker tag integr8scode-frontend:latest \
+            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }}
+          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }}
+
+      - name: Build frontend-prod image
+        uses: docker/build-push-action@v6
+        with:
+          context: ./frontend
+          file: ./frontend/Dockerfile.prod
+          load: true
+          tags: integr8scode-frontend-prod:latest
+          cache-from: type=gha,scope=frontend-prod
+          cache-to: type=gha,mode=max,scope=frontend-prod
+
+      - name: Push frontend-prod to GHCR
+        if: github.event_name != 'pull_request'
+        run: |
+          docker tag integr8scode-frontend-prod:latest \
+            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }}
+          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }}
+
+      # ── Save images for PR builds (artifact fallback) ────────────────
       - name: Save all images
+        if: github.event_name == 'pull_request'
         run: |
           docker save \
             integr8scode-backend:latest \
@@ -177,18 +291,22 @@ jobs:
             integr8scode-pod-monitor:latest \
             integr8scode-result-processor:latest \
             integr8scode-saga-orchestrator:latest \
+            integr8scode-event-replay:latest \
+            integr8scode-dlq-processor:latest \
             integr8scode-cert-generator:latest \
+            integr8scode-zookeeper-certgen:latest \
             integr8scode-frontend:latest \
             | zstd -T0 -3 > /tmp/all-images.tar.zst
 
       - name: Upload images artifact
+        if: github.event_name == 'pull_request'
         uses: actions/upload-artifact@v6
         with:
           name: docker-images
           path: /tmp/all-images.tar.zst
           retention-days: 1
 
-  # Parallel test jobs (backend-e2e, frontend-e2e)
+  # Parallel E2E test jobs
   backend-e2e:
     name: Backend E2E Tests
     needs: [build-images]
@@ -201,13 +319,52 @@ jobs:
         with:
           images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
+      # Push events: pull pre-built images from GHCR
+      - name: Pull images from GHCR
+        if: github.event_name != 'pull_request'
+        env:
+          TAG: ${{ needs.build-images.outputs.sha-tag }}
+          IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }}
+        run: |
+          echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
+
+          docker pull "$IMG/base:$TAG" &
+          docker pull "$IMG/backend:$TAG" &
+          docker pull "$IMG/frontend-dev:$TAG" &
+          docker pull "$IMG/coordinator:$TAG" &
+          docker pull "$IMG/k8s-worker:$TAG" &
+          docker pull "$IMG/pod-monitor:$TAG" &
+          docker pull "$IMG/result-processor:$TAG" &
+          docker pull "$IMG/saga-orchestrator:$TAG" &
+          docker pull "$IMG/event-replay:$TAG" &
+          docker pull "$IMG/dlq-processor:$TAG" &
+          docker pull "$IMG/cert-generator:$TAG" &
+          docker pull "$IMG/zookeeper-certgen:$TAG" &
+          wait
+
+          docker tag "$IMG/base:$TAG" integr8scode-base:latest
+          docker tag "$IMG/backend:$TAG" integr8scode-backend:latest
+          docker tag "$IMG/frontend-dev:$TAG" integr8scode-frontend:latest
+          docker tag "$IMG/coordinator:$TAG" integr8scode-coordinator:latest
+          docker tag "$IMG/k8s-worker:$TAG" integr8scode-k8s-worker:latest
+          docker tag "$IMG/pod-monitor:$TAG" integr8scode-pod-monitor:latest
+          docker tag "$IMG/result-processor:$TAG" integr8scode-result-processor:latest
+          docker tag "$IMG/saga-orchestrator:$TAG" integr8scode-saga-orchestrator:latest
+          docker tag "$IMG/event-replay:$TAG" integr8scode-event-replay:latest
+          docker tag "$IMG/dlq-processor:$TAG" integr8scode-dlq-processor:latest
+          docker tag "$IMG/cert-generator:$TAG" integr8scode-cert-generator:latest
+          docker tag "$IMG/zookeeper-certgen:$TAG" integr8scode-zookeeper-certgen:latest
+
+      # PR events: load from artifact
       - name: Download built images
+        if: github.event_name == 'pull_request'
         uses: actions/download-artifact@v7
         with:
           name: docker-images
           path: /tmp
 
       - name: Load built images
+        if: github.event_name == 'pull_request'
         run: zstd -d -c /tmp/all-images.tar.zst | docker load
 
       - name: Setup k3s
@@ -314,13 +471,52 @@ jobs:
         with:
           images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
+      # Push events: pull pre-built images from GHCR
+      - name: Pull images from GHCR
+        if: github.event_name != 'pull_request'
+        env:
+          TAG: ${{ needs.build-images.outputs.sha-tag }}
+          IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }}
+        run: |
+          echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
+
+          docker pull "$IMG/base:$TAG" &
+          docker pull "$IMG/backend:$TAG" &
+          docker pull "$IMG/frontend-dev:$TAG" &
+          docker pull "$IMG/coordinator:$TAG" &
+          docker pull "$IMG/k8s-worker:$TAG" &
+          docker pull "$IMG/pod-monitor:$TAG" &
+          docker pull "$IMG/result-processor:$TAG" &
+          docker pull "$IMG/saga-orchestrator:$TAG" &
+          docker pull "$IMG/event-replay:$TAG" &
+          docker pull "$IMG/dlq-processor:$TAG" &
+          docker pull "$IMG/cert-generator:$TAG" &
+          docker pull "$IMG/zookeeper-certgen:$TAG" &
+          wait
+
+          docker tag "$IMG/base:$TAG" integr8scode-base:latest
+          docker tag "$IMG/backend:$TAG" integr8scode-backend:latest
+          docker tag "$IMG/frontend-dev:$TAG" integr8scode-frontend:latest
+          docker tag "$IMG/coordinator:$TAG" integr8scode-coordinator:latest
+          docker tag "$IMG/k8s-worker:$TAG" integr8scode-k8s-worker:latest
+          docker tag "$IMG/pod-monitor:$TAG" integr8scode-pod-monitor:latest
+          docker tag "$IMG/result-processor:$TAG" integr8scode-result-processor:latest
+          docker tag "$IMG/saga-orchestrator:$TAG" integr8scode-saga-orchestrator:latest
+          docker tag "$IMG/event-replay:$TAG" integr8scode-event-replay:latest
+          docker tag "$IMG/dlq-processor:$TAG" integr8scode-dlq-processor:latest
+          docker tag "$IMG/cert-generator:$TAG" integr8scode-cert-generator:latest
+          docker tag "$IMG/zookeeper-certgen:$TAG" integr8scode-zookeeper-certgen:latest
+
+      # PR events: load from artifact
       - name: Download built images
+        if: github.event_name == 'pull_request'
         uses: actions/download-artifact@v7
         with:
           name: docker-images
           path: /tmp
 
       - name: Load built images
+        if: github.event_name == 'pull_request'
         run: zstd -d -c /tmp/all-images.tar.zst | docker load
 
       - name: Setup k3s

From feeebe73ff9aa60dbe6c6f5de188e5fdec9357dc Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 18:58:07 +0100
Subject: [PATCH 03/17]  Replaced two separate scan jobs (scan-backend,
 scan-frontend) with a single matrix job (scan) that scans all 12 images in
 parallel:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  - fail-fast: false — one image's vulnerability findings don't cancel the other scans
  - Each matrix entry runs as its own parallel job on a separate runner
  - SARIF results uploaded per-image with unique categories (trivy-base, trivy-backend, etc.)
  - trivyignores: 'backend/.trivyignore' applied to all images (CVE exemptions are image-agnostic)
  - checkout@v6 included so the .trivyignore file is available

  Updated promote.needs from [scan-backend, scan-frontend] to [scan] — waits for all 12 matrix entries to pass before promoting anything to latest.

  Updated the summary security section to reflect that all 12 images are scanned.
---
 .github/workflows/docker.yml      | 81 +++++++++++--------------------
 .github/workflows/stack-tests.yml | 20 ++++++--
 2 files changed, 45 insertions(+), 56 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index c6535c54..9778e7a3 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -17,8 +17,8 @@ env:
   REGISTRY: ghcr.io
 
 jobs:
-  scan-backend:
-    name: Scan Backend
+  scan:
+    name: Scan ${{ matrix.image }}
     if: >
       github.event_name == 'workflow_dispatch' ||
       (github.event.workflow_run.conclusion == 'success' &&
@@ -28,6 +28,22 @@ jobs:
       contents: read
       security-events: write
       packages: read
+    strategy:
+      fail-fast: false
+      matrix:
+        image:
+          - base
+          - backend
+          - frontend
+          - coordinator
+          - k8s-worker
+          - pod-monitor
+          - result-processor
+          - saga-orchestrator
+          - event-replay
+          - dlq-processor
+          - cert-generator
+          - zookeeper-certgen
     steps:
       - uses: actions/checkout@v6
 
@@ -41,14 +57,14 @@ jobs:
             SHA="${{ github.event.workflow_run.head_sha }}"
           fi
           TAG="sha-${SHA::7}"
-          echo "image=${{ env.REGISTRY }}/$PREFIX/backend:$TAG" >> $GITHUB_OUTPUT
+          echo "image=${{ env.REGISTRY }}/$PREFIX/${{ matrix.image }}:$TAG" >> $GITHUB_OUTPUT
 
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@0.33.1
         with:
           image-ref: ${{ steps.ref.outputs.image }}
           format: 'sarif'
-          output: 'trivy-backend-results.sarif'
+          output: 'trivy-${{ matrix.image }}-results.sarif'
           ignore-unfixed: true
           severity: 'CRITICAL,HIGH'
           timeout: '5m0s'
@@ -59,55 +75,13 @@ jobs:
         if: always()
         uses: github/codeql-action/upload-sarif@v4
         with:
-          sarif_file: 'trivy-backend-results.sarif'
-          category: 'trivy-backend'
-
-  scan-frontend:
-    name: Scan Frontend
-    if: >
-      github.event_name == 'workflow_dispatch' ||
-      (github.event.workflow_run.conclusion == 'success' &&
-       github.event.workflow_run.head_branch == 'main')
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      security-events: write
-      packages: read
-    steps:
-      - name: Compute image ref
-        id: ref
-        run: |
-          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
-          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-            SHA="${{ github.event.inputs.sha || github.sha }}"
-          else
-            SHA="${{ github.event.workflow_run.head_sha }}"
-          fi
-          TAG="sha-${SHA::7}"
-          echo "image=${{ env.REGISTRY }}/$PREFIX/frontend:$TAG" >> $GITHUB_OUTPUT
-
-      - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@0.33.1
-        with:
-          image-ref: ${{ steps.ref.outputs.image }}
-          format: 'sarif'
-          output: 'trivy-frontend-results.sarif'
-          ignore-unfixed: true
-          severity: 'CRITICAL,HIGH'
-          timeout: '5m0s'
-          version: 'v0.68.2'
-
-      - name: Upload Trivy scan results
-        if: always()
-        uses: github/codeql-action/upload-sarif@v4
-        with:
-          sarif_file: 'trivy-frontend-results.sarif'
-          category: 'trivy-frontend'
+          sarif_file: 'trivy-${{ matrix.image }}-results.sarif'
+          category: 'trivy-${{ matrix.image }}'
 
   # Promote SHA tag → latest using crane (registry-level manifest copy, no rebuild)
   promote:
     name: Promote to Latest
-    needs: [scan-backend, scan-frontend]
+    needs: [scan]
     if: >
       github.event_name == 'workflow_dispatch' ||
       (github.event.workflow_run.conclusion == 'success' &&
@@ -169,7 +143,11 @@ jobs:
 
           echo "## Docker Images Promoted to Latest" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
-          echo "All Stack Tests passed. Images promoted from \`$TAG\` to \`latest\`." >> $GITHUB_STEP_SUMMARY
+          if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
+            echo "Images promoted manually from \`$TAG\` to \`latest\` — Stack Tests may not have run." >> $GITHUB_STEP_SUMMARY
+          else
+            echo "All Stack Tests passed. Images promoted from \`$TAG\` to \`latest\`." >> $GITHUB_STEP_SUMMARY
+          fi
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "| Image | Pull Command |" >> $GITHUB_STEP_SUMMARY
           echo "|-------|--------------|" >> $GITHUB_STEP_SUMMARY
@@ -178,5 +156,4 @@ jobs:
           echo "| Frontend | \`docker pull $REGISTRY/$PREFIX/frontend:latest\` |" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "### Security Scans" >> $GITHUB_STEP_SUMMARY
-          echo "- Backend: Passed" >> $GITHUB_STEP_SUMMARY
-          echo "- Frontend: Passed" >> $GITHUB_STEP_SUMMARY
+          echo "All 12 images scanned with Trivy (CRITICAL + HIGH, unfixed ignored)." >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index f5257e4b..08651bb8 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -320,14 +320,20 @@ jobs:
           images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
       # Push events: pull pre-built images from GHCR
+      - name: Log in to GHCR
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
       - name: Pull images from GHCR
         if: github.event_name != 'pull_request'
         env:
           TAG: ${{ needs.build-images.outputs.sha-tag }}
           IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }}
         run: |
-          echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
-
           docker pull "$IMG/base:$TAG" &
           docker pull "$IMG/backend:$TAG" &
           docker pull "$IMG/frontend-dev:$TAG" &
@@ -472,14 +478,20 @@ jobs:
           images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
       # Push events: pull pre-built images from GHCR
+      - name: Log in to GHCR
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
       - name: Pull images from GHCR
         if: github.event_name != 'pull_request'
         env:
           TAG: ${{ needs.build-images.outputs.sha-tag }}
           IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }}
         run: |
-          echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
-
           docker pull "$IMG/base:$TAG" &
           docker pull "$IMG/backend:$TAG" &
           docker pull "$IMG/frontend-dev:$TAG" &

From 3dccec636705b2bfb66350a0a1d0a1eef1f872cd Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 19:28:32 +0100
Subject: [PATCH 04/17] Use pre-built GHCR images in compose and E2E jobs: what changed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  docker-compose.yaml (+14/-1 lines): Every buildable service now has an image: field pointing to ghcr.io/hardmax71/integr8scode/{service}:${IMAGE_TAG:-latest} (the frontend service uses the frontend-dev image). kafka-init and user-seed share the backend image. Compose now knows where to pull pre-built images from.

  deploy.sh (+8/-2 lines): Added --no-build flag to cmd_dev(). Passes --no-build to compose, preventing any build fallback.

  stack-tests.yml (+18/-146 lines):
  - Build job: push condition changed from event_name != 'pull_request' to !github.event.pull_request.head.repo.fork (same-repo PRs can push to GHCR). Artifact save/upload removed entirely.
  - Both E2E jobs: Deleted all GHCR login, parallel pull, retag, artifact download, and load steps. Replaced with a single IMAGE_TAG env var on the "Start stack" step. Compose pulls SHA-tagged images from GHCR automatically using the image: fields.
  - Both E2E jobs have if: !fork guard — fork PRs skip E2E (unit tests still run).

  How it works

  | Scenario                                                | What happens                                     |
  |---------------------------------------------------------|--------------------------------------------------|
  | ./deploy.sh dev (local, first time)                     | Compose pulls latest from GHCR — no build needed |
  | ./deploy.sh dev --build (local, with changes)           | Builds locally, tags with GHCR name              |
  | CI: IMAGE_TAG=sha-xxx ./deploy.sh dev --no-build --wait | Compose pulls sha-tagged images from GHCR        |
  | ./deploy.sh prod                                        | Helm uses GHCR images (unchanged)                |
---
 .github/workflows/stack-tests.yml | 164 ++++--------------------------
 deploy.sh                         |  10 +-
 docker-compose.yaml               |  15 ++-
 3 files changed, 40 insertions(+), 149 deletions(-)

diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index 08651bb8..48f604fc 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -106,8 +106,8 @@ jobs:
           fail_ci_if_error: false
           verbose: true
 
-  # Build all images, push to GHCR with immutable SHA tag (push events only).
-  # PRs fall back to artifact transfer (can't push to GHCR from forks).
+  # Build all images, push to GHCR with immutable SHA tag.
+  # Fork PRs skip GHCR push (no write access) — E2E tests require pushed images.
   build-images:
     name: Build & Push Images
     needs: [backend-unit, frontend-unit]
@@ -125,7 +125,7 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Log in to GHCR
-        if: github.event_name != 'pull_request'
+        if: ${{ !github.event.pull_request.head.repo.fork }}
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
@@ -168,7 +168,7 @@ jobs:
         run: docker save integr8scode-base:latest | zstd -T0 -3 > /tmp/base-image.tar.zst
 
       - name: Push base to GHCR
-        if: github.event_name != 'pull_request'
+        if: ${{ !github.event.pull_request.head.repo.fork }}
         run: |
           docker tag integr8scode-base:latest \
             ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }}
@@ -187,7 +187,7 @@ jobs:
           docker build -t integr8scode-dlq-processor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.dlq_processor ./backend
 
       - name: Push backend and workers to GHCR
-        if: github.event_name != 'pull_request'
+        if: ${{ !github.event.pull_request.head.repo.fork }}
         env:
           TAG: ${{ steps.tags.outputs.sha-tag }}
           IMG: ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}
@@ -222,7 +222,7 @@ jobs:
           cache-to: type=gha,mode=max,scope=cert-generator
 
       - name: Push cert-generator to GHCR
-        if: github.event_name != 'pull_request'
+        if: ${{ !github.event.pull_request.head.repo.fork }}
         run: |
           docker tag integr8scode-cert-generator:latest \
             ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ steps.tags.outputs.sha-tag }}
@@ -239,7 +239,7 @@ jobs:
           cache-to: type=gha,mode=max,scope=zookeeper-certgen
 
       - name: Push zookeeper-certgen to GHCR
-        if: github.event_name != 'pull_request'
+        if: ${{ !github.event.pull_request.head.repo.fork }}
         run: |
           docker tag integr8scode-zookeeper-certgen:latest \
             ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }}
@@ -257,7 +257,7 @@ jobs:
           cache-to: type=gha,mode=max,scope=frontend
 
       - name: Push frontend-dev to GHCR
-        if: github.event_name != 'pull_request'
+        if: ${{ !github.event.pull_request.head.repo.fork }}
         run: |
           docker tag integr8scode-frontend:latest \
             ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }}
@@ -274,42 +274,17 @@ jobs:
           cache-to: type=gha,mode=max,scope=frontend-prod
 
       - name: Push frontend-prod to GHCR
-        if: github.event_name != 'pull_request'
+        if: ${{ !github.event.pull_request.head.repo.fork }}
         run: |
           docker tag integr8scode-frontend-prod:latest \
             ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }}
           docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }}
 
-      # ── Save images for PR builds (artifact fallback) ────────────────
-      - name: Save all images
-        if: github.event_name == 'pull_request'
-        run: |
-          docker save \
-            integr8scode-backend:latest \
-            integr8scode-coordinator:latest \
-            integr8scode-k8s-worker:latest \
-            integr8scode-pod-monitor:latest \
-            integr8scode-result-processor:latest \
-            integr8scode-saga-orchestrator:latest \
-            integr8scode-event-replay:latest \
-            integr8scode-dlq-processor:latest \
-            integr8scode-cert-generator:latest \
-            integr8scode-zookeeper-certgen:latest \
-            integr8scode-frontend:latest \
-            | zstd -T0 -3 > /tmp/all-images.tar.zst
-
-      - name: Upload images artifact
-        if: github.event_name == 'pull_request'
-        uses: actions/upload-artifact@v6
-        with:
-          name: docker-images
-          path: /tmp/all-images.tar.zst
-          retention-days: 1
-
-  # Parallel E2E test jobs
+  # Parallel E2E test jobs — compose pulls from GHCR using IMAGE_TAG
   backend-e2e:
     name: Backend E2E Tests
     needs: [build-images]
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v6
@@ -319,60 +294,6 @@ jobs:
         with:
           images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
-      # Push events: pull pre-built images from GHCR
-      - name: Log in to GHCR
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Pull images from GHCR
-        if: github.event_name != 'pull_request'
-        env:
-          TAG: ${{ needs.build-images.outputs.sha-tag }}
-          IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }}
-        run: |
-          docker pull "$IMG/base:$TAG" &
-          docker pull "$IMG/backend:$TAG" &
-          docker pull "$IMG/frontend-dev:$TAG" &
-          docker pull "$IMG/coordinator:$TAG" &
-          docker pull "$IMG/k8s-worker:$TAG" &
-          docker pull "$IMG/pod-monitor:$TAG" &
-          docker pull "$IMG/result-processor:$TAG" &
-          docker pull "$IMG/saga-orchestrator:$TAG" &
-          docker pull "$IMG/event-replay:$TAG" &
-          docker pull "$IMG/dlq-processor:$TAG" &
-          docker pull "$IMG/cert-generator:$TAG" &
-          docker pull "$IMG/zookeeper-certgen:$TAG" &
-          wait
-
-          docker tag "$IMG/base:$TAG" integr8scode-base:latest
-          docker tag "$IMG/backend:$TAG" integr8scode-backend:latest
-          docker tag "$IMG/frontend-dev:$TAG" integr8scode-frontend:latest
-          docker tag "$IMG/coordinator:$TAG" integr8scode-coordinator:latest
-          docker tag "$IMG/k8s-worker:$TAG" integr8scode-k8s-worker:latest
-          docker tag "$IMG/pod-monitor:$TAG" integr8scode-pod-monitor:latest
-          docker tag "$IMG/result-processor:$TAG" integr8scode-result-processor:latest
-          docker tag "$IMG/saga-orchestrator:$TAG" integr8scode-saga-orchestrator:latest
-          docker tag "$IMG/event-replay:$TAG" integr8scode-event-replay:latest
-          docker tag "$IMG/dlq-processor:$TAG" integr8scode-dlq-processor:latest
-          docker tag "$IMG/cert-generator:$TAG" integr8scode-cert-generator:latest
-          docker tag "$IMG/zookeeper-certgen:$TAG" integr8scode-zookeeper-certgen:latest
-
-      # PR events: load from artifact
-      - name: Download built images
-        if: github.event_name == 'pull_request'
-        uses: actions/download-artifact@v7
-        with:
-          name: docker-images
-          path: /tmp
-
-      - name: Load built images
-        if: github.event_name == 'pull_request'
-        run: zstd -d -c /tmp/all-images.tar.zst | docker load
-
       - name: Setup k3s
         uses: ./.github/actions/k3s-setup
 
@@ -382,7 +303,9 @@ jobs:
           cp backend/secrets.example.toml backend/secrets.toml
 
       - name: Start stack
-        run: ./deploy.sh dev --wait
+        env:
+          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+        run: ./deploy.sh dev --no-build --wait
 
       - name: Seed test users
         run: docker compose exec -T backend uv run python scripts/seed_users.py
@@ -441,6 +364,7 @@ jobs:
   frontend-e2e:
     name: Frontend E2E Tests
     needs: [build-images]
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v6
@@ -477,60 +401,6 @@ jobs:
         with:
           images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
-      # Push events: pull pre-built images from GHCR
-      - name: Log in to GHCR
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Pull images from GHCR
-        if: github.event_name != 'pull_request'
-        env:
-          TAG: ${{ needs.build-images.outputs.sha-tag }}
-          IMG: ${{ env.REGISTRY }}/${{ needs.build-images.outputs.image-prefix }}
-        run: |
-          docker pull "$IMG/base:$TAG" &
-          docker pull "$IMG/backend:$TAG" &
-          docker pull "$IMG/frontend-dev:$TAG" &
-          docker pull "$IMG/coordinator:$TAG" &
-          docker pull "$IMG/k8s-worker:$TAG" &
-          docker pull "$IMG/pod-monitor:$TAG" &
-          docker pull "$IMG/result-processor:$TAG" &
-          docker pull "$IMG/saga-orchestrator:$TAG" &
-          docker pull "$IMG/event-replay:$TAG" &
-          docker pull "$IMG/dlq-processor:$TAG" &
-          docker pull "$IMG/cert-generator:$TAG" &
-          docker pull "$IMG/zookeeper-certgen:$TAG" &
-          wait
-
-          docker tag "$IMG/base:$TAG" integr8scode-base:latest
-          docker tag "$IMG/backend:$TAG" integr8scode-backend:latest
-          docker tag "$IMG/frontend-dev:$TAG" integr8scode-frontend:latest
-          docker tag "$IMG/coordinator:$TAG" integr8scode-coordinator:latest
-          docker tag "$IMG/k8s-worker:$TAG" integr8scode-k8s-worker:latest
-          docker tag "$IMG/pod-monitor:$TAG" integr8scode-pod-monitor:latest
-          docker tag "$IMG/result-processor:$TAG" integr8scode-result-processor:latest
-          docker tag "$IMG/saga-orchestrator:$TAG" integr8scode-saga-orchestrator:latest
-          docker tag "$IMG/event-replay:$TAG" integr8scode-event-replay:latest
-          docker tag "$IMG/dlq-processor:$TAG" integr8scode-dlq-processor:latest
-          docker tag "$IMG/cert-generator:$TAG" integr8scode-cert-generator:latest
-          docker tag "$IMG/zookeeper-certgen:$TAG" integr8scode-zookeeper-certgen:latest
-
-      # PR events: load from artifact
-      - name: Download built images
-        if: github.event_name == 'pull_request'
-        uses: actions/download-artifact@v7
-        with:
-          name: docker-images
-          path: /tmp
-
-      - name: Load built images
-        if: github.event_name == 'pull_request'
-        run: zstd -d -c /tmp/all-images.tar.zst | docker load
-
       - name: Setup k3s
         uses: ./.github/actions/k3s-setup
 
@@ -540,7 +410,9 @@ jobs:
           cp backend/secrets.example.toml backend/secrets.toml
 
       - name: Start stack
-        run: ./deploy.sh dev --wait
+        env:
+          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+        run: ./deploy.sh dev --no-build --wait
 
       - name: Seed test users
         run: docker compose exec -T backend uv run python scripts/seed_users.py
diff --git a/deploy.sh b/deploy.sh
index 68f8b25a..a7dc8bec 100755
--- a/deploy.sh
+++ b/deploy.sh
@@ -56,7 +56,8 @@ show_help() {
     echo ""
     echo "Commands:"
     echo "  dev [options]      Start full stack (docker-compose)"
-    echo "                     --build             Rebuild images"
+    echo "                     --build             Rebuild images locally"
+    echo "                     --no-build          Use pre-built images only (no build fallback)"
     echo "                     --wait              Wait for services to be healthy"
     echo "                     --timeout <secs>    Health check timeout (default: 300)"
     echo "                     --observability     Include Grafana, Jaeger, etc."
@@ -99,6 +100,7 @@ cmd_dev() {
     print_header "Starting Local Development Environment"
 
     local BUILD_FLAG=""
+    local NO_BUILD_FLAG=""
     local WAIT_FLAG=""
     local WAIT_TIMEOUT="300"
     local PROFILE_FLAGS=""
@@ -109,6 +111,10 @@ cmd_dev() {
                 BUILD_FLAG="--build"
                 print_info "Rebuilding images..."
                 ;;
+            --no-build)
+                NO_BUILD_FLAG="--no-build"
+                print_info "Using pre-built images (skipping build)..."
+                ;;
             --wait)
                 WAIT_FLAG="--wait"
                 ;;
@@ -133,7 +139,7 @@ cmd_dev() {
         WAIT_TIMEOUT_FLAG="--wait-timeout $WAIT_TIMEOUT"
     fi
 
-    docker compose $PROFILE_FLAGS up -d $BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG
+    docker compose $PROFILE_FLAGS up -d $BUILD_FLAG $NO_BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG
 
     echo ""
     print_success "Development environment started!"
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 8d3cd5fd..060a955c 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,10 +1,10 @@
 services:
   # Shared base image for all Python backend services
   base:
+    image: ghcr.io/hardmax71/integr8scode/base:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: Dockerfile.base
-    image: integr8scode-base:latest
 
   shared-ca:
     image: alpine:latest
@@ -15,6 +15,7 @@ services:
       - app-network
 
   cert-generator:
+    image: ghcr.io/hardmax71/integr8scode/cert-generator:${IMAGE_TAG:-latest}
     build:
       context: ./cert-generator
       dockerfile: Dockerfile
@@ -80,6 +81,7 @@ services:
       start_period: 5s
 
   backend:
+    image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: Dockerfile
@@ -127,6 +129,7 @@ services:
       start_period: 5s
 
   frontend:
+    image: ghcr.io/hardmax71/integr8scode/frontend-dev:${IMAGE_TAG:-latest}
     container_name: frontend
     build:
       context: ./frontend
@@ -175,6 +178,7 @@ services:
   # Kafka Infrastructure for Event-Driven Design
   # Certificate generator for Zookeeper/Kafka SSL
   zookeeper-certgen:
+    image: ghcr.io/hardmax71/integr8scode/zookeeper-certgen:${IMAGE_TAG:-latest}
     build:
       context: ./backend/zookeeper
       dockerfile: Dockerfile.certgen
@@ -362,6 +366,7 @@ services:
 
   # Kafka topic initialization
   kafka-init:
+    image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: Dockerfile
@@ -388,6 +393,7 @@ services:
 
   # Seed default users (runs once after mongo is ready)
   user-seed:
+    image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: Dockerfile
@@ -412,6 +418,7 @@ services:
 
   # Event-driven workers
   coordinator:
+    image: ghcr.io/hardmax71/integr8scode/coordinator:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.coordinator
@@ -436,6 +443,7 @@ services:
     restart: unless-stopped
 
   k8s-worker:
+    image: ghcr.io/hardmax71/integr8scode/k8s-worker:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.k8s_worker
@@ -463,6 +471,7 @@ services:
     restart: unless-stopped
 
   pod-monitor:
+    image: ghcr.io/hardmax71/integr8scode/pod-monitor:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.pod_monitor
@@ -488,6 +497,7 @@ services:
     restart: unless-stopped
 
   result-processor:
+    image: ghcr.io/hardmax71/integr8scode/result-processor:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.result_processor
@@ -515,6 +525,7 @@ services:
     restart: unless-stopped
 
   saga-orchestrator:
+    image: ghcr.io/hardmax71/integr8scode/saga-orchestrator:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.saga_orchestrator
@@ -560,6 +571,7 @@ services:
 
   # Event replay service
   event-replay:
+    image: ghcr.io/hardmax71/integr8scode/event-replay:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.event_replay
@@ -586,6 +598,7 @@ services:
 
   # DLQ Processor Service
   dlq-processor:
+    image: ghcr.io/hardmax71/integr8scode/dlq-processor:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.dlq_processor

From 16152ba309c70cb5730967cfdbec6390f0d7e9f7 Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 20:21:21 +0100
Subject: [PATCH 05/17]  1. Playwright Sharding (frontend-e2e)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  - Added strategy.matrix with shardIndex: [1, 2] and shardTotal: [2]
  - fail-fast: false so one shard failing doesn't cancel the other
  - Test command: npx playwright test --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }}
  - Artifact names include shard index to avoid collisions: playwright-report-1, playwright-report-2, frontend-e2e-logs-1, etc.
  - Job name shows shard: Frontend E2E (1/2), Frontend E2E (2/2)

  2. GHCR Pre-pull (both E2E jobs)

  - Immediately after checkout, docker compose pull --quiet starts in the background via nohup
  - While GHCR images pull, the subsequent setup steps run in parallel:
    - backend-e2e: Docker cache load + k3s install (~85s of overlap)
    - frontend-e2e: Node setup + npm ci + Playwright install + Docker cache + k3s (~150s of overlap)
  - A "Wait for GHCR images" step before "Start stack" ensures pull is complete
  - "Start stack" then finds images already local — skips pulling entirely
---
 .github/workflows/stack-tests.yml | 51 ++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index 48f604fc..b535dbde 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -289,6 +289,14 @@ jobs:
     steps:
       - uses: actions/checkout@v6
 
+      - name: Pre-pull GHCR images (background)
+        env:
+          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+        run: |
+          nohup bash -c "IMAGE_TAG=$IMAGE_TAG docker compose pull --quiet" \
+            > /tmp/ghcr-pull.log 2>&1 &
+          echo $! > /tmp/ghcr-pull.pid
+
       - name: Cache and load Docker images
         uses: ./.github/actions/docker-cache
         with:
@@ -302,6 +310,17 @@ jobs:
           cp backend/config.test.toml backend/config.toml
           cp backend/secrets.example.toml backend/secrets.toml
 
+      - name: Wait for GHCR images
+        run: |
+          if [ -f /tmp/ghcr-pull.pid ]; then
+            PID=$(cat /tmp/ghcr-pull.pid)
+            if kill -0 "$PID" 2>/dev/null; then
+              echo "Waiting for GHCR image pull to complete..."
+              tail --pid="$PID" -f /dev/null 2>/dev/null || true
+            fi
+            cat /tmp/ghcr-pull.log 2>/dev/null || true
+          fi
+
       - name: Start stack
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
@@ -362,13 +381,26 @@ jobs:
           path: logs/
 
   frontend-e2e:
-    name: Frontend E2E Tests
+    name: Frontend E2E (${{ matrix.shardIndex }}/${{ matrix.shardTotal }})
     needs: [build-images]
     if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        shardIndex: [1, 2]
+        shardTotal: [2]
     steps:
       - uses: actions/checkout@v6
 
+      - name: Pre-pull GHCR images (background)
+        env:
+          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+        run: |
+          nohup bash -c "IMAGE_TAG=$IMAGE_TAG docker compose pull --quiet" \
+            > /tmp/ghcr-pull.log 2>&1 &
+          echo $! > /tmp/ghcr-pull.pid
+
       - name: Setup Node.js
         uses: actions/setup-node@v6
         with:
@@ -409,6 +441,17 @@ jobs:
           cp backend/config.test.toml backend/config.toml
           cp backend/secrets.example.toml backend/secrets.toml
 
+      - name: Wait for GHCR images
+        run: |
+          if [ -f /tmp/ghcr-pull.pid ]; then
+            PID=$(cat /tmp/ghcr-pull.pid)
+            if kill -0 "$PID" 2>/dev/null; then
+              echo "Waiting for GHCR image pull to complete..."
+              tail --pid="$PID" -f /dev/null 2>/dev/null || true
+            fi
+            cat /tmp/ghcr-pull.log 2>/dev/null || true
+          fi
+
       - name: Start stack
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
@@ -420,13 +463,13 @@ jobs:
       - name: Run Playwright tests
         timeout-minutes: 10
         working-directory: frontend
-        run: CI=true npx playwright test
+        run: CI=true npx playwright test --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }}
 
       - name: Upload Playwright report
         uses: actions/upload-artifact@v6
         if: always()
         with:
-          name: playwright-report
+          name: playwright-report-${{ matrix.shardIndex }}
           path: frontend/playwright-report/
 
       - name: Collect logs on failure
@@ -441,5 +484,5 @@ jobs:
         if: failure()
         uses: actions/upload-artifact@v6
         with:
-          name: frontend-e2e-logs
+          name: frontend-e2e-logs-${{ matrix.shardIndex }}
           path: logs/

From 7dd2244093cef6a39b531091cbdc3d6921b5fb38 Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 20:45:23 +0100
Subject: [PATCH 06/17] =?UTF-8?q?Here's=20what=20this=20adds=20=E2=80=94?=
 =?UTF-8?q?=20infrastructure=20pre-warming:?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  How it works

  Both E2E jobs now follow this timeline (step numbers are those of frontend-e2e; backend-e2e has no Node/Playwright steps):

  Step 2: Pre-pull GHCR images ──────────────────────────────── (background)
  Step 3-7: Node/Playwright/Docker cache setup ──────────────── (foreground, ~50s)
  Step 8: Docker-cache loads infra images ───────────────────── (~15s)
  Step 9: Pre-warm infrastructure ───────────────────────────── (background, starts immediately)
           ├── mongo + redis start (~5s to healthy)
           ├── shared-ca + cert-gen + zk-certgen start (~5s)
           ├── zookeeper starts after zk-certgen (~15s)
           ├── kafka starts after zookeeper healthy (~20s)
           └── schema-registry starts after kafka (~10s)
  Step 10: k3s install ──────────────────────────────────────── (~42s, OVERLAPS with infra chain)
  Step 12: Wait for background tasks ────────────────────────── (both should be done)
  Step 13: Start stack ──────────────────────────────────────── (infra already healthy, only app services)

  Expected impact on "Start stack"

  | Component                       | Before                    | After                        |
  |---------------------------------|---------------------------|------------------------------|
  | Infra initialization (zk chain) | ~50s (during Start stack) | 0s (already done during k3s) |
  | App image pull                  | ~60s                      | 0s (pre-pulled)              |
  | App service startup             | ~30s                      | ~30s                         |
  | Health check waits              | ~20s                      | ~20s                         |
  | Total "Start stack"             | ~2:20                     | ~0:50                        |
---
 .github/workflows/stack-tests.yml | 59 ++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index b535dbde..8394b17e 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -302,6 +302,15 @@ jobs:
         with:
           images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
+      - name: Pre-warm infrastructure (background)
+        env:
+          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+        run: |
+          nohup docker compose up -d --no-build \
+            mongo redis shared-ca cert-generator zookeeper-certgen zookeeper kafka schema-registry \
+            > /tmp/infra-warm.log 2>&1 &
+          echo $! > /tmp/infra-warm.pid
+
       - name: Setup k3s
         uses: ./.github/actions/k3s-setup
 
@@ -310,16 +319,19 @@ jobs:
           cp backend/config.test.toml backend/config.toml
           cp backend/secrets.example.toml backend/secrets.toml
 
-      - name: Wait for GHCR images
+      - name: Wait for background tasks
         run: |
-          if [ -f /tmp/ghcr-pull.pid ]; then
-            PID=$(cat /tmp/ghcr-pull.pid)
-            if kill -0 "$PID" 2>/dev/null; then
-              echo "Waiting for GHCR image pull to complete..."
-              tail --pid="$PID" -f /dev/null 2>/dev/null || true
+          for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do
+            if [ -f "$pidfile" ]; then
+              PID=$(cat "$pidfile")
+              if kill -0 "$PID" 2>/dev/null; then
+                echo "Waiting for $(basename $pidfile .pid)..."
+                tail --pid="$PID" -f /dev/null 2>/dev/null || true
+              fi
             fi
-            cat /tmp/ghcr-pull.log 2>/dev/null || true
-          fi
+          done
+          cat /tmp/ghcr-pull.log 2>/dev/null || true
+          cat /tmp/infra-warm.log 2>/dev/null || true
 
       - name: Start stack
         env:
@@ -433,6 +445,18 @@ jobs:
         with:
           images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
 
+      - name: Pre-warm infrastructure (background)
+        env:
+          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+        run: |
+          # Start infra services in background while k3s installs.
+          # Compose handles dependency ordering (zookeeper-certgen → zookeeper → kafka).
+          # GHCR images for cert-generator/zookeeper-certgen should be pre-pulled by now.
+          nohup docker compose up -d --no-build \
+            mongo redis shared-ca cert-generator zookeeper-certgen zookeeper kafka schema-registry \
+            > /tmp/infra-warm.log 2>&1 &
+          echo $! > /tmp/infra-warm.pid
+
       - name: Setup k3s
         uses: ./.github/actions/k3s-setup
 
@@ -441,16 +465,19 @@ jobs:
           cp backend/config.test.toml backend/config.toml
           cp backend/secrets.example.toml backend/secrets.toml
 
-      - name: Wait for GHCR images
+      - name: Wait for background tasks
         run: |
-          if [ -f /tmp/ghcr-pull.pid ]; then
-            PID=$(cat /tmp/ghcr-pull.pid)
-            if kill -0 "$PID" 2>/dev/null; then
-              echo "Waiting for GHCR image pull to complete..."
-              tail --pid="$PID" -f /dev/null 2>/dev/null || true
+          for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do
+            if [ -f "$pidfile" ]; then
+              PID=$(cat "$pidfile")
+              if kill -0 "$PID" 2>/dev/null; then
+                echo "Waiting for $(basename $pidfile .pid)..."
+                tail --pid="$PID" -f /dev/null 2>/dev/null || true
+              fi
             fi
-            cat /tmp/ghcr-pull.log 2>/dev/null || true
-          fi
+          done
+          cat /tmp/ghcr-pull.log 2>/dev/null || true
+          cat /tmp/infra-warm.log 2>/dev/null || true
 
       - name: Start stack
         env:

From 69ed72a1cbb8ab1fcfa3820ff143db39c40ad565 Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 20:55:48 +0100
Subject: [PATCH 07/17] The root cause: cert-generator service in
 docker-compose.yaml mounts ~/.kube:/root/.kube. When Docker creates that bind
 mount source directory, it creates it as root:root. Then k3s-setup's sudo k3s
 kubectl config view --raw > /home/runner/.kube/config fails because the
 shell redirect (>) runs as the runner user who can't write to the root-owned
 directory.

---
 .github/workflows/stack-tests.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index 8394b17e..924316cc 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -306,8 +306,10 @@ jobs:
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
         run: |
+          # Start infra services in background while k3s installs.
+          # cert-generator is excluded: it needs k3s and mounts ~/.kube (conflicts with k3s-setup).
           nohup docker compose up -d --no-build \
-            mongo redis shared-ca cert-generator zookeeper-certgen zookeeper kafka schema-registry \
+            mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \
             > /tmp/infra-warm.log 2>&1 &
           echo $! > /tmp/infra-warm.pid
 
@@ -451,9 +453,9 @@ jobs:
         run: |
           # Start infra services in background while k3s installs.
           # Compose handles dependency ordering (zookeeper-certgen → zookeeper → kafka).
-          # GHCR images for cert-generator/zookeeper-certgen should be pre-pulled by now.
+          # cert-generator is excluded: it needs k3s and mounts ~/.kube (conflicts with k3s-setup).
           nohup docker compose up -d --no-build \
-            mongo redis shared-ca cert-generator zookeeper-certgen zookeeper kafka schema-registry \
+            mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \
             > /tmp/infra-warm.log 2>&1 &
           echo $! > /tmp/infra-warm.pid
 

From 815d6c605deec62dd5ea6094438eecae609ffe4b Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 21:24:09 +0100
Subject: [PATCH 08/17] Backend E2E step reorder (stack-tests.yml):
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  | Before                                  | After                                  |
  |-----------------------------------------|----------------------------------------|
  | 1. checkout                             | 1. checkout                            |
  | 2. GHCR pre-pull (bg)                   | 2. GHCR pre-pull (bg)                  |
  | 3. docker-cache                         | 3. config copy (moved up)              |
  | 4. infra pre-warm (bg)                  | 4. Install k3s (split from composite)  |
  | 5. k3s-setup (composite, ~45s blocking) | 5. docker-cache (runs during k3s boot) |
  | 6. config copy                          | 6. infra pre-warm (bg)                 |
  | 7. wait for bg                          | 7. Finalize k3s (~25s+ after install)  |
  | 8. start stack                          | 8. wait for bg                         |
  |                                         | 9. start stack                         |

  Key gain: k3s boot (30s) now overlaps with docker-cache (10-18s) instead of blocking sequentially. The composite k3s-setup action is inlined as "Install k3s" + "Finalize k3s", same pattern as frontend-e2e.

  Complete optimization summary across both files:
  1. docker-compose.yaml — Tightened health check intervals (5s→2-3s) and start periods (10s→3-5s) across all 7 services
  2. frontend-e2e — Inlined k3s, overlaps boot with Node + npm ci + Playwright (~50s overlap)
  3. backend-e2e — Inlined k3s, overlaps boot with docker-cache (~15s overlap)
  4. Both YAML files validated
---
 .github/workflows/stack-tests.yml | 87 +++++++++++++++++++++----------
 docker-compose.yaml               | 42 +++++++--------
 2 files changed, 81 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index 924316cc..e6019318 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -289,6 +289,7 @@ jobs:
     steps:
       - uses: actions/checkout@v6
 
+      # ── Phase 1: Start background tasks + infra ──
       - name: Pre-pull GHCR images (background)
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
@@ -297,6 +298,16 @@ jobs:
             > /tmp/ghcr-pull.log 2>&1 &
           echo $! > /tmp/ghcr-pull.pid
 
+      - name: Use test environment config
+        run: |
+          cp backend/config.test.toml backend/config.toml
+          cp backend/secrets.example.toml backend/secrets.toml
+
+      # ── Phase 2: Install k3s, then overlap boot with docker-cache + infra ──
+      - name: Install k3s
+        run: |
+          curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh -
+
       - name: Cache and load Docker images
         uses: ./.github/actions/docker-cache
         with:
@@ -306,21 +317,27 @@ jobs:
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
         run: |
-          # Start infra services in background while k3s installs.
-          # cert-generator is excluded: it needs k3s and mounts ~/.kube (conflicts with k3s-setup).
+          # Start infra while k3s finishes booting (~25s+ since install).
+          # cert-generator excluded: needs k3s and mounts ~/.kube.
           nohup docker compose up -d --no-build \
             mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \
             > /tmp/infra-warm.log 2>&1 &
           echo $! > /tmp/infra-warm.pid
 
-      - name: Setup k3s
-        uses: ./.github/actions/k3s-setup
-
-      - name: Use test environment config
+      # ── Phase 3: Finalize k3s (should be ready — 25s+ since install) ──
+      - name: Finalize k3s
         run: |
-          cp backend/config.test.toml backend/config.toml
-          cp backend/secrets.example.toml backend/secrets.toml
-
+          mkdir -p /home/runner/.kube
+          sudo k3s kubectl config view --raw > /home/runner/.kube/config
+          sudo chmod 600 /home/runner/.kube/config
+          export KUBECONFIG=/home/runner/.kube/config
+          timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done'
+          kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
+          sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \
+            /home/runner/.kube/config > backend/kubeconfig.yaml
+          chmod 644 backend/kubeconfig.yaml
+
+      # ── Phase 4: Start remaining services (infra already healthy from pre-warm) ──
       - name: Wait for background tasks
         run: |
           for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do
@@ -407,6 +424,7 @@ jobs:
     steps:
       - uses: actions/checkout@v6
 
+      # ── Phase 1: Start background tasks + infra (runs during all subsequent steps) ──
       - name: Pre-pull GHCR images (background)
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
@@ -415,6 +433,27 @@ jobs:
             > /tmp/ghcr-pull.log 2>&1 &
           echo $! > /tmp/ghcr-pull.pid
 
+      - name: Cache and load Docker images
+        uses: ./.github/actions/docker-cache
+        with:
+          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
+
+      - name: Pre-warm infrastructure (background)
+        env:
+          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+        run: |
+          # Start infra while k3s installs + Playwright sets up (~60s of overlap).
+          # cert-generator excluded: needs k3s and mounts ~/.kube.
+          nohup docker compose up -d --no-build \
+            mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \
+            > /tmp/infra-warm.log 2>&1 &
+          echo $! > /tmp/infra-warm.pid
+
+      # ── Phase 2: k3s install + Node/Playwright setup (overlapped) ──
+      - name: Install k3s
+        run: |
+          curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh -
+
       - name: Setup Node.js
         uses: actions/setup-node@v6
         with:
@@ -442,31 +481,25 @@ jobs:
         working-directory: frontend
         run: npx playwright install chromium
 
-      - name: Cache and load Docker images
-        uses: ./.github/actions/docker-cache
-        with:
-          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
-
-      - name: Pre-warm infrastructure (background)
-        env:
-          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+      # ── Phase 3: Finalize k3s (should be ready — 50s+ since install) ──
+      - name: Finalize k3s
         run: |
-          # Start infra services in background while k3s installs.
-          # Compose handles dependency ordering (zookeeper-certgen → zookeeper → kafka).
-          # cert-generator is excluded: it needs k3s and mounts ~/.kube (conflicts with k3s-setup).
-          nohup docker compose up -d --no-build \
-            mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \
-            > /tmp/infra-warm.log 2>&1 &
-          echo $! > /tmp/infra-warm.pid
-
-      - name: Setup k3s
-        uses: ./.github/actions/k3s-setup
+          mkdir -p /home/runner/.kube
+          sudo k3s kubectl config view --raw > /home/runner/.kube/config
+          sudo chmod 600 /home/runner/.kube/config
+          export KUBECONFIG=/home/runner/.kube/config
+          timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done'
+          kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
+          sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \
+            /home/runner/.kube/config > backend/kubeconfig.yaml
+          chmod 644 backend/kubeconfig.yaml
 
       - name: Use test environment config
         run: |
           cp backend/config.test.toml backend/config.toml
           cp backend/secrets.example.toml backend/secrets.toml
 
+      # ── Phase 4: Start remaining services (infra already healthy from pre-warm) ──
       - name: Wait for background tasks
         run: |
           for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 060a955c..9f9fdd27 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -58,10 +58,10 @@ services:
         hard: 65536
     healthcheck:
       test: echo 'db.runCommand("ping").ok' | mongosh localhost/integr8scode -u ${MONGO_ROOT_USER:-root} -p ${MONGO_ROOT_PASSWORD:-rootpassword} --authenticationDatabase admin --quiet
-      interval: 5s
+      interval: 3s
       timeout: 5s
-      retries: 10
-      start_period: 10s
+      retries: 15
+      start_period: 5s
 
   redis:
     image: redis:7-alpine
@@ -75,10 +75,10 @@ services:
       - app-network
     healthcheck:
       test: ["CMD", "redis-cli", "ping"]
-      interval: 5s
-      timeout: 5s
-      retries: 5
-      start_period: 5s
+      interval: 2s
+      timeout: 3s
+      retries: 10
+      start_period: 2s
 
   backend:
     image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest}
@@ -123,10 +123,10 @@ services:
       - "host.docker.internal:host-gateway"
     healthcheck:
       test: ["CMD-SHELL", "curl -k -f -s https://localhost:443/api/v1/health/live >/dev/null || exit 1"]
-      interval: 3s
+      interval: 2s
       timeout: 3s
-      retries: 50
-      start_period: 5s
+      retries: 30
+      start_period: 3s
 
   frontend:
     image: ghcr.io/hardmax71/integr8scode/frontend-dev:${IMAGE_TAG:-latest}
@@ -153,10 +153,10 @@ services:
       - NODE_EXTRA_CA_CERTS=/shared_ca/mkcert-ca.pem
     healthcheck:
       test: ["CMD-SHELL", "curl -k -f -s https://localhost:5001 >/dev/null || exit 1"]
-      interval: 3s
+      interval: 2s
       timeout: 3s
       retries: 30
-      start_period: 5s
+      start_period: 3s
 
 
   grafana:
@@ -261,10 +261,10 @@ services:
         hard: 65536
     healthcheck:
       test: ["CMD-SHELL", "echo ruok | nc localhost 2181 | grep imok"]
-      interval: 5s
+      interval: 3s
       timeout: 5s
-      retries: 10
-      start_period: 10s
+      retries: 15
+      start_period: 5s
 
   kafka:
     image: confluentinc/cp-kafka:7.8.2
@@ -322,10 +322,10 @@ services:
         hard: 65536
     healthcheck:
       test: ["CMD-SHELL", "kafka-broker-api-versions --bootstrap-server localhost:9092"]
-      interval: 5s
+      interval: 3s
       timeout: 10s
-      retries: 12
-      start_period: 5s
+      retries: 15
+      start_period: 3s
 
   schema-registry:
     image: confluentinc/cp-schema-registry:7.8.2
@@ -343,10 +343,10 @@ services:
       - app-network
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8081/config"]
-      interval: 5s
+      interval: 3s
       timeout: 5s
-      retries: 10
-      start_period: 10s
+      retries: 15
+      start_period: 5s
 
   kafdrop:
     image: obsidiandynamics/kafdrop:3.31.0

From 073a3d7aef0d2f88da42ad7e6aa0a9a737ed9331 Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 21:33:17 +0100
Subject: [PATCH 09/17] =?UTF-8?q?=20Before=20(19=20steps,=206=20sequential?=
 =?UTF-8?q?=20push=20steps=20=3D=20~81s=20pushing):=20=20=20Build=20base?=
 =?UTF-8?q?=20=E2=86=92=20Push=20base=20(13s)=20=E2=86=92=20Build=208=20wo?=
 =?UTF-8?q?rkers=20=E2=86=92=20Push=208=20workers=20(35s=20sequential)=20?=
 =?UTF-8?q?=20=20=E2=86=92=20Build=20cert-gen=20=E2=86=92=20Push=20cert-ge?=
 =?UTF-8?q?n=20(7s)=20=E2=86=92=20Build=20zk-certgen=20=E2=86=92=20Push=20?=
 =?UTF-8?q?zk-certgen=20(8s)=20=20=20=E2=86=92=20Build=20frontend=20?=
 =?UTF-8?q?=E2=86=92=20Push=20frontend-dev=20(12s)=20=E2=86=92=20Build=20f?=
 =?UTF-8?q?rontend-prod=20=E2=86=92=20Push=20frontend-prod=20(6s)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  After (14 steps, 1 parallel push step):
  Build base → Build 8 workers → Build cert-gen → Build zk-certgen
  → Build frontend → Build frontend-prod → Push all 13 in parallel (~15-20s)

  Expected savings: ~60s (81s sequential → ~20s parallel). Job should drop from 2m 48s → ~1m 50s.

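  The builds are all done first (same total time), then all 13 pushes fire concurrently. The backend and worker images are built on the same base image, so they share base layers and Docker deduplicates — the first push uploads shared layers and the rest skip them. The utility images (cert-generator, zookeeper-certgen) are independent of base and upload their own layers.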
---
 .github/workflows/stack-tests.yml | 86 +++++++++++--------------------
 1 file changed, 29 insertions(+), 57 deletions(-)

diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index e6019318..ce43f6c4 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -167,15 +167,8 @@ jobs:
         if: steps.base-cache.outputs.cache-hit != 'true'
         run: docker save integr8scode-base:latest | zstd -T0 -3 > /tmp/base-image.tar.zst
 
-      - name: Push base to GHCR
-        if: ${{ !github.event.pull_request.head.repo.fork }}
-        run: |
-          docker tag integr8scode-base:latest \
-            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }}
-          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/base:${{ steps.tags.outputs.sha-tag }}
-
       # ── Backend + workers (depend on local base image) ───────────────
-      - name: Build all images
+      - name: Build backend and worker images
         run: |
           docker build -t integr8scode-backend:latest --build-context base=docker-image://integr8scode-base:latest -f ./backend/Dockerfile ./backend
           docker build -t integr8scode-coordinator:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.coordinator ./backend
@@ -186,30 +179,6 @@ jobs:
           docker build -t integr8scode-event-replay:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.event_replay ./backend
           docker build -t integr8scode-dlq-processor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.dlq_processor ./backend
 
-      - name: Push backend and workers to GHCR
-        if: ${{ !github.event.pull_request.head.repo.fork }}
-        env:
-          TAG: ${{ steps.tags.outputs.sha-tag }}
-          IMG: ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}
-        run: |
-          docker tag integr8scode-backend:latest "$IMG/backend:$TAG"
-          docker tag integr8scode-coordinator:latest "$IMG/coordinator:$TAG"
-          docker tag integr8scode-k8s-worker:latest "$IMG/k8s-worker:$TAG"
-          docker tag integr8scode-pod-monitor:latest "$IMG/pod-monitor:$TAG"
-          docker tag integr8scode-result-processor:latest "$IMG/result-processor:$TAG"
-          docker tag integr8scode-saga-orchestrator:latest "$IMG/saga-orchestrator:$TAG"
-          docker tag integr8scode-event-replay:latest "$IMG/event-replay:$TAG"
-          docker tag integr8scode-dlq-processor:latest "$IMG/dlq-processor:$TAG"
-
-          docker push "$IMG/backend:$TAG"
-          docker push "$IMG/coordinator:$TAG"
-          docker push "$IMG/k8s-worker:$TAG"
-          docker push "$IMG/pod-monitor:$TAG"
-          docker push "$IMG/result-processor:$TAG"
-          docker push "$IMG/saga-orchestrator:$TAG"
-          docker push "$IMG/event-replay:$TAG"
-          docker push "$IMG/dlq-processor:$TAG"
-
       # ── Utility images (GHA-cached, independent of base) ────────────
       - name: Build cert-generator image
         uses: docker/build-push-action@v6
@@ -221,13 +190,6 @@ jobs:
           cache-from: type=gha,scope=cert-generator
           cache-to: type=gha,mode=max,scope=cert-generator
 
-      - name: Push cert-generator to GHCR
-        if: ${{ !github.event.pull_request.head.repo.fork }}
-        run: |
-          docker tag integr8scode-cert-generator:latest \
-            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ steps.tags.outputs.sha-tag }}
-          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/cert-generator:${{ steps.tags.outputs.sha-tag }}
-
       - name: Build zookeeper-certgen image
         uses: docker/build-push-action@v6
         with:
@@ -238,13 +200,6 @@ jobs:
           cache-from: type=gha,scope=zookeeper-certgen
           cache-to: type=gha,mode=max,scope=zookeeper-certgen
 
-      - name: Push zookeeper-certgen to GHCR
-        if: ${{ !github.event.pull_request.head.repo.fork }}
-        run: |
-          docker tag integr8scode-zookeeper-certgen:latest \
-            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }}
-          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/zookeeper-certgen:${{ steps.tags.outputs.sha-tag }}
-
       # ── Frontend (dev for E2E, prod for scanning/deployment) ─────────
       - name: Build frontend image
         uses: docker/build-push-action@v6
@@ -256,13 +211,6 @@ jobs:
           cache-from: type=gha,scope=frontend
           cache-to: type=gha,mode=max,scope=frontend
 
-      - name: Push frontend-dev to GHCR
-        if: ${{ !github.event.pull_request.head.repo.fork }}
-        run: |
-          docker tag integr8scode-frontend:latest \
-            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }}
-          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend-dev:${{ steps.tags.outputs.sha-tag }}
-
       - name: Build frontend-prod image
         uses: docker/build-push-action@v6
         with:
@@ -273,12 +221,36 @@ jobs:
           cache-from: type=gha,scope=frontend-prod
           cache-to: type=gha,mode=max,scope=frontend-prod
 
-      - name: Push frontend-prod to GHCR
+      # ── Push all images to GHCR in parallel ────────────────────────
+      - name: Push all images to GHCR
         if: ${{ !github.event.pull_request.head.repo.fork }}
+        env:
+          TAG: ${{ steps.tags.outputs.sha-tag }}
+          IMG: ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}
         run: |
-          docker tag integr8scode-frontend-prod:latest \
-            ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }}
-          docker push ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}/frontend:${{ steps.tags.outputs.sha-tag }}
+          # Tag all images for GHCR
+          docker tag integr8scode-base:latest "$IMG/base:$TAG"
+          docker tag integr8scode-backend:latest "$IMG/backend:$TAG"
+          docker tag integr8scode-coordinator:latest "$IMG/coordinator:$TAG"
+          docker tag integr8scode-k8s-worker:latest "$IMG/k8s-worker:$TAG"
+          docker tag integr8scode-pod-monitor:latest "$IMG/pod-monitor:$TAG"
+          docker tag integr8scode-result-processor:latest "$IMG/result-processor:$TAG"
+          docker tag integr8scode-saga-orchestrator:latest "$IMG/saga-orchestrator:$TAG"
+          docker tag integr8scode-event-replay:latest "$IMG/event-replay:$TAG"
+          docker tag integr8scode-dlq-processor:latest "$IMG/dlq-processor:$TAG"
+          docker tag integr8scode-cert-generator:latest "$IMG/cert-generator:$TAG"
+          docker tag integr8scode-zookeeper-certgen:latest "$IMG/zookeeper-certgen:$TAG"
+          docker tag integr8scode-frontend:latest "$IMG/frontend-dev:$TAG"
+          docker tag integr8scode-frontend-prod:latest "$IMG/frontend:$TAG"
+
+          # Push all 13 images in parallel
+          for name in base backend coordinator k8s-worker pod-monitor \
+                      result-processor saga-orchestrator event-replay \
+                      dlq-processor cert-generator zookeeper-certgen \
+                      frontend-dev frontend; do
+            docker push "$IMG/$name:$TAG" &
+          done
+          wait
 
   # Parallel E2E test jobs — compose pulls from GHCR using IMAGE_TAG
   backend-e2e:

From 9f164c7d4184f5321b4e6eb752f7dd0a718ba0cf Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 21:55:54 +0100
Subject: [PATCH 10/17] Two changes made:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  1. Parallel GHCR pushes (build-images job):
  - Merged 6 separate push steps into 1 step that pushes all 13 images in parallel via `for ...; do docker push ... & done; wait`
  - Expected: ~81s sequential → ~15-20s parallel (saves ~60s)

  2. Targeted health checks (both E2E jobs):
  - Replaced `./deploy.sh dev --no-build --wait` (waits for ALL 15+ containers) with:
    - `docker compose up -d --no-build` (returns immediately, ~3s)
    - a curl loop that only waits for backend (backend-e2e) or backend + frontend (frontend-e2e)
  - Workers start in background and become ready while tests run their initial setup
  - Expected: "Start stack" drops from ~2:01 to ~5s + "Wait for health" ~40-60s = ~45-65s total (saves ~60s)
---
 .github/workflows/stack-tests.yml | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index ce43f6c4..5fa93822 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -327,7 +327,13 @@ jobs:
       - name: Start stack
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: ./deploy.sh dev --no-build --wait
+        run: docker compose up -d --no-build
+
+      - name: Wait for backend
+        run: |
+          echo "Waiting for backend health..."
+          timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done'
+          echo "Backend ready"
 
       - name: Seed test users
         run: docker compose exec -T backend uv run python scripts/seed_users.py
@@ -489,7 +495,16 @@ jobs:
       - name: Start stack
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: ./deploy.sh dev --no-build --wait
+        run: docker compose up -d --no-build
+
+      - name: Wait for backend and frontend
+        run: |
+          echo "Waiting for backend health..."
+          timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done'
+          echo "Backend ready"
+          echo "Waiting for frontend health..."
+          timeout 60 bash -c 'until curl -ksf https://localhost:5001 2>/dev/null; do sleep 2; done'
+          echo "Frontend ready"
 
       - name: Seed test users
         run: docker compose exec -T backend uv run python scripts/seed_users.py

From 89871ae7ccb69a89ae0c439a504704ee707fdb09 Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 22:19:16 +0100
Subject: [PATCH 11/17] =?UTF-8?q?=20Root=20cause=20analysis:=20docker=20co?=
 =?UTF-8?q?mpose=20up=20-d=20--no-build=20(even=20without=20--wait)=20take?=
 =?UTF-8?q?s=201:23=20because=20depends=5Fon:=20condition:=20service=5Fhea?=
 =?UTF-8?q?lthy=20in=20docker-compose.yaml=20forces=20compose=20to=20wait?=
 =?UTF-8?q?=20for=20the=20entire=20dependency=20chain=20before=20=20=20cre?=
 =?UTF-8?q?ating=20dependent=20containers.=20Removing=20--wait=20only=20sk?=
 =?UTF-8?q?ipped=20the=20final=20"all=20healthy"=20check=20=E2=80=94=20the?=
 =?UTF-8?q?=20internal=20chain=20is=20the=20real=20bottleneck.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  Changes made (3 optimizations):

  1. Removed docker-cache step (saves ~1:08 blocking time)

  The docker-cache composite action was loading 5 infra images from GHA cache in ~68s of blocking foreground time. But docker compose pull (pre-pull) already fetches ALL images in background. Removed the redundant step.

  2. Merged pre-pull + pre-warm into single sequential background task

  Instead of: pre-pull (bg) → docker-cache (blocking 1:08) → pre-warm (bg)
  Now: `docker compose pull` followed by `docker compose up -d ...` for the infra services, both run sequentially in one background process. Infra starts pulling + booting immediately after checkout, overlapping with all subsequent setup steps.

  3. Pre-start cert-generator after k3s finalize

  cert-generator is on the critical path: cert-gen(complete) → backend(healthy) → frontend. Starting it right after kubeconfig exists gives it a ~15-20s head start while we wait for pre-pull to finish.
---
 .github/workflows/stack-tests.yml | 131 ++++++++++++++----------------
 1 file changed, 61 insertions(+), 70 deletions(-)

diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index 5fa93822..b07d51a7 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -261,42 +261,31 @@ jobs:
     steps:
       - uses: actions/checkout@v6
 
-      # ── Phase 1: Start background tasks + infra ──
-      - name: Pre-pull GHCR images (background)
+      # ── Phase 1: Pull images + start infra in background (overlap with k3s) ──
+      - name: Pull images and pre-warm infra (background)
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
         run: |
-          nohup bash -c "IMAGE_TAG=$IMAGE_TAG docker compose pull --quiet" \
-            > /tmp/ghcr-pull.log 2>&1 &
-          echo $! > /tmp/ghcr-pull.pid
+          # Pull all images (GHCR + Docker Hub) then start infra services.
+          # This runs throughout k3s install/finalize (~20s of overlap).
+          nohup bash -c '
+            IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1
+            echo "--- pull done, starting infra ---"
+            docker compose up -d --no-build \
+              mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1
+          ' > /tmp/infra-pull.log 2>&1 &
+          echo $! > /tmp/infra-pull.pid
 
       - name: Use test environment config
         run: |
           cp backend/config.test.toml backend/config.toml
           cp backend/secrets.example.toml backend/secrets.toml
 
-      # ── Phase 2: Install k3s, then overlap boot with docker-cache + infra ──
+      # ── Phase 2: k3s install + finalize (overlaps with image pull + infra boot) ──
       - name: Install k3s
         run: |
           curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh -
 
-      - name: Cache and load Docker images
-        uses: ./.github/actions/docker-cache
-        with:
-          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
-
-      - name: Pre-warm infrastructure (background)
-        env:
-          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: |
-          # Start infra while k3s finishes booting (~25s+ since install).
-          # cert-generator excluded: needs k3s and mounts ~/.kube.
-          nohup docker compose up -d --no-build \
-            mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \
-            > /tmp/infra-warm.log 2>&1 &
-          echo $! > /tmp/infra-warm.pid
-
-      # ── Phase 3: Finalize k3s (should be ready — 25s+ since install) ──
       - name: Finalize k3s
         run: |
           mkdir -p /home/runner/.kube
@@ -309,20 +298,26 @@ jobs:
             /home/runner/.kube/config > backend/kubeconfig.yaml
           chmod 644 backend/kubeconfig.yaml
 
-      # ── Phase 4: Start remaining services (infra already healthy from pre-warm) ──
-      - name: Wait for background tasks
+      # Start cert-generator now that kubeconfig exists (runs during wait step)
+      - name: Start cert-generator (background)
+        env:
+          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+        run: |
+          nohup docker compose up -d --no-build cert-generator \
+            > /tmp/cert-gen.log 2>&1 &
+
+      # ── Phase 3: Wait for pulls + start stack ──
+      - name: Wait for image pull and infra
         run: |
-          for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do
-            if [ -f "$pidfile" ]; then
-              PID=$(cat "$pidfile")
-              if kill -0 "$PID" 2>/dev/null; then
-                echo "Waiting for $(basename $pidfile .pid)..."
-                tail --pid="$PID" -f /dev/null 2>/dev/null || true
-              fi
+          if [ -f /tmp/infra-pull.pid ]; then
+            PID=$(cat /tmp/infra-pull.pid)
+            if kill -0 "$PID" 2>/dev/null; then
+              echo "Waiting for image pull + infra startup..."
+              tail --pid="$PID" -f /dev/null 2>/dev/null || true
             fi
-          done
-          cat /tmp/ghcr-pull.log 2>/dev/null || true
-          cat /tmp/infra-warm.log 2>/dev/null || true
+          fi
+          cat /tmp/infra-pull.log 2>/dev/null || true
+          cat /tmp/cert-gen.log 2>/dev/null || true
 
       - name: Start stack
         env:
@@ -402,32 +397,22 @@ jobs:
     steps:
       - uses: actions/checkout@v6
 
-      # ── Phase 1: Start background tasks + infra (runs during all subsequent steps) ──
-      - name: Pre-pull GHCR images (background)
+      # ── Phase 1: Pull images + start infra in background (runs during all subsequent steps) ──
+      - name: Pull images and pre-warm infra (background)
         env:
           IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
         run: |
-          nohup bash -c "IMAGE_TAG=$IMAGE_TAG docker compose pull --quiet" \
-            > /tmp/ghcr-pull.log 2>&1 &
-          echo $! > /tmp/ghcr-pull.pid
-
-      - name: Cache and load Docker images
-        uses: ./.github/actions/docker-cache
-        with:
-          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
-
-      - name: Pre-warm infrastructure (background)
-        env:
-          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: |
-          # Start infra while k3s installs + Playwright sets up (~60s of overlap).
-          # cert-generator excluded: needs k3s and mounts ~/.kube.
-          nohup docker compose up -d --no-build \
-            mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry \
-            > /tmp/infra-warm.log 2>&1 &
-          echo $! > /tmp/infra-warm.pid
-
-      # ── Phase 2: k3s install + Node/Playwright setup (overlapped) ──
+          # Pull all images (GHCR + Docker Hub) then start infra services.
+          # This runs throughout k3s + Node + Playwright setup (~80s of overlap).
+          nohup bash -c '
+            IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1
+            echo "--- pull done, starting infra ---"
+            docker compose up -d --no-build \
+              mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1
+          ' > /tmp/infra-pull.log 2>&1 &
+          echo $! > /tmp/infra-pull.pid
+
+      # ── Phase 2: k3s install + Node/Playwright setup (overlapped with pull + infra) ──
       - name: Install k3s
         run: |
           curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh -
@@ -472,25 +457,31 @@ jobs:
             /home/runner/.kube/config > backend/kubeconfig.yaml
           chmod 644 backend/kubeconfig.yaml
 
+      # Start cert-generator now that kubeconfig exists (runs during wait step)
+      - name: Start cert-generator (background)
+        env:
+          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
+        run: |
+          nohup docker compose up -d --no-build cert-generator \
+            > /tmp/cert-gen.log 2>&1 &
+
       - name: Use test environment config
         run: |
           cp backend/config.test.toml backend/config.toml
           cp backend/secrets.example.toml backend/secrets.toml
 
-      # ── Phase 4: Start remaining services (infra already healthy from pre-warm) ──
-      - name: Wait for background tasks
+      # ── Phase 4: Wait for pulls + start stack ──
+      - name: Wait for image pull and infra
         run: |
-          for pidfile in /tmp/ghcr-pull.pid /tmp/infra-warm.pid; do
-            if [ -f "$pidfile" ]; then
-              PID=$(cat "$pidfile")
-              if kill -0 "$PID" 2>/dev/null; then
-                echo "Waiting for $(basename $pidfile .pid)..."
-                tail --pid="$PID" -f /dev/null 2>/dev/null || true
-              fi
+          if [ -f /tmp/infra-pull.pid ]; then
+            PID=$(cat /tmp/infra-pull.pid)
+            if kill -0 "$PID" 2>/dev/null; then
+              echo "Waiting for image pull + infra startup..."
+              tail --pid="$PID" -f /dev/null 2>/dev/null || true
             fi
-          done
-          cat /tmp/ghcr-pull.log 2>/dev/null || true
-          cat /tmp/infra-warm.log 2>/dev/null || true
+          fi
+          cat /tmp/infra-pull.log 2>/dev/null || true
+          cat /tmp/cert-gen.log 2>/dev/null || true
 
       - name: Start stack
         env:

From 33e8c33280eda1d716c7526c2cbaf68e4add91ec Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 22:44:21 +0100
Subject: [PATCH 12/17] =?UTF-8?q?=20What=20changed:=20frontend.depends=5Fo?=
 =?UTF-8?q?n.backend:=20service=5Fhealthy=20=E2=86=92=20service=5Fstarted?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  Impact: Compose no longer waits for backend to pass its health check (~35s) before creating the frontend container. Backend and frontend now boot in parallel during docker compose up -d.

  For frontend-e2e: "Start stack" should drop from 1:20 to 45-50s (no backend health wait in compose), and "Wait for backend+frontend" picks up the slack but runs in parallel (45s). Net: 2:03 → ~1:30, saving ~33s → job drops to ~5:00.

  For backend-e2e: Smaller impact since backend tests don't need frontend. "Start stack" drops slightly (~10s) since compose returns earlier. Job should be ~5:30.

  At this point we're approaching the hard floor:
  - Backend E2E: 3:00 tests + 100s minimum setup = ~4:40 floor, currently ~5:30 (50s over)
  - Frontend E2E: 2:11 tests + 80s minimum setup = ~3:31 floor, currently ~5:00 (89s over, mostly from the depends_on chain which is inherent to docker-compose)
---
 docker-compose.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker-compose.yaml b/docker-compose.yaml
index 9f9fdd27..80c215c8 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -138,7 +138,7 @@ services:
       cert-generator:
         condition: service_completed_successfully
       backend:
-        condition: service_healthy
+        condition: service_started
     volumes:
       - ./frontend:/app
       - /app/node_modules

From c9ac036d5aa82fbf70e0f88bd55e9e0312d4281f Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 23:12:32 +0100
Subject: [PATCH 13/17] fixes: pin k3s version, quote $GITHUB_OUTPUT, fail on GHCR push errors

---
 .github/actions/k3s-setup/action.yml |  6 +++++-
 .github/workflows/stack-tests.yml    | 29 ++++++++++++++++++++++------
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/.github/actions/k3s-setup/action.yml b/.github/actions/k3s-setup/action.yml
index d21c4a43..523ca0ad 100644
--- a/.github/actions/k3s-setup/action.yml
+++ b/.github/actions/k3s-setup/action.yml
@@ -24,7 +24,11 @@ runs:
       run: |
         # --bind-address 0.0.0.0: Listen on all interfaces so Docker containers can reach it
         # --tls-san host.docker.internal: Include in cert SANs for Docker container access
-        curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh -
+        K3S_VERSION="${K3S_VERSION:-v1.32.11+k3s1}"
+        K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
+        curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
+        chmod +x /tmp/k3s-install.sh
+        INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh
         mkdir -p /home/runner/.kube
         sudo k3s kubectl config view --raw > /home/runner/.kube/config
         sudo chmod 600 /home/runner/.kube/config
diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index b07d51a7..27f48d98 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -31,6 +31,7 @@ env:
   KAFKA_IMAGE: confluentinc/cp-kafka:7.8.2
   ZOOKEEPER_IMAGE: confluentinc/cp-zookeeper:7.8.2
   SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.8.2
+  K3S_VERSION: v1.32.11+k3s1
 
 jobs:
   # Fast unit tests (no infrastructure needed)
@@ -137,8 +138,8 @@ jobs:
         run: |
           PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
           SHA_TAG="sha-${GITHUB_SHA::7}"
-          echo "sha-tag=$SHA_TAG" >> $GITHUB_OUTPUT
-          echo "image-prefix=$PREFIX" >> $GITHUB_OUTPUT
+          echo "sha-tag=$SHA_TAG" >> "$GITHUB_OUTPUT"
+          echo "image-prefix=$PREFIX" >> "$GITHUB_OUTPUT"
 
       # ── Base image (cached separately — rarely changes) ──────────────
       - name: Cache base image
@@ -243,14 +244,24 @@ jobs:
           docker tag integr8scode-frontend:latest "$IMG/frontend-dev:$TAG"
           docker tag integr8scode-frontend-prod:latest "$IMG/frontend:$TAG"
 
-          # Push all 13 images in parallel
+          # Push all 13 images in parallel, tracking each PID
+          declare -A PIDS
           for name in base backend coordinator k8s-worker pod-monitor \
                       result-processor saga-orchestrator event-replay \
                       dlq-processor cert-generator zookeeper-certgen \
                       frontend-dev frontend; do
             docker push "$IMG/$name:$TAG" &
+            PIDS[$name]=$!
           done
-          wait
+
+          FAILED=0
+          for name in "${!PIDS[@]}"; do
+            if ! wait "${PIDS[$name]}"; then
+              echo "::error::Failed to push $name"
+              FAILED=1
+            fi
+          done
+          [ "$FAILED" -eq 0 ] || exit 1
 
   # Parallel E2E test jobs — compose pulls from GHCR using IMAGE_TAG
   backend-e2e:
@@ -284,7 +295,10 @@ jobs:
       # ── Phase 2: k3s install + finalize (overlaps with image pull + infra boot) ──
       - name: Install k3s
         run: |
-          curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh -
+          K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
+          curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
+          chmod +x /tmp/k3s-install.sh
+          INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh
 
       - name: Finalize k3s
         run: |
@@ -415,7 +429,10 @@ jobs:
       # ── Phase 2: k3s install + Node/Playwright setup (overlapped with pull + infra) ──
       - name: Install k3s
         run: |
-          curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh -
+          K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
+          curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
+          chmod +x /tmp/k3s-install.sh
+          INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh
 
       - name: Setup Node.js
         uses: actions/setup-node@v6

From 68c18b8ea40c0536b6e18ef241f3c46707de512c Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 23:41:14 +0100
Subject: [PATCH 14/17]   Created 2 composite actions, deleted 2 unused ones:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  | Action                 | Purpose                                                                     |
  |------------------------|-----------------------------------------------------------------------------|
  | e2e-boot (new)         | GHCR login + pull/prewarm infra (bg) + k3s install                          |
  | e2e-ready (new)        | Finalize k3s + cert-gen + config + wait + start stack + health check + seed |
  | k3s-setup (deleted)    | Was inlined previously, never referenced                                    |
  | docker-cache (deleted) | Replaced by docker compose pull, never referenced                           |

  Step count reduction:
  - backend-e2e: 20 steps → 8 steps (checkout + 2 actions + test + coverage + logs)
  - frontend-e2e: 20 steps → 13 steps (checkout + e2e-boot + 5 Node/Playwright + e2e-ready + test + report + logs)

  Performance preserved: The split point between e2e-boot and e2e-ready is exactly where frontend-e2e interposes Node/Playwright setup, so k3s still boots in the background during that work.
---
 .github/actions/docker-cache/action.yml |  64 --------
 .github/actions/e2e-boot/action.yml     |  41 +++++
 .github/actions/e2e-ready/action.yml    |  78 ++++++++++
 .github/actions/k3s-setup/action.yml    |  61 --------
 .github/workflows/stack-tests.yml       | 199 ++++--------------------
 5 files changed, 152 insertions(+), 291 deletions(-)
 delete mode 100644 .github/actions/docker-cache/action.yml
 create mode 100644 .github/actions/e2e-boot/action.yml
 create mode 100644 .github/actions/e2e-ready/action.yml
 delete mode 100644 .github/actions/k3s-setup/action.yml

diff --git a/.github/actions/docker-cache/action.yml b/.github/actions/docker-cache/action.yml
deleted file mode 100644
index 253885e2..00000000
--- a/.github/actions/docker-cache/action.yml
+++ /dev/null
@@ -1,64 +0,0 @@
-name: 'Docker Image Cache'
-description: 'Cache and load Docker images for CI jobs'
-
-inputs:
-  images:
-    description: 'Space-separated list of Docker images to cache'
-    required: true
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Generate cache key from images
-      id: cache-key
-      shell: bash
-      env:
-        IMAGES_INPUT: ${{ inputs.images }}
-      run: |
-        # Create a stable hash from the sorted image list
-        # Using env var to prevent script injection
-        IMAGES_HASH=$(echo "$IMAGES_INPUT" | tr ' ' '\n' | sort | md5sum | cut -d' ' -f1)
-        echo "key=docker-${{ runner.os }}-${IMAGES_HASH}" >> $GITHUB_OUTPUT
-
-    - name: Cache Docker images
-      uses: actions/cache@v5
-      id: docker-cache
-      with:
-        path: /tmp/docker-cache
-        key: ${{ steps.cache-key.outputs.key }}
-
-    - name: Load cached Docker images
-      if: steps.docker-cache.outputs.cache-hit == 'true'
-      shell: bash
-      run: |
-        echo "Loading cached images..."
-        for f in /tmp/docker-cache/*.tar.zst; do
-          zstd -d -c "$f" | docker load &
-        done
-        wait
-        docker images
-
-    - name: Pull and save Docker images
-      if: steps.docker-cache.outputs.cache-hit != 'true'
-      shell: bash
-      env:
-        IMAGES_INPUT: ${{ inputs.images }}
-      run: |
-        mkdir -p /tmp/docker-cache
-
-        echo "Pulling images in parallel..."
-        for img in $IMAGES_INPUT; do
-          docker pull "$img" &
-        done
-        wait
-
-        echo "Saving images with zstd compression..."
-        for img in $IMAGES_INPUT; do
-          # Create filename from image name (replace special chars)
-          filename=$(echo "$img" | tr '/:' '_')
-          docker save "$img" | zstd -T0 -3 > "/tmp/docker-cache/${filename}.tar.zst" &
-        done
-        wait
-
-        echo "Cache size:"
-        du -sh /tmp/docker-cache/
diff --git a/.github/actions/e2e-boot/action.yml b/.github/actions/e2e-boot/action.yml
new file mode 100644
index 00000000..be4d7bdf
--- /dev/null
+++ b/.github/actions/e2e-boot/action.yml
@@ -0,0 +1,41 @@
+name: 'E2E Boot'
+description: 'Kick off slow background tasks: GHCR auth, image pull + infra pre-warm, k3s install'
+
+inputs:
+  image-tag:
+    description: 'GHCR image tag (e.g., sha-abc1234)'
+    required: true
+  github-token:
+    description: 'GitHub token for GHCR authentication'
+    required: true
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Log in to GHCR
+      uses: docker/login-action@v3
+      with:
+        registry: ${{ env.REGISTRY }}
+        username: ${{ github.actor }}
+        password: ${{ inputs.github-token }}
+
+    - name: Pull images and pre-warm infra (background)
+      shell: bash
+      env:
+        IMAGE_TAG: ${{ inputs.image-tag }}
+      run: |
+        nohup bash -c '
+          IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1
+          echo "--- pull done, starting infra ---"
+          docker compose up -d --no-build \
+            mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1
+        ' > /tmp/infra-pull.log 2>&1 &
+        echo $! > /tmp/infra-pull.pid
+
+    - name: Install k3s
+      shell: bash
+      run: |
+        K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
+        curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
+        chmod +x /tmp/k3s-install.sh
+        INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh
diff --git a/.github/actions/e2e-ready/action.yml b/.github/actions/e2e-ready/action.yml
new file mode 100644
index 00000000..c2b90d39
--- /dev/null
+++ b/.github/actions/e2e-ready/action.yml
@@ -0,0 +1,78 @@
+name: 'E2E Ready'
+description: 'Finalize k3s, wait for infra, start compose stack, health-check, seed test users'
+
+inputs:
+  image-tag:
+    description: 'GHCR image tag (e.g., sha-abc1234)'
+    required: true
+  wait-for-frontend:
+    description: 'Also wait for frontend health check (default: false)'
+    required: false
+    default: 'false'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Finalize k3s
+      shell: bash
+      run: |
+        mkdir -p /home/runner/.kube
+        sudo k3s kubectl config view --raw > /home/runner/.kube/config
+        sudo chmod 600 /home/runner/.kube/config
+        export KUBECONFIG=/home/runner/.kube/config
+        timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done'
+        kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
+        sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \
+          /home/runner/.kube/config > backend/kubeconfig.yaml
+        chmod 644 backend/kubeconfig.yaml
+
+    - name: Start cert-generator (background)
+      shell: bash
+      env:
+        IMAGE_TAG: ${{ inputs.image-tag }}
+      run: |
+        nohup docker compose up -d --no-build cert-generator \
+          > /tmp/cert-gen.log 2>&1 &
+
+    - name: Use test environment config
+      shell: bash
+      run: |
+        cp backend/config.test.toml backend/config.toml
+        cp backend/secrets.example.toml backend/secrets.toml
+
+    - name: Wait for image pull and infra
+      shell: bash
+      run: |
+        if [ -f /tmp/infra-pull.pid ]; then
+          PID=$(cat /tmp/infra-pull.pid)
+          if kill -0 "$PID" 2>/dev/null; then
+            echo "Waiting for image pull + infra startup..."
+            tail --pid="$PID" -f /dev/null 2>/dev/null || true
+          fi
+        fi
+        cat /tmp/infra-pull.log 2>/dev/null || true
+        cat /tmp/cert-gen.log 2>/dev/null || true
+
+    - name: Start stack
+      shell: bash
+      env:
+        IMAGE_TAG: ${{ inputs.image-tag }}
+      run: docker compose up -d --no-build
+
+    - name: Wait for services
+      shell: bash
+      env:
+        WAIT_FOR_FRONTEND: ${{ inputs.wait-for-frontend }}
+      run: |
+        echo "Waiting for backend health..."
+        timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done'
+        echo "Backend ready"
+        if [ "$WAIT_FOR_FRONTEND" = "true" ]; then
+          echo "Waiting for frontend health..."
+          timeout 60 bash -c 'until curl -ksf https://localhost:5001 2>/dev/null; do sleep 2; done'
+          echo "Frontend ready"
+        fi
+
+    - name: Seed test users
+      shell: bash
+      run: docker compose exec -T backend uv run python scripts/seed_users.py
diff --git a/.github/actions/k3s-setup/action.yml b/.github/actions/k3s-setup/action.yml
deleted file mode 100644
index 523ca0ad..00000000
--- a/.github/actions/k3s-setup/action.yml
+++ /dev/null
@@ -1,61 +0,0 @@
-name: 'K3s Setup'
-description: 'Install k3s and create kubeconfig for Docker containers'
-
-inputs:
-  namespace:
-    description: 'Kubernetes namespace to create'
-    required: false
-    default: 'integr8scode'
-  kubeconfig-path:
-    description: 'Path to write the Docker-accessible kubeconfig'
-    required: false
-    default: 'backend/kubeconfig.yaml'
-
-outputs:
-  kubeconfig:
-    description: 'Path to the kubeconfig file for Docker containers'
-    value: ${{ inputs.kubeconfig-path }}
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Install k3s
-      shell: bash
-      run: |
-        # --bind-address 0.0.0.0: Listen on all interfaces so Docker containers can reach it
-        # --tls-san host.docker.internal: Include in cert SANs for Docker container access
-        K3S_VERSION="${K3S_VERSION:-v1.32.11+k3s1}"
-        K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
-        curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
-        chmod +x /tmp/k3s-install.sh
-        INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh
-        mkdir -p /home/runner/.kube
-        sudo k3s kubectl config view --raw > /home/runner/.kube/config
-        sudo chmod 600 /home/runner/.kube/config
-
-    - name: Wait for k3s to be ready
-      shell: bash
-      run: |
-        export KUBECONFIG=/home/runner/.kube/config
-        timeout 90 bash -c 'until kubectl cluster-info; do sleep 5; done'
-
-    - name: Create namespace
-      shell: bash
-      env:
-        NAMESPACE: ${{ inputs.namespace }}
-      run: |
-        export KUBECONFIG=/home/runner/.kube/config
-        kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
-
-    - name: Create kubeconfig for Docker containers
-      shell: bash
-      env:
-        KUBECONFIG_PATH: ${{ inputs.kubeconfig-path }}
-      run: |
-        # Replace localhost/0.0.0.0 with host.docker.internal for container access
-        # (k3s may use 0.0.0.0 when started with --bind-address 0.0.0.0)
-        sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \
-          /home/runner/.kube/config > "$KUBECONFIG_PATH"
-        chmod 644 "$KUBECONFIG_PATH"
-        echo "Kubeconfig written to $KUBECONFIG_PATH"
-        echo "Server URL: $(grep server "$KUBECONFIG_PATH" | head -1)"
diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index 27f48d98..309433d6 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -269,83 +269,20 @@ jobs:
     needs: [build-images]
     if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: read
     steps:
       - uses: actions/checkout@v6
 
-      # ── Phase 1: Pull images + start infra in background (overlap with k3s) ──
-      - name: Pull images and pre-warm infra (background)
-        env:
-          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: |
-          # Pull all images (GHCR + Docker Hub) then start infra services.
-          # This runs throughout k3s install/finalize (~20s of overlap).
-          nohup bash -c '
-            IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1
-            echo "--- pull done, starting infra ---"
-            docker compose up -d --no-build \
-              mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1
-          ' > /tmp/infra-pull.log 2>&1 &
-          echo $! > /tmp/infra-pull.pid
-
-      - name: Use test environment config
-        run: |
-          cp backend/config.test.toml backend/config.toml
-          cp backend/secrets.example.toml backend/secrets.toml
-
-      # ── Phase 2: k3s install + finalize (overlaps with image pull + infra boot) ──
-      - name: Install k3s
-        run: |
-          K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
-          curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
-          chmod +x /tmp/k3s-install.sh
-          INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh
-
-      - name: Finalize k3s
-        run: |
-          mkdir -p /home/runner/.kube
-          sudo k3s kubectl config view --raw > /home/runner/.kube/config
-          sudo chmod 600 /home/runner/.kube/config
-          export KUBECONFIG=/home/runner/.kube/config
-          timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done'
-          kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
-          sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \
-            /home/runner/.kube/config > backend/kubeconfig.yaml
-          chmod 644 backend/kubeconfig.yaml
-
-      # Start cert-generator now that kubeconfig exists (runs during wait step)
-      - name: Start cert-generator (background)
-        env:
-          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: |
-          nohup docker compose up -d --no-build cert-generator \
-            > /tmp/cert-gen.log 2>&1 &
-
-      # ── Phase 3: Wait for pulls + start stack ──
-      - name: Wait for image pull and infra
-        run: |
-          if [ -f /tmp/infra-pull.pid ]; then
-            PID=$(cat /tmp/infra-pull.pid)
-            if kill -0 "$PID" 2>/dev/null; then
-              echo "Waiting for image pull + infra startup..."
-              tail --pid="$PID" -f /dev/null 2>/dev/null || true
-            fi
-          fi
-          cat /tmp/infra-pull.log 2>/dev/null || true
-          cat /tmp/cert-gen.log 2>/dev/null || true
-
-      - name: Start stack
-        env:
-          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: docker compose up -d --no-build
-
-      - name: Wait for backend
-        run: |
-          echo "Waiting for backend health..."
-          timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done'
-          echo "Backend ready"
+      - uses: ./.github/actions/e2e-boot
+        with:
+          image-tag: ${{ needs.build-images.outputs.sha-tag }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Seed test users
-        run: docker compose exec -T backend uv run python scripts/seed_users.py
+      - uses: ./.github/actions/e2e-ready
+        with:
+          image-tag: ${{ needs.build-images.outputs.sha-tag }}
 
       - name: Run E2E tests
         timeout-minutes: 15
@@ -376,19 +313,11 @@ jobs:
         run: |
           mkdir -p logs
           docker compose logs --timestamps > logs/docker-compose.log 2>&1
-          docker compose logs --timestamps backend > logs/backend.log 2>&1
-          docker compose logs --timestamps mongo > logs/mongo.log 2>&1 || true
-          docker compose logs --timestamps redis > logs/redis.log 2>&1 || true
-          docker compose logs --timestamps kafka > logs/kafka.log 2>&1 || true
-          docker compose logs --timestamps zookeeper > logs/zookeeper.log 2>&1 || true
-          docker compose logs --timestamps schema-registry > logs/schema-registry.log 2>&1 || true
-          docker compose logs --timestamps coordinator > logs/coordinator.log 2>&1 || true
-          docker compose logs --timestamps k8s-worker > logs/k8s-worker.log 2>&1 || true
-          docker compose logs --timestamps pod-monitor > logs/pod-monitor.log 2>&1 || true
-          docker compose logs --timestamps result-processor > logs/result-processor.log 2>&1 || true
-          docker compose logs --timestamps saga-orchestrator > logs/saga-orchestrator.log 2>&1 || true
-          docker compose logs --timestamps event-replay > logs/event-replay.log 2>&1 || true
-          docker compose logs --timestamps dlq-processor > logs/dlq-processor.log 2>&1 || true
+          for svc in backend mongo redis kafka zookeeper schema-registry \
+                     coordinator k8s-worker pod-monitor result-processor \
+                     saga-orchestrator event-replay dlq-processor; do
+            docker compose logs --timestamps "$svc" > "logs/$svc.log" 2>&1 || true
+          done
           kubectl get events --sort-by='.metadata.creationTimestamp' -A > logs/k8s-events.log 2>&1 || true
 
       - name: Upload logs
@@ -403,6 +332,9 @@ jobs:
     needs: [build-images]
     if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: read
     strategy:
       fail-fast: false
       matrix:
@@ -411,29 +343,13 @@ jobs:
     steps:
       - uses: actions/checkout@v6
 
-      # ── Phase 1: Pull images + start infra in background (runs during all subsequent steps) ──
-      - name: Pull images and pre-warm infra (background)
-        env:
-          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: |
-          # Pull all images (GHCR + Docker Hub) then start infra services.
-          # This runs throughout k3s + Node + Playwright setup (~80s of overlap).
-          nohup bash -c '
-            IMAGE_TAG='"$IMAGE_TAG"' docker compose pull --quiet 2>&1
-            echo "--- pull done, starting infra ---"
-            docker compose up -d --no-build \
-              mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1
-          ' > /tmp/infra-pull.log 2>&1 &
-          echo $! > /tmp/infra-pull.pid
-
-      # ── Phase 2: k3s install + Node/Playwright setup (overlapped with pull + infra) ──
-      - name: Install k3s
-        run: |
-          K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
-          curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
-          chmod +x /tmp/k3s-install.sh
-          INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh
+      # Phase 1: kick off image pull + infra + k3s in background
+      - uses: ./.github/actions/e2e-boot
+        with:
+          image-tag: ${{ needs.build-images.outputs.sha-tag }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
+      # Phase 2: Node + Playwright setup (overlaps with k3s boot + image pull)
       - name: Setup Node.js
         uses: actions/setup-node@v6
         with:
@@ -461,61 +377,11 @@ jobs:
         working-directory: frontend
         run: npx playwright install chromium
 
-      # ── Phase 3: Finalize k3s (should be ready — 50s+ since install) ──
-      - name: Finalize k3s
-        run: |
-          mkdir -p /home/runner/.kube
-          sudo k3s kubectl config view --raw > /home/runner/.kube/config
-          sudo chmod 600 /home/runner/.kube/config
-          export KUBECONFIG=/home/runner/.kube/config
-          timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done'
-          kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
-          sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \
-            /home/runner/.kube/config > backend/kubeconfig.yaml
-          chmod 644 backend/kubeconfig.yaml
-
-      # Start cert-generator now that kubeconfig exists (runs during wait step)
-      - name: Start cert-generator (background)
-        env:
-          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: |
-          nohup docker compose up -d --no-build cert-generator \
-            > /tmp/cert-gen.log 2>&1 &
-
-      - name: Use test environment config
-        run: |
-          cp backend/config.test.toml backend/config.toml
-          cp backend/secrets.example.toml backend/secrets.toml
-
-      # ── Phase 4: Wait for pulls + start stack ──
-      - name: Wait for image pull and infra
-        run: |
-          if [ -f /tmp/infra-pull.pid ]; then
-            PID=$(cat /tmp/infra-pull.pid)
-            if kill -0 "$PID" 2>/dev/null; then
-              echo "Waiting for image pull + infra startup..."
-              tail --pid="$PID" -f /dev/null 2>/dev/null || true
-            fi
-          fi
-          cat /tmp/infra-pull.log 2>/dev/null || true
-          cat /tmp/cert-gen.log 2>/dev/null || true
-
-      - name: Start stack
-        env:
-          IMAGE_TAG: ${{ needs.build-images.outputs.sha-tag }}
-        run: docker compose up -d --no-build
-
-      - name: Wait for backend and frontend
-        run: |
-          echo "Waiting for backend health..."
-          timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done'
-          echo "Backend ready"
-          echo "Waiting for frontend health..."
-          timeout 60 bash -c 'until curl -ksf https://localhost:5001 2>/dev/null; do sleep 2; done'
-          echo "Frontend ready"
-
-      - name: Seed test users
-        run: docker compose exec -T backend uv run python scripts/seed_users.py
+      # Phase 3: finalize k3s + start stack (k3s has been booting since e2e-boot)
+      - uses: ./.github/actions/e2e-ready
+        with:
+          image-tag: ${{ needs.build-images.outputs.sha-tag }}
+          wait-for-frontend: 'true'
 
       - name: Run Playwright tests
         timeout-minutes: 10
@@ -533,9 +399,10 @@ jobs:
         if: failure()
         run: |
           mkdir -p logs
-          docker compose logs > logs/docker-compose.log 2>&1
-          docker compose logs backend > logs/backend.log 2>&1
-          docker compose logs frontend > logs/frontend.log 2>&1
+          docker compose logs --timestamps > logs/docker-compose.log 2>&1
+          for svc in backend frontend; do
+            docker compose logs --timestamps "$svc" > "logs/$svc.log" 2>&1 || true
+          done
 
       - name: Upload logs
         if: failure()

From 5ecb455737809e3d8be8ba18f2e139b470d78d39 Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sat, 31 Jan 2026 23:56:43 +0100
Subject: [PATCH 15/17] fixes

---
 .github/actions/e2e-boot/action.yml  | 2 ++
 .github/actions/e2e-ready/action.yml | 7 +++++++
 .github/workflows/stack-tests.yml    | 1 +
 3 files changed, 10 insertions(+)

diff --git a/.github/actions/e2e-boot/action.yml b/.github/actions/e2e-boot/action.yml
index be4d7bdf..01850ea1 100644
--- a/.github/actions/e2e-boot/action.yml
+++ b/.github/actions/e2e-boot/action.yml
@@ -29,6 +29,7 @@ runs:
           echo "--- pull done, starting infra ---"
           docker compose up -d --no-build \
             mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1
+          echo $? > /tmp/infra-pull.exit
         ' > /tmp/infra-pull.log 2>&1 &
         echo $! > /tmp/infra-pull.pid
 
@@ -37,5 +38,6 @@ runs:
       run: |
         K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
         curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
+        echo "$K3S_INSTALL_SHA256  /tmp/k3s-install.sh" | sha256sum -c -
         chmod +x /tmp/k3s-install.sh
         INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh
diff --git a/.github/actions/e2e-ready/action.yml b/.github/actions/e2e-ready/action.yml
index c2b90d39..fb794382 100644
--- a/.github/actions/e2e-ready/action.yml
+++ b/.github/actions/e2e-ready/action.yml
@@ -52,6 +52,13 @@ runs:
         fi
         cat /tmp/infra-pull.log 2>/dev/null || true
         cat /tmp/cert-gen.log 2>/dev/null || true
+        if [ -f /tmp/infra-pull.exit ]; then
+          EXIT_CODE=$(cat /tmp/infra-pull.exit)
+          if [ "$EXIT_CODE" != "0" ]; then
+            echo "::error::Background image pull / infra pre-warm failed (exit $EXIT_CODE)"
+            exit 1
+          fi
+        fi
 
     - name: Start stack
       shell: bash
diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index 309433d6..373565ab 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -32,6 +32,7 @@ env:
   ZOOKEEPER_IMAGE: confluentinc/cp-zookeeper:7.8.2
   SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.8.2
   K3S_VERSION: v1.32.11+k3s1
+  K3S_INSTALL_SHA256: d75e014f2d2ab5d30a318efa5c326f3b0b7596f194afcff90fa7a7a91166d5f7
 
 jobs:
   # Fast unit tests (no infrastructure needed)

From b62ebbd65a877397adf65ca4e69f9d0dafe9e189 Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sun, 1 Feb 2026 00:47:02 +0100
Subject: [PATCH 16/17] updated docs + branch = main for all CI workflows
 (removed dev)

---
 .github/workflows/frontend-ci.yml |   4 +-
 .github/workflows/mypy.yml        |   4 +-
 .github/workflows/ruff.yml        |   4 +-
 .github/workflows/security.yml    |   4 +-
 docs/operations/cicd.md           | 499 +++++++++++++++++++-----------
 5 files changed, 324 insertions(+), 191 deletions(-)

diff --git a/.github/workflows/frontend-ci.yml b/.github/workflows/frontend-ci.yml
index fe29a033..e6303aa1 100644
--- a/.github/workflows/frontend-ci.yml
+++ b/.github/workflows/frontend-ci.yml
@@ -2,12 +2,12 @@ name: Frontend CI
 
 on:
   push:
-    branches: [main, dev]
+    branches: [main]
     paths:
       - 'frontend/**'
       - '.github/workflows/frontend-ci.yml'
   pull_request:
-    branches: [main, dev]
+    branches: [main]
     paths:
       - 'frontend/**'
       - '.github/workflows/frontend-ci.yml'
diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
index d4752b08..34070e65 100644
--- a/.github/workflows/mypy.yml
+++ b/.github/workflows/mypy.yml
@@ -2,9 +2,9 @@ name: MyPy Type Checking
 
 on:
   push:
-    branches: [ main, dev ]
+    branches: [ main ]
   pull_request:
-    branches: [ main, dev ]
+    branches: [ main ]
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 3ddec835..c670ce34 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -2,9 +2,9 @@ name: Ruff Linting
 
 on:
   push:
-    branches: [ main, dev ]
+    branches: [ main ]
   pull_request:
-    branches: [ main, dev ]
+    branches: [ main ]
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index 4452c432..10837590 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -2,9 +2,9 @@ name: Security Scanning
 
 on:
   push:
-    branches: [ main, dev ]
+    branches: [ main ]
   pull_request:
-    branches: [ main, dev ]
+    branches: [ main ]
   workflow_dispatch:
 
 jobs:
diff --git a/docs/operations/cicd.md b/docs/operations/cicd.md
index 54ff0130..b19f45f9 100644
--- a/docs/operations/cicd.md
+++ b/docs/operations/cicd.md
@@ -1,41 +1,40 @@
 # CI/CD Pipeline
 
-The project uses GitHub Actions to automate code quality checks, security scanning, testing, and documentation
-deployment. Every push to `main` or `dev` and every pull request triggers the pipeline, with workflows running in
-parallel to provide fast feedback.
+The project uses GitHub Actions to automate code quality checks, security scanning, testing, image publishing, and
+documentation deployment. The pipeline is split across several workflow files that trigger independently based on path
+filters, so only relevant checks run for each change.
 
 ## Pipeline overview
 
 ```mermaid
 graph LR
-    subgraph "Code Quality"
+    subgraph "Code Quality (lightweight)"
         Ruff["Ruff Linting"]
         MyPy["MyPy Type Check"]
-        ESLint["ESLint + TypeScript"]
+        ESLint["ESLint + Svelte Check"]
     end
 
     subgraph "Security"
         Bandit["Bandit SAST"]
+        SBOM["SBOM & Grype"]
     end
 
-    subgraph "Docker Build & Scan"
-        Base["Build Base"]
-        Backend["Build Backend"]
-        Frontend["Build Frontend"]
-        ScanBE["Scan Backend"]
-        ScanFE["Scan Frontend"]
-        Base --> Backend
-        Base --> Frontend
-        Backend --> ScanBE
-        Frontend --> ScanFE
-    end
-
-    subgraph "Testing (stack-tests.yml)"
+    subgraph "Stack Tests"
         UnitBE["Backend Unit"]
         UnitFE["Frontend Unit"]
-        Stack["Stack Tests"]
-        UnitBE --> Stack
-        UnitFE --> Stack
+        Build["Build & Push Images"]
+        E2E_BE["Backend E2E"]
+        E2E_FE["Frontend E2E"]
+        UnitBE --> Build
+        UnitFE --> Build
+        Build --> E2E_BE
+        Build --> E2E_FE
+    end
+
+    subgraph "Docker Scan & Promote"
+        Scan["Trivy Scan (12 images)"]
+        Promote["Promote SHA → latest"]
+        Scan --> Promote
     end
 
     subgraph "Documentation"
@@ -43,143 +42,289 @@ graph LR
         Pages["GitHub Pages"]
     end
 
-    Push["Push / PR"] --> Ruff
-    Push --> MyPy
-    Push --> ESLint
-    Push --> Bandit
-    Push --> Base
-    Push --> UnitBE
-    Push --> UnitFE
-    Push --> Docs
+    Push["Push / PR"] --> Ruff & MyPy & ESLint & Bandit & SBOM & UnitBE & UnitFE & Docs
+    Build -->|main, all tests pass| Scan
     Docs -->|main only| Pages
 ```
 
-All workflows trigger on pushes to `main` and `dev` branches, pull requests against those branches, and can be triggered
-manually via `workflow_dispatch`. Path filters ensure workflows only run when relevant files change.
-
-## Linting and type checking
-
-Three lightweight workflows run first since they catch obvious issues quickly.
-
-**Backend (Python):**
-- [Ruff](https://docs.astral.sh/ruff/) checks for style violations, import ordering, and common bugs
-- [mypy](https://mypy.readthedocs.io/) with strict settings catches type mismatches and missing return types
-
-**Frontend (TypeScript):**
-- ESLint checks for code quality issues
-- TypeScript compiler (`tsc --noEmit`) verifies type correctness
-
-Both use dependency caching to skip reinstallation when lockfiles haven't changed.
-
-## Security scanning
-
-The security workflow uses [Bandit](https://bandit.readthedocs.io/) to perform static analysis on Python source files,
-flagging issues like hardcoded credentials, SQL injection patterns, and unsafe deserialization. It excludes the test
-directory and reports only medium-severity and above findings. Container-level vulnerability scanning with Trivy runs
-as part of the Docker workflow.
+The two heavyweight workflows are **Stack Tests** (builds images, runs all tests) and **Docker Scan & Promote**
+(scans images with Trivy and promotes to `latest`). They're connected: Docker Scan & Promote triggers automatically
+after Stack Tests succeeds on `main`, forming a build-test-scan-promote pipeline where the `latest` tag only moves
+forward when everything passes.
 
-## Docker build and scan
-
-The Docker workflow is structured as multiple jobs with dependencies, enabling parallel execution and early failure
-detection. If any job fails, dependent jobs are skipped immediately.
-
-```mermaid
-graph TD
-    A[build-base] --> B[build-backend]
-    A --> C[build-frontend]
-    B --> D[scan-backend]
-    C --> E[scan-frontend]
-    D --> F[summary]
-    E --> F
-
-    style A fill:#e1f5fe
-    style B fill:#fff3e0
-    style C fill:#fff3e0
-    style D fill:#ffebee
-    style E fill:#ffebee
-    style F fill:#e8f5e9
-```
+## Workflow files
 
-| Job              | Depends On       | Purpose                                              |
-|------------------|------------------|------------------------------------------------------|
-| `build-base`     | -                | Build shared base image with Python and dependencies |
-| `build-backend`  | `build-base`     | Build backend image using base as build context      |
-| `build-frontend` | `build-base`     | Build frontend image (runs parallel with backend)    |
-| `scan-backend`   | `build-backend`  | Trivy vulnerability scan on backend image            |
-| `scan-frontend`  | `build-frontend` | Trivy vulnerability scan on frontend image           |
-| `summary`        | All scans        | Generate summary (main branch only)                  |
+| Workflow                | File                                         | Trigger                                       | Purpose                                    |
+|-------------------------|----------------------------------------------|-----------------------------------------------|--------------------------------------------|
+| Stack Tests             | `.github/workflows/stack-tests.yml`          | Push/PR to `main`, tags `v*`                   | Unit tests, image build, E2E tests         |
+| Docker Scan & Promote   | `.github/workflows/docker.yml`               | After Stack Tests completes on `main`          | Trivy scan + promote SHA tag to `latest`   |
+| SBOM & Supply Chain     | `.github/workflows/sbom-compliance.yml`      | Push/PR to `main`, weekly schedule             | SPDX SBOM generation + Grype vulnerability scan |
+| Ruff Linting            | `.github/workflows/ruff.yml`                 | Push/PR to `main`                              | Python code style and import checks        |
+| MyPy Type Checking      | `.github/workflows/mypy.yml`                 | Push/PR to `main`                              | Python static type analysis                |
+| Frontend CI             | `.github/workflows/frontend-ci.yml`          | Push/PR to `main` (frontend changes)           | ESLint + Svelte type check                 |
+| Security Scanning       | `.github/workflows/security.yml`             | Push/PR to `main`                              | Bandit SAST                                |
+| Documentation           | `.github/workflows/docs.yml`                 | Push/PR (`docs/`, `mkdocs.yml`)                | MkDocs build and GitHub Pages deploy       |
 
-### Base image
+## Composite actions
 
-The base image (`Dockerfile.base`) contains Python, system dependencies, and all pip packages. It
-uses [uv](https://docs.astral.sh/uv/) to install dependencies from the lockfile with `uv sync --locked --no-dev`,
-ensuring reproducible builds without development tools.
+Shared steps are extracted into reusable composite actions under `.github/actions/`. This eliminates duplication between
+the backend and frontend E2E jobs, which both need k3s and the full docker compose stack but set it up differently.
 
-### Security scanning
+| Action                  | File                                         | Purpose                                    |
+|-------------------------|----------------------------------------------|--------------------------------------------|
+| E2E Boot                | `.github/actions/e2e-boot/action.yml`        | GHCR login, background image pull + infra pre-warm, k3s install |
+| E2E Ready               | `.github/actions/e2e-ready/action.yml`       | Finalize k3s, start compose stack, health checks, seed users |
 
-After each image builds, [Trivy](https://trivy.dev/) scans it for known vulnerabilities in OS packages and Python
-dependencies. The scan fails if it finds any critical or high severity issues with available fixes.
+The split is intentional. Frontend E2E needs to install Node.js and Playwright browsers _between_ boot and ready,
+overlapping that work with k3s startup and the background image pull to save wall-clock time. Backend E2E calls them back-to-back since it has
+no setup to overlap.
 
-## Stack tests (unified testing)
+## Stack Tests (the main workflow)
 
-The `stack-tests.yml` workflow consolidates all testing that requires infrastructure into a single job, avoiding
-redundant stack setup across multiple jobs.
+This is the core testing workflow. It builds all 13 container images, pushes them to GHCR with immutable SHA-based
+tags, then runs E2E tests on separate runners that pull images from the registry.
 
 ```mermaid
 graph TD
-    subgraph "Parallel (fast)"
-        A[Backend Unit Tests]
-        B[Frontend Unit Tests]
-    end
-
-    subgraph "Build"
-        C[Build Images]
+    subgraph "Phase 1: Fast feedback"
+        A["Backend Unit Tests"]
+        B["Frontend Unit Tests"]
     end
 
-    subgraph "Backend E2E (own runner)"
-        D1[Setup k3s + Stack]
-        E[Backend E2E Tests]
-        D1 --> E
+    subgraph "Phase 2: Build"
+        C["Build & Push 13 Images to GHCR"]
     end
 
-    subgraph "Frontend E2E (own runner)"
-        D2[Setup k3s + Stack]
-        F[Frontend E2E Tests]
-        D2 --> F
+    subgraph "Phase 3: E2E (parallel runners)"
+        D["Backend E2E<br/>(k3s + full stack)"]
+        E["Frontend E2E Shard 1/2<br/>(k3s + Playwright)"]
+        F["Frontend E2E Shard 2/2<br/>(k3s + Playwright)"]
     end
 
     A --> C
     B --> C
-    C --> D1
-    C --> D2
+    C --> D & E & F
 
     style A fill:#e8f5e9
     style B fill:#e8f5e9
     style C fill:#e1f5fe
-    style D1 fill:#e1f5fe
-    style D2 fill:#e1f5fe
+    style D fill:#fff3e0
     style E fill:#fff3e0
     style F fill:#fff3e0
 ```
 
-### Test execution order
+### Phase 1: Unit tests
+
+Backend and frontend unit tests run in parallel. They need no infrastructure and complete quickly. If either fails,
+the image build is skipped entirely.
+
+### Phase 2: Build and push
+
+All 13 images are built on a single runner and pushed to GHCR with an immutable `sha-<7chars>` tag:
+
+| Image                | Source                                      |
+|----------------------|---------------------------------------------|
+| `base`               | `backend/Dockerfile.base`                   |
+| `backend`            | `backend/Dockerfile`                        |
+| `coordinator`        | `backend/workers/Dockerfile.coordinator`    |
+| `k8s-worker`         | `backend/workers/Dockerfile.k8s_worker`     |
+| `pod-monitor`        | `backend/workers/Dockerfile.pod_monitor`    |
+| `result-processor`   | `backend/workers/Dockerfile.result_processor` |
+| `saga-orchestrator`  | `backend/workers/Dockerfile.saga_orchestrator` |
+| `event-replay`       | `backend/workers/Dockerfile.event_replay`   |
+| `dlq-processor`      | `backend/workers/Dockerfile.dlq_processor`  |
+| `cert-generator`     | `cert-generator/Dockerfile`                 |
+| `zookeeper-certgen`  | `backend/zookeeper/Dockerfile.certgen`      |
+| `frontend-dev`       | `frontend/Dockerfile`                       |
+| `frontend`           | `frontend/Dockerfile.prod`                  |
+
+The base image is cached separately as a zstd-compressed tarball since its dependencies rarely change. Worker images
+depend on it via `--build-context base=docker-image://integr8scode-base:latest`. Utility and frontend images use GHA
+layer caching.
+
+All 13 images are pushed to GHCR in parallel, with each push tracked by PID so individual failures are reported:
+
+```bash
+declare -A PIDS
+for name in base backend coordinator k8s-worker ...; do
+  docker push "$IMG/$name:$TAG" &
+  PIDS[$name]=$!
+done
+FAILED=0
+for name in "${!PIDS[@]}"; do
+  if ! wait "${PIDS[$name]}"; then
+    echo "::error::Failed to push $name"
+    FAILED=1
+  fi
+done
+[ "$FAILED" -eq 0 ] || exit 1
+```
 
-1. **Unit tests (parallel)**: Backend and frontend unit tests run simultaneously. They require no infrastructure and
-   complete quickly (~1-2 min each).
+Fork PRs skip the GHCR push (no write access), so E2E tests only run for non-fork PRs.
 
-2. **Image build**: After unit tests pass, all Docker images are built with GHA layer caching.
+### Phase 3: E2E tests
 
-3. **E2E tests (parallel)**: Backend and frontend E2E tests run in parallel on separate runners, each with its own
-   isolated stack (k3s + docker compose):
-    - Backend E2E tests (pytest with k8s)
-    - Frontend E2E tests (Playwright)
+Backend and frontend E2E tests run on separate runners. Each runner provisions its own k3s cluster and docker compose
+stack, pulling pre-built images from GHCR.
+
+#### E2E Boot (`.github/actions/e2e-boot`)
+
+This action runs three steps; the image pull runs in the background so it overlaps with the k3s install:
+
+1. **GHCR login** using `docker/login-action@v3`
+2. **Background image pull + infra pre-warm** — pulls all compose images then starts infrastructure services
+   (mongo, redis, kafka, zookeeper, schema-registry) in a background `nohup` process. The exit status is persisted
+   to `/tmp/infra-pull.exit` so the next action can check for failures.
+3. **k3s install** — downloads and installs a pinned k3s version with SHA256 checksum verification (see
+   [supply-chain hardening](#supply-chain-hardening) below)
+
+#### E2E Ready (`.github/actions/e2e-ready`)
+
+This action finalizes the environment after boot tasks complete:
+
+1. **Finalize k3s** — copies kubeconfig, rewrites the API server address to `host.docker.internal` so containers
+   inside docker compose can reach the k3s API server, creates the `integr8scode` namespace
+2. **Start cert-generator** in the background
+3. **Copy test config** — uses `config.test.toml` and `secrets.example.toml`
+4. **Wait for image pull and infra** — blocks until the background pull completes and checks the exit code from
+   `/tmp/infra-pull.exit`, failing fast if the background process had errors
+5. **Start compose stack** with `docker compose up -d --no-build`
+6. **Health checks** — waits for backend (`/api/v1/health/live`), and optionally frontend (`https://localhost:5001`)
+7. **Seed test users** via `scripts/seed_users.py`
+
+#### Frontend E2E sharding
+
+Frontend E2E tests use Playwright with 2 shards running in parallel on separate runners. Between `e2e-boot` and
+`e2e-ready`, each shard installs Node.js dependencies and Playwright browsers (with caching), overlapping that work
+with k3s booting in the background.
+
+```
+e2e-boot (GHCR login + pull + k3s install)
+    |
+    ├── npm ci + playwright install (overlapped with k3s)
+    |
+e2e-ready (finalize k3s + start stack + health check)
+    |
+    └── npx playwright test --shard=N/2
+```
 
 ### Coverage reporting
 
-Each test suite reports coverage to [Codecov](https://codecov.io/):
-- `backend-unit` flag for unit tests
-- `backend-e2e` flag for E2E tests
-- `frontend-unit` flag for frontend unit tests
+Each test suite reports coverage to [Codecov](https://codecov.io/) with separate flags:
+
+- `backend-unit` — backend unit tests
+- `backend-e2e` — backend E2E tests
+- `frontend-unit` — frontend unit tests (Vitest with `lcov` output)
+
+### Log collection on failure
+
+When E2E tests fail, logs are collected automatically and uploaded as artifacts:
+
+- All docker compose service logs with timestamps
+- Individual service logs for each worker
+- Kubernetes events sorted by timestamp (backend E2E only)
+
+## Docker Scan & Promote
+
+This workflow implements the promotion model: the `latest` tag is never set during the build. Only this workflow
+sets it, and only after all tests and the image security scan pass.
+
+```mermaid
+graph LR
+    ST["Stack Tests<br/>(main, success)"] -->|workflow_run trigger| Scan
+    Scan["Trivy Scan<br/>(12 images in parallel)"] --> Promote["crane copy<br/>sha-xxx → latest"]
+    Promote --> Summary["Step Summary"]
+```
+
+### Trigger
+
+Runs automatically when `Stack Tests` completes successfully on `main`. Can also be triggered manually via
+`workflow_dispatch` with an optional SHA input to promote a specific commit.
+
+### Scan
+
+Uses [Trivy](https://trivy.dev/) (pinned at `v0.68.2`) to scan all 12 deployed images in parallel via matrix strategy.
+It reports `CRITICAL` and `HIGH` severity vulnerabilities, ignoring those without an available fix. Results are uploaded
+as SARIF files to GitHub's Security tab.
+
+### Promote
+
+Uses [crane](https://github.com/google/go-containerregistry/blob/main/cmd/crane/README.md) to copy manifests at the
+registry level (`crane copy sha-tag latest`), avoiding any rebuild or re-push. This is a fast, atomic operation that
+simply re-tags existing image manifests.
+
+## SBOM & Supply Chain Security
+
+The `sbom-compliance.yml` workflow generates [SPDX](https://spdx.dev/) Software Bills of Materials for both backend
+(Python) and frontend (JavaScript) components. It runs on every push/PR to `main` and weekly on a schedule.
+
+For each component:
+
+1. **Generate SBOM** using [anchore/sbom-action](https://github.com/anchore/sbom-action) — produces an SPDX JSON file
+   listing all direct and transitive dependencies
+2. **Scan SBOM** using [anchore/scan-action](https://github.com/anchore/scan-action) (Grype) — checks for known
+   vulnerabilities with a `high` severity cutoff
+3. **Upload** — SBOM artifacts are retained for 5 days; vulnerability results are uploaded as SARIF to GitHub's
+   Security tab
+
+## Supply-chain hardening
+
+### k3s version pinning and checksum verification
+
+The k3s installation in CI is hardened against supply-chain attacks:
+
+1. **Pinned version** — `K3S_VERSION` is set as a workflow-level env var (`v1.32.11+k3s1`), not fetched dynamically
+2. **Source pinning** — the install script is fetched from the k3s GitHub repository at the exact tagged version
+   (e.g., `https://raw.githubusercontent.com/k3s-io/k3s/v1.32.11%2Bk3s1/install.sh`), not from the `get.k3s.io` CDN
+3. **SHA256 verification** — the install script is verified against a known checksum before execution:
+
+```bash
+K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
+curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
+echo "$K3S_INSTALL_SHA256  /tmp/k3s-install.sh" | sha256sum -c -
+chmod +x /tmp/k3s-install.sh
+INSTALL_K3S_VERSION="$K3S_VERSION" ... /tmp/k3s-install.sh
+```
+
+This avoids the common `curl | sh` anti-pattern, where a compromised CDN or MITM could inject malicious code.
+
+### GHCR image tags
+
+Images are tagged with `sha-<7chars>` (immutable, tied to a specific commit) during build. The `latest` tag is only
+applied by the Docker Scan & Promote workflow after all tests and security scans pass. This means:
+
+- Every E2E test runs against exactly the images built from that commit
+- `latest` is never stale or untested
+- Any commit's images can be pulled by their SHA tag for debugging
+
+### Dependency pinning
+
+GitHub Actions are pinned to major versions (e.g., `actions/checkout@v6`, `docker/build-push-action@v6`). The Trivy
+action is instead pinned to an exact release (`aquasecurity/trivy-action@0.33.1`) for scan reproducibility.
+
+## Linting and type checking
+
+Three lightweight workflows run independently since they catch obvious issues quickly.
+
+**Backend (Python):**
+
+- [Ruff](https://docs.astral.sh/ruff/) checks for style violations, import ordering, and common bugs
+- [mypy](https://mypy.readthedocs.io/) with strict settings catches type mismatches and missing return types
+
+**Frontend (TypeScript/Svelte):**
+
+- ESLint checks for code quality issues
+- `svelte-check` verifies TypeScript types and Svelte component correctness
+
+Backend and frontend checks both use dependency caching ([uv](https://docs.astral.sh/uv/) for Python, npm for Node.js) to skip reinstallation
+when lockfiles haven't changed.
+
+## Security scanning
+
+The `security.yml` workflow uses [Bandit](https://bandit.readthedocs.io/) to perform static analysis on Python source
+files, flagging issues like hardcoded credentials, SQL injection patterns, and unsafe deserialization. It excludes the
+test directory and reports only medium-severity and above findings. Container-level vulnerability scanning with Trivy
+runs as part of the [Docker Scan & Promote](#docker-scan--promote) workflow.
 
 ## Documentation
 
@@ -189,6 +334,47 @@ the [Material theme](https://squidfunk.github.io/mkdocs-material/). It triggers
 
 On pushes to main, the workflow deploys the built site to GitHub Pages.
 
+## Build optimizations
+
+### Docker layer caching
+
+All image builds use [docker/build-push-action](https://github.com/docker/build-push-action) with GitHub Actions
+cache. Each service has its own cache scope, preventing pollution between unrelated builds:
+
+```yaml
+- name: Build cert-generator image
+  uses: docker/build-push-action@v6
+  with:
+    context: ./cert-generator
+    file: ./cert-generator/Dockerfile
+    load: true
+    tags: integr8scode-cert-generator:latest
+    cache-from: type=gha,scope=cert-generator
+    cache-to: type=gha,mode=max,scope=cert-generator
+```
+
+### Base image caching
+
+The base image (Python + all pip dependencies) changes infrequently, so it's cached as a zstd-compressed tarball keyed
+on `Dockerfile.base`, `pyproject.toml`, and `uv.lock`. On cache hit the image is loaded directly with `docker load`,
+skipping the entire build.
+
+### Background infra pre-warm
+
+The `e2e-boot` action pulls all docker compose images and starts infrastructure services _in the background_ while k3s
+installs. This overlaps the network-bound image pull with the k3s download and installation, saving several minutes per
+E2E job.
+
+### Frontend Playwright caching
+
+Playwright browsers are cached by `package-lock.json` hash. On cache hit, only system dependencies are installed
+(`playwright install-deps chromium`), skipping the browser download.
+
+### Parallel image push
+
+All 13 images are pushed to GHCR concurrently using background processes with PID tracking. Each push failure is
+reported individually via `::error::` annotations.
+
 ## Running locally
 
 You can run most checks locally before pushing.
@@ -197,10 +383,10 @@ You can run most checks locally before pushing.
 cd backend
 
 # Linting
-uv run ruff check .
+uv run ruff check . --config pyproject.toml
 
 # Type checking
-uv run mypy .
+uv run mypy --config-file pyproject.toml --strict .
 
 # Security scan
 uv tool run bandit -r . -x tests/ -ll
@@ -216,76 +402,23 @@ cd frontend
 npm run lint
 
 # Type checking
-npx tsc --noEmit
+npm run check
 
 # Unit tests
 npm run test
 ```
 
-For E2E tests, use the same deployment as CI:
+For E2E tests, use the deployment script to bring up the full stack:
 
 ```bash
-# Start full stack (requires k8s configured locally)
-./deploy.sh dev
+# Start full stack (requires k8s to be configured locally)
+./deploy.sh dev --wait
 
-# Run tests inside the running backend container
+# Run backend E2E tests inside the running container
 docker compose exec -T backend uv run pytest tests/e2e -v
 
 # Run frontend E2E tests
 cd frontend && npx playwright test
 ```
 
-Or use `./deploy.sh test` which handles everything automatically.
-
-## Build optimizations
-
-The CI pipeline employs several caching strategies to minimize build times.
-
-### Docker layer caching
-
-All image builds use [docker/build-push-action](https://github.com/docker/build-push-action) with GitHub Actions cache:
-
-```yaml
-- name: Build base image
-  uses: docker/build-push-action@v6
-  with:
-    context: ./backend
-    file: ./backend/Dockerfile.base
-    load: true
-    tags: integr8scode-base:latest
-    cache-from: type=gha,scope=backend-base
-    cache-to: type=gha,mode=max,scope=backend-base
-```
-
-Each service has its own cache scope (`backend-base`, `backend`, `frontend`, `cert-generator`), preventing cache
-pollution between unrelated builds.
-
-### Infrastructure image caching
-
-A reusable action at `.github/actions/docker-cache` handles infrastructure images (MongoDB, Redis, Kafka, Schema
-Registry). It stores pulled images as zstd-compressed tarballs in the GitHub Actions cache, saving ~30 seconds per run
-and avoiding Docker Hub rate limits.
-
-### k3s setup action
-
-A reusable composite action at `.github/actions/k3s-setup` handles Kubernetes setup:
-- Installs k3s with traefik disabled
-- Creates the `integr8scode` namespace
-- Generates a kubeconfig accessible from Docker containers (via `host.docker.internal`)
-
-This eliminates copy-paste across workflows and ensures consistent k8s setup.
-
-## Workflow files
-
-| Workflow           | File                                 | Purpose                            |
-|--------------------|--------------------------------------|------------------------------------|
-| Ruff Linting       | `.github/workflows/ruff.yml`         | Python code style and import checks |
-| MyPy Type Checking | `.github/workflows/mypy.yml`         | Python static type analysis        |
-| Frontend CI        | `.github/workflows/frontend-ci.yml`  | TypeScript lint and type check     |
-| Security Scanning  | `.github/workflows/security.yml`     | Bandit SAST                        |
-| Docker Build & Scan| `.github/workflows/docker.yml`       | Image build and Trivy scan         |
-| Stack Tests        | `.github/workflows/stack-tests.yml`  | All unit and E2E tests               |
-| Documentation      | `.github/workflows/docs.yml`         | MkDocs build and deploy            |
-
-All workflows use [uv](https://docs.astral.sh/uv/) for Python dependency management and npm for Node.js, with caching
-enabled for both.
+Or use `./deploy.sh test` which handles stack setup, testing, and teardown automatically.

From 595c2238c17adbd3d2cfcdf2354dbaa7ff2737ba Mon Sep 17 00:00:00 2001
From: HardMax71 <maxymazatyan@gmail.com>
Date: Sun, 1 Feb 2026 00:54:40 +0100
Subject: [PATCH 17/17] clarified 12/13 images in docs

---
 docs/operations/cicd.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/operations/cicd.md b/docs/operations/cicd.md
index b19f45f9..0db2fee6 100644
--- a/docs/operations/cicd.md
+++ b/docs/operations/cicd.md
@@ -138,6 +138,10 @@ All 13 images are built on a single runner and pushed to GHCR with an immutable
 | `frontend-dev`       | `frontend/Dockerfile`                       |
 | `frontend`           | `frontend/Dockerfile.prod`                  |
 
+Of these 13 images, 12 are scanned by Trivy and promoted to `latest` in the
+[Docker Scan & Promote](#docker-scan--promote) workflow. The `frontend-dev` image is excluded — it's the Vite dev
+server build used only for E2E tests in CI and is never deployed to production.
+
 The base image is cached separately as a zstd-compressed tarball since its dependencies rarely change. Worker images
 depend on it via `--build-context base=docker-image://integr8scode-base:latest`. Utility and frontend images use GHA
 layer caching.