diff --git a/.github/actions/docker-cache/action.yml b/.github/actions/docker-cache/action.yml
deleted file mode 100644
index 253885e2..00000000
--- a/.github/actions/docker-cache/action.yml
+++ /dev/null
@@ -1,64 +0,0 @@
-name: 'Docker Image Cache'
-description: 'Cache and load Docker images for CI jobs'
-
-inputs:
-  images:
-    description: 'Space-separated list of Docker images to cache'
-    required: true
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Generate cache key from images
-      id: cache-key
-      shell: bash
-      env:
-        IMAGES_INPUT: ${{ inputs.images }}
-      run: |
-        # Create a stable hash from the sorted image list
-        # Using env var to prevent script injection
-        IMAGES_HASH=$(echo "$IMAGES_INPUT" | tr ' ' '\n' | sort | md5sum | cut -d' ' -f1)
-        echo "key=docker-${{ runner.os }}-${IMAGES_HASH}" >> $GITHUB_OUTPUT
-
-    - name: Cache Docker images
-      uses: actions/cache@v5
-      id: docker-cache
-      with:
-        path: /tmp/docker-cache
-        key: ${{ steps.cache-key.outputs.key }}
-
-    - name: Load cached Docker images
-      if: steps.docker-cache.outputs.cache-hit == 'true'
-      shell: bash
-      run: |
-        echo "Loading cached images..."
-        for f in /tmp/docker-cache/*.tar.zst; do
-          zstd -d -c "$f" | docker load &
-        done
-        wait
-        docker images
-
-    - name: Pull and save Docker images
-      if: steps.docker-cache.outputs.cache-hit != 'true'
-      shell: bash
-      env:
-        IMAGES_INPUT: ${{ inputs.images }}
-      run: |
-        mkdir -p /tmp/docker-cache
-
-        echo "Pulling images in parallel..."
-        for img in $IMAGES_INPUT; do
-          docker pull "$img" &
-        done
-        wait
-
-        echo "Saving images with zstd compression..."
-        for img in $IMAGES_INPUT; do
-          # Create filename from image name (replace special chars)
-          filename=$(echo "$img" | tr '/:' '_')
-          docker save "$img" | zstd -T0 -3 > "/tmp/docker-cache/${filename}.tar.zst" &
-        done
-        wait
-
-        echo "Cache size:"
-        du -sh /tmp/docker-cache/
diff --git a/.github/actions/e2e-boot/action.yml b/.github/actions/e2e-boot/action.yml
new file mode 100644
index 00000000..01850ea1
--- /dev/null
+++ b/.github/actions/e2e-boot/action.yml
@@ -0,0 +1,61 @@
+name: 'E2E Boot'
+description: 'Kick off slow background tasks: GHCR auth, image pull + infra pre-warm, k3s install'
+
+# Read from the calling job's environment (not declared as inputs):
+#   REGISTRY            registry host for the login step (ghcr.io when unset)
+#   K3S_VERSION         k3s release whose install.sh is downloaded and run
+#   K3S_INSTALL_SHA256  expected SHA-256 of that install.sh
+# Files left behind by the background pull/pre-warm step:
+#   /tmp/infra-pull.pid   PID of the detached shell
+#   /tmp/infra-pull.log   its combined stdout/stderr
+#   /tmp/infra-pull.exit  its final status (non-zero if the pull or the compose up failed)
+
+inputs:
+  image-tag:
+    description: 'GHCR image tag (e.g., sha-abc1234)'
+    required: true
+  github-token:
+    description: 'GitHub token for GHCR authentication'
+    required: true
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Log in to GHCR
+      uses: docker/login-action@v3
+      with:
+        registry: ${{ env.REGISTRY || 'ghcr.io' }}
+        username: ${{ github.actor }}
+        password: ${{ inputs.github-token }}
+
+    - name: Pull images and pre-warm infra (background)
+      shell: bash
+      env:
+        IMAGE_TAG: ${{ inputs.image-tag }}
+      run: |
+        # IMAGE_TAG comes from this step's env, so the detached shell and both
+        # compose commands inherit it; nothing is spliced into the script text.
+        # The && chain stops at the first failure so that status reaches the exit file.
+        nohup bash -c '
+          docker compose pull --quiet 2>&1 &&
+          echo "--- pull done, starting infra ---" &&
+          docker compose up -d --no-build \
+            mongo redis shared-ca zookeeper-certgen zookeeper kafka schema-registry 2>&1
+          echo $? > /tmp/infra-pull.exit
+        ' > /tmp/infra-pull.log 2>&1 &
+        echo $! > /tmp/infra-pull.pid
+
+    - name: Install k3s
+      shell: bash
+      run: |
+        # Stop early with a clear message when the caller did not provide the pins.
+        : "${K3S_VERSION:?K3S_VERSION must be set by the calling workflow}"
+        : "${K3S_INSTALL_SHA256:?K3S_INSTALL_SHA256 must be set by the calling workflow}"
+        # Percent-encode "+" in the version before placing it in the download URL.
+        K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
+        curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
+        # Check the downloaded installer against the pinned digest before running it.
+        echo "$K3S_INSTALL_SHA256  /tmp/k3s-install.sh" | sha256sum -c -
+        chmod +x /tmp/k3s-install.sh
+        # --bind-address 0.0.0.0 and --tls-san host.docker.internal let Docker containers reach the API server.
+        INSTALL_K3S_VERSION="$K3S_VERSION" INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" /tmp/k3s-install.sh
diff --git a/.github/actions/e2e-ready/action.yml b/.github/actions/e2e-ready/action.yml
new file mode 100644
index 00000000..fb794382
--- /dev/null
+++ b/.github/actions/e2e-ready/action.yml
@@ -0,0 +1,99 @@
+name: 'E2E Ready'
+description: 'Finalize k3s, wait for infra, start compose stack, health-check, seed test users'
+
+# Meant to run after the E2E Boot action in the same job: that action installs k3s and
+# writes /tmp/infra-pull.pid, /tmp/infra-pull.log and /tmp/infra-pull.exit, which the
+# "Wait for image pull and infra" step below consumes.
+
+inputs:
+  image-tag:
+    description: 'GHCR image tag (e.g., sha-abc1234)'
+    required: true
+  wait-for-frontend:
+    description: 'Also wait for frontend health check (default: false)'
+    required: false
+    default: 'false'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Finalize k3s
+      shell: bash
+      run: |
+        # Copy the k3s admin kubeconfig to the runner user and wait for the API server.
+        mkdir -p /home/runner/.kube
+        sudo k3s kubectl config view --raw > /home/runner/.kube/config
+        sudo chmod 600 /home/runner/.kube/config
+        export KUBECONFIG=/home/runner/.kube/config
+        timeout 90 bash -c 'until kubectl cluster-info 2>/dev/null; do sleep 3; done'
+        kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
+        # Container-facing copy: rewrite the loopback/0.0.0.0 server URL to host.docker.internal.
+        sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \
+          /home/runner/.kube/config > backend/kubeconfig.yaml
+        chmod 644 backend/kubeconfig.yaml
+
+    - name: Start cert-generator (background)
+      shell: bash
+      env:
+        IMAGE_TAG: ${{ inputs.image-tag }}
+      run: |
+        # Fire-and-forget; the output is printed from /tmp/cert-gen.log by the wait step.
+        nohup docker compose up -d --no-build cert-generator \
+          > /tmp/cert-gen.log 2>&1 &
+
+    - name: Use test environment config
+      shell: bash
+      run: |
+        cp backend/config.test.toml backend/config.toml
+        cp backend/secrets.example.toml backend/secrets.toml
+
+    - name: Wait for image pull and infra
+      shell: bash
+      run: |
+        # A PID file means a background pre-warm was started, so an exit status is expected.
+        PREWARM_STARTED=false
+        if [ -f /tmp/infra-pull.pid ]; then
+          PREWARM_STARTED=true
+          PID=$(cat /tmp/infra-pull.pid)
+          if kill -0 "$PID" 2>/dev/null; then
+            echo "Waiting for image pull + infra startup..."
+            tail --pid="$PID" -f /dev/null 2>/dev/null || true
+          fi
+        fi
+        cat /tmp/infra-pull.log 2>/dev/null || true
+        cat /tmp/cert-gen.log 2>/dev/null || true
+        if [ -f /tmp/infra-pull.exit ]; then
+          EXIT_CODE=$(cat /tmp/infra-pull.exit)
+          if [ "$EXIT_CODE" != "0" ]; then
+            echo "::error::Background image pull / infra pre-warm failed (exit $EXIT_CODE)"
+            exit 1
+          fi
+        elif [ "$PREWARM_STARTED" = "true" ]; then
+          # The background shell ended (or was killed) before recording its status.
+          echo "::error::Background image pull / infra pre-warm ended without writing /tmp/infra-pull.exit"
+          exit 1
+        fi
+
+    - name: Start stack
+      shell: bash
+      env:
+        IMAGE_TAG: ${{ inputs.image-tag }}
+      run: docker compose up -d --no-build
+
+    - name: Wait for services
+      shell: bash
+      env:
+        WAIT_FOR_FRONTEND: ${{ inputs.wait-for-frontend }}
+      run: |
+        echo "Waiting for backend health..."
+        timeout 120 bash -c 'until curl -ksf https://localhost/api/v1/health/live 2>/dev/null; do sleep 2; done'
+        echo "Backend ready"
+        if [ "$WAIT_FOR_FRONTEND" = "true" ]; then
+          echo "Waiting for frontend health..."
+          timeout 60 bash -c 'until curl -ksf https://localhost:5001 2>/dev/null; do sleep 2; done'
+          echo "Frontend ready"
+        fi
+
+    - name: Seed test users
+      shell: bash
+      run: docker compose exec -T backend uv run python scripts/seed_users.py
diff --git a/.github/actions/k3s-setup/action.yml b/.github/actions/k3s-setup/action.yml
deleted file mode 100644
index d21c4a43..00000000
--- a/.github/actions/k3s-setup/action.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: 'K3s Setup'
-description: 'Install k3s and create kubeconfig for Docker containers'
-
-inputs:
-  namespace:
-    description: 'Kubernetes namespace to create'
-    required: false
-    default: 'integr8scode'
-  kubeconfig-path:
-    description: 'Path to write the Docker-accessible kubeconfig'
-    required: false
-    default: 'backend/kubeconfig.yaml'
-
-outputs:
-  kubeconfig:
-    description: 'Path to the kubeconfig file for Docker containers'
-    value: ${{ inputs.kubeconfig-path }}
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Install k3s
-      shell: bash
-      run: |
-        # --bind-address 0.0.0.0: Listen on all interfaces so Docker containers can reach it
-        # --tls-san host.docker.internal: Include in cert SANs for Docker container access
-        curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh -
-        mkdir -p /home/runner/.kube
-        sudo k3s kubectl config view --raw > /home/runner/.kube/config
-        sudo chmod 600 /home/runner/.kube/config
-
-    - name: Wait for k3s to be ready
-      shell: bash
-      run: |
-        export KUBECONFIG=/home/runner/.kube/config
-        timeout 90 bash -c 'until kubectl cluster-info; do sleep 5; done'
-
-    - name: Create namespace
-      shell: bash
-      env:
-        NAMESPACE: ${{ inputs.namespace }}
-      run: |
-        export KUBECONFIG=/home/runner/.kube/config
-        kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
-
-    - name: Create kubeconfig for Docker containers
-      shell: bash
-      env:
-        KUBECONFIG_PATH: ${{ inputs.kubeconfig-path }}
-      run: |
-        # Replace localhost/0.0.0.0 with host.docker.internal for container access
-        # (k3s may use 0.0.0.0 when started with --bind-address 0.0.0.0)
-        sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \
-          /home/runner/.kube/config > "$KUBECONFIG_PATH"
-        chmod 644 "$KUBECONFIG_PATH"
-        echo "Kubeconfig written to $KUBECONFIG_PATH"
-        echo "Server URL: $(grep server "$KUBECONFIG_PATH" | head -1)"
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 2cdd4f40..9778e7a3 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -1,215 +1,70 @@
-name: Docker Build, Scan & Publish
+name: Docker Scan & Promote
 
+# Runs after Stack Tests completes on main — promotes sha-xxx → latest.
+# "latest" is NEVER set during build. Only this workflow sets it: automatically once Stack Tests
+# succeeds (a failed run leaves latest unchanged), or by manual dispatch, which skips the test gate.
 on:
-  push:
-    branches: [ main ]
-    tags: [ 'v*' ]
-  pull_request:
-    branches: [ main ]
+  workflow_run:
+    workflows: ["Stack Tests"]
+    types: [completed]
   workflow_dispatch:
+    inputs:
+      sha:
+        description: 'Full commit SHA to promote (defaults to latest main)'
+        required: false
 
 env:
   REGISTRY: ghcr.io
 
 jobs:
-  build-base:
-    name: Build Base
+  scan:
+    name: Scan ${{ matrix.image }}
+    # Require a push-triggered Stack Tests run: a pull_request run whose head branch is named "main" (e.g. from a fork) also has head_branch == 'main'.
+    if: >
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'push' && github.event.workflow_run.head_branch == 'main')
     runs-on: ubuntu-latest
     permissions:
       contents: read
-      packages: write
-
-    outputs:
-      image-tag: ${{ steps.image-tag.outputs.tag }}
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Set lowercase image prefix
-        run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/base
-          tags: |
-            type=ref,event=branch
-            type=ref,event=pr
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=sha,prefix=sha-
-            type=raw,value=latest,enable={{is_default_branch}}
-
-      - name: Determine image tag for dependent builds
-        id: image-tag
-        run: |
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            echo "tag=pr-${{ github.event.number }}" >> $GITHUB_OUTPUT
-          else
-            echo "tag=latest" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Build and push
-        uses: docker/build-push-action@v6
-        with:
-          context: ./backend
-          file: ./backend/Dockerfile.base
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,scope=base
-          cache-to: type=gha,mode=max,scope=base
-
-  build-backend:
-    name: Build Backend
-    needs: build-base
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-
-    outputs:
-      image-ref: ${{ steps.image-ref.outputs.ref }}
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Set lowercase image prefix
-        run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend
-          tags: |
-            type=ref,event=branch
-            type=ref,event=pr
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=sha,prefix=sha-
-            type=raw,value=latest,enable={{is_default_branch}}
-
-      - name: Set image reference for scan
-        id: image-ref
-        run: |
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend:pr-${{ github.event.number }}" >> $GITHUB_OUTPUT
-          else
-            echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend:latest" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Build and push
-        uses: docker/build-push-action@v6
-        with:
-          context: ./backend
-          file: ./backend/Dockerfile
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,scope=backend
-          cache-to: type=gha,mode=max,scope=backend
-          build-contexts: |
-            base=docker-image://${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/base:${{ needs.build-base.outputs.image-tag }}
-
-  build-frontend:
-    name: Build Frontend
-    needs: build-base
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-
-    outputs:
-      image-ref: ${{ steps.image-ref.outputs.ref }}
-
+      security-events: write
+      packages: read
+    strategy:
+      fail-fast: false
+      matrix:
+        image:
+          - base
+          - backend
+          - frontend
+          - coordinator
+          - k8s-worker
+          - pod-monitor
+          - result-processor
+          - saga-orchestrator
+          - event-replay
+          - dlq-processor
+          - cert-generator
+          - zookeeper-certgen
     steps:
       - uses: actions/checkout@v6
 
-      - name: Set lowercase image prefix
-        run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend
-          tags: |
-            type=ref,event=branch
-            type=ref,event=pr
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=sha,prefix=sha-
-            type=raw,value=latest,enable={{is_default_branch}}
-
-      - name: Set image reference for scan
-        id: image-ref
+      - name: Compute image ref
+        id: ref
         run: |
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend:pr-${{ github.event.number }}" >> $GITHUB_OUTPUT
+          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            SHA="${{ github.event.inputs.sha || github.sha }}"
           else
-            echo "ref=${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend:latest" >> $GITHUB_OUTPUT
+            SHA="${{ github.event.workflow_run.head_sha }}"
           fi
-
-      - name: Build and push
-        uses: docker/build-push-action@v6
-        with:
-          context: ./frontend
-          file: ./frontend/Dockerfile.prod
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,scope=frontend
-          cache-to: type=gha,mode=max,scope=frontend
-
-  scan-backend:
-    name: Scan Backend
-    needs: build-backend
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      security-events: write
-
-    steps:
-      - uses: actions/checkout@v6
+          TAG="sha-${SHA::7}"
+          echo "image=${{ env.REGISTRY }}/$PREFIX/${{ matrix.image }}:$TAG" >> $GITHUB_OUTPUT
 
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@0.33.1
         with:
-          image-ref: ${{ needs.build-backend.outputs.image-ref }}
+          image-ref: ${{ steps.ref.outputs.image }}
           format: 'sarif'
-          output: 'trivy-backend-results.sarif'
+          output: 'trivy-${{ matrix.image }}-results.sarif'
           ignore-unfixed: true
           severity: 'CRITICAL,HIGH'
           timeout: '5m0s'
@@ -220,56 +75,85 @@ jobs:
         if: always()
         uses: github/codeql-action/upload-sarif@v4
         with:
-          sarif_file: 'trivy-backend-results.sarif'
-          category: 'trivy-backend'
-
-  scan-frontend:
-    name: Scan Frontend
-    needs: build-frontend
+          sarif_file: 'trivy-${{ matrix.image }}-results.sarif'
+          category: 'trivy-${{ matrix.image }}'
+
+  # Promote SHA tag → latest using crane (registry-level manifest copy, no rebuild)
+  promote:
+    name: Promote to Latest
+    needs: [scan]
+    # Require a push-triggered Stack Tests run: a pull_request run whose head branch is named "main" (e.g. from a fork) also has head_branch == 'main'.
+    if: >
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'push' && github.event.workflow_run.head_branch == 'main')
     runs-on: ubuntu-latest
     permissions:
-      contents: read
-      security-events: write
-
+      packages: write
     steps:
-      - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@0.33.1
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
         with:
-          image-ref: ${{ needs.build-frontend.outputs.image-ref }}
-          format: 'sarif'
-          output: 'trivy-frontend-results.sarif'
-          ignore-unfixed: true
-          severity: 'CRITICAL,HIGH'
-          timeout: '5m0s'
-          version: 'v0.68.2'
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Upload Trivy scan results
-        if: always()
-        uses: github/codeql-action/upload-sarif@v4
-        with:
-          sarif_file: 'trivy-frontend-results.sarif'
-          category: 'trivy-frontend'
+      - name: Install crane
+        uses: imjasonh/setup-crane@v0.4
+
+      - name: Promote images (SHA → latest)
+        run: |
+          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            SHA="${{ github.event.inputs.sha || github.sha }}"
+          else
+            SHA="${{ github.event.workflow_run.head_sha }}"
+          fi
+          TAG="sha-${SHA::7}"
+
+          echo "Promoting tag: $TAG → latest"
+          echo ""
+
+          crane copy "$REGISTRY/$PREFIX/base:$TAG" "$REGISTRY/$PREFIX/base:latest"
+          crane copy "$REGISTRY/$PREFIX/backend:$TAG" "$REGISTRY/$PREFIX/backend:latest"
+          crane copy "$REGISTRY/$PREFIX/frontend:$TAG" "$REGISTRY/$PREFIX/frontend:latest"
+          crane copy "$REGISTRY/$PREFIX/coordinator:$TAG" "$REGISTRY/$PREFIX/coordinator:latest"
+          crane copy "$REGISTRY/$PREFIX/k8s-worker:$TAG" "$REGISTRY/$PREFIX/k8s-worker:latest"
+          crane copy "$REGISTRY/$PREFIX/pod-monitor:$TAG" "$REGISTRY/$PREFIX/pod-monitor:latest"
+          crane copy "$REGISTRY/$PREFIX/result-processor:$TAG" "$REGISTRY/$PREFIX/result-processor:latest"
+          crane copy "$REGISTRY/$PREFIX/saga-orchestrator:$TAG" "$REGISTRY/$PREFIX/saga-orchestrator:latest"
+          crane copy "$REGISTRY/$PREFIX/event-replay:$TAG" "$REGISTRY/$PREFIX/event-replay:latest"
+          crane copy "$REGISTRY/$PREFIX/dlq-processor:$TAG" "$REGISTRY/$PREFIX/dlq-processor:latest"
+          crane copy "$REGISTRY/$PREFIX/cert-generator:$TAG" "$REGISTRY/$PREFIX/cert-generator:latest"
+          crane copy "$REGISTRY/$PREFIX/zookeeper-certgen:$TAG" "$REGISTRY/$PREFIX/zookeeper-certgen:latest"
 
   summary:
     name: Summary
-    if: github.event_name != 'pull_request'
-    needs: [build-base, build-backend, build-frontend, scan-backend, scan-frontend]
+    needs: [promote]
     runs-on: ubuntu-latest
-
     steps:
-      - name: Set lowercase image prefix
-        run: echo "IMAGE_PREFIX=${GITHUB_REPOSITORY_OWNER,,}/integr8scode" >> $GITHUB_ENV
-
       - name: Generate summary
         run: |
-          echo "## Docker Images Published" >> $GITHUB_STEP_SUMMARY
+          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            SHA="${{ github.event.inputs.sha || github.sha }}"
+          else
+            SHA="${{ github.event.workflow_run.head_sha }}"
+          fi
+          TAG="sha-${SHA::7}"
+
+          echo "## Docker Images Promoted to Latest" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
+            echo "Images promoted manually from \`$TAG\` to \`latest\` — Stack Tests may not have run." >> $GITHUB_STEP_SUMMARY
+          else
+            echo "All Stack Tests passed. Images promoted from \`$TAG\` to \`latest\`." >> $GITHUB_STEP_SUMMARY
+          fi
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "| Image | Pull Command |" >> $GITHUB_STEP_SUMMARY
           echo "|-------|--------------|" >> $GITHUB_STEP_SUMMARY
-          echo "| Base | \`docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/base:latest\` |" >> $GITHUB_STEP_SUMMARY
-          echo "| Backend | \`docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/backend:latest\` |" >> $GITHUB_STEP_SUMMARY
-          echo "| Frontend | \`docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/frontend:latest\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| Base | \`docker pull $REGISTRY/$PREFIX/base:latest\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| Backend | \`docker pull $REGISTRY/$PREFIX/backend:latest\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| Frontend | \`docker pull $REGISTRY/$PREFIX/frontend:latest\` |" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
-          echo "### Scan Results" >> $GITHUB_STEP_SUMMARY
-          echo "- Backend scan: ✅ Passed" >> $GITHUB_STEP_SUMMARY
-          echo "- Frontend scan: ✅ Passed" >> $GITHUB_STEP_SUMMARY
+          echo "### Security Scans" >> $GITHUB_STEP_SUMMARY
+          echo "All 12 images scanned with Trivy (CRITICAL + HIGH, unfixed ignored)." >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/frontend-ci.yml b/.github/workflows/frontend-ci.yml
index fe29a033..e6303aa1 100644
--- a/.github/workflows/frontend-ci.yml
+++ b/.github/workflows/frontend-ci.yml
@@ -2,12 +2,12 @@ name: Frontend CI
 
 on:
   push:
-    branches: [main, dev]
+    branches: [main]
     paths:
       - 'frontend/**'
       - '.github/workflows/frontend-ci.yml'
   pull_request:
-    branches: [main, dev]
+    branches: [main]
     paths:
       - 'frontend/**'
       - '.github/workflows/frontend-ci.yml'
diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
index d4752b08..34070e65 100644
--- a/.github/workflows/mypy.yml
+++ b/.github/workflows/mypy.yml
@@ -2,9 +2,9 @@ name: MyPy Type Checking
 
 on:
   push:
-    branches: [ main, dev ]
+    branches: [ main ]
   pull_request:
-    branches: [ main, dev ]
+    branches: [ main ]
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 3ddec835..c670ce34 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -2,9 +2,9 @@ name: Ruff Linting
 
 on:
   push:
-    branches: [ main, dev ]
+    branches: [ main ]
   pull_request:
-    branches: [ main, dev ]
+    branches: [ main ]
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index 4452c432..10837590 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -2,9 +2,9 @@ name: Security Scanning
 
 on:
   push:
-    branches: [ main, dev ]
+    branches: [ main ]
   pull_request:
-    branches: [ main, dev ]
+    branches: [ main ]
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
index c2804f73..373565ab 100644
--- a/.github/workflows/stack-tests.yml
+++ b/.github/workflows/stack-tests.yml
@@ -2,19 +2,22 @@ name: Stack Tests
 
 on:
   push:
-    branches: [main, dev]
+    branches: [main]
+    tags: ['v*']
     paths:
       - 'backend/**'
       - 'frontend/**'
+      - 'cert-generator/**'
       - 'docker-compose.yaml'
       - 'deploy.sh'
       - '.github/workflows/stack-tests.yml'
       - '.github/actions/**'
   pull_request:
-    branches: [main, dev]
+    branches: [main]
     paths:
       - 'backend/**'
       - 'frontend/**'
+      - 'cert-generator/**'
       - 'docker-compose.yaml'
       - 'deploy.sh'
       - '.github/workflows/stack-tests.yml'
@@ -22,11 +25,14 @@ on:
   workflow_dispatch:
 
 env:
+  REGISTRY: ghcr.io
   MONGO_IMAGE: mongo:8.0
   REDIS_IMAGE: redis:7-alpine
   KAFKA_IMAGE: confluentinc/cp-kafka:7.8.2
   ZOOKEEPER_IMAGE: confluentinc/cp-zookeeper:7.8.2
   SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.8.2
+  K3S_VERSION: v1.32.11+k3s1
+  K3S_INSTALL_SHA256: d75e014f2d2ab5d30a318efa5c326f3b0b7596f194afcff90fa7a7a91166d5f7
 
 jobs:
   # Fast unit tests (no infrastructure needed)
@@ -102,17 +108,41 @@ jobs:
           fail_ci_if_error: false
           verbose: true
 
-  # Build all images once, cache for test jobs
+  # Build all images, push to GHCR with immutable SHA tag.
+  # Fork PRs skip GHCR push (no write access) — E2E tests require pushed images.
   build-images:
-    name: Build Images
+    name: Build & Push Images
     needs: [backend-unit, frontend-unit]
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      sha-tag: ${{ steps.tags.outputs.sha-tag }}
+      image-prefix: ${{ steps.tags.outputs.image-prefix }}
     steps:
       - uses: actions/checkout@v6
 
       - name: Setup Docker Buildx
         uses: docker/setup-buildx-action@v3
 
+      - name: Log in to GHCR
+        if: ${{ !github.event.pull_request.head.repo.fork }}
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Compute image tags
+        id: tags
+        run: |
+          PREFIX="${GITHUB_REPOSITORY_OWNER,,}/integr8scode"
+          SHA_TAG="sha-${GITHUB_SHA::7}"
+          echo "sha-tag=$SHA_TAG" >> "$GITHUB_OUTPUT"
+          echo "image-prefix=$PREFIX" >> "$GITHUB_OUTPUT"
+
+      # ── Base image (cached separately — rarely changes) ──────────────
       - name: Cache base image
         uses: actions/cache@v5
         id: base-cache
@@ -139,15 +169,19 @@ jobs:
         if: steps.base-cache.outputs.cache-hit != 'true'
         run: docker save integr8scode-base:latest | zstd -T0 -3 > /tmp/base-image.tar.zst
 
-      - name: Build all images
+      # ── Backend + workers (depend on local base image) ───────────────
+      - name: Build backend and worker images
         run: |
           docker build -t integr8scode-backend:latest --build-context base=docker-image://integr8scode-base:latest -f ./backend/Dockerfile ./backend
-          docker build -t integr8scode-coordinator:latest -f backend/workers/Dockerfile.coordinator --build-context base=docker-image://integr8scode-base:latest ./backend
-          docker build -t integr8scode-k8s-worker:latest -f backend/workers/Dockerfile.k8s_worker --build-context base=docker-image://integr8scode-base:latest ./backend
-          docker build -t integr8scode-pod-monitor:latest -f backend/workers/Dockerfile.pod_monitor --build-context base=docker-image://integr8scode-base:latest ./backend
-          docker build -t integr8scode-result-processor:latest -f backend/workers/Dockerfile.result_processor --build-context base=docker-image://integr8scode-base:latest ./backend
-          docker build -t integr8scode-saga-orchestrator:latest -f backend/workers/Dockerfile.saga_orchestrator --build-context base=docker-image://integr8scode-base:latest ./backend
-
+          docker build -t integr8scode-coordinator:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.coordinator ./backend
+          docker build -t integr8scode-k8s-worker:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.k8s_worker ./backend
+          docker build -t integr8scode-pod-monitor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.pod_monitor ./backend
+          docker build -t integr8scode-result-processor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.result_processor ./backend
+          docker build -t integr8scode-saga-orchestrator:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.saga_orchestrator ./backend
+          docker build -t integr8scode-event-replay:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.event_replay ./backend
+          docker build -t integr8scode-dlq-processor:latest --build-context base=docker-image://integr8scode-base:latest -f backend/workers/Dockerfile.dlq_processor ./backend
+
+      # ── Utility images (GHA-cached, independent of base) ────────────
       - name: Build cert-generator image
         uses: docker/build-push-action@v6
         with:
@@ -158,6 +192,17 @@ jobs:
           cache-from: type=gha,scope=cert-generator
           cache-to: type=gha,mode=max,scope=cert-generator
 
+      - name: Build zookeeper-certgen image
+        uses: docker/build-push-action@v6
+        with:
+          context: ./backend/zookeeper
+          file: ./backend/zookeeper/Dockerfile.certgen
+          load: true
+          tags: integr8scode-zookeeper-certgen:latest
+          cache-from: type=gha,scope=zookeeper-certgen
+          cache-to: type=gha,mode=max,scope=zookeeper-certgen
+
+      # ── Frontend (dev for E2E, prod for scanning/deployment) ─────────
       - name: Build frontend image
         uses: docker/build-push-action@v6
         with:
@@ -168,61 +213,77 @@ jobs:
           cache-from: type=gha,scope=frontend
           cache-to: type=gha,mode=max,scope=frontend
 
-      - name: Save all images
-        run: |
-          docker save \
-            integr8scode-backend:latest \
-            integr8scode-coordinator:latest \
-            integr8scode-k8s-worker:latest \
-            integr8scode-pod-monitor:latest \
-            integr8scode-result-processor:latest \
-            integr8scode-saga-orchestrator:latest \
-            integr8scode-cert-generator:latest \
-            integr8scode-frontend:latest \
-            | zstd -T0 -3 > /tmp/all-images.tar.zst
-
-      - name: Upload images artifact
-        uses: actions/upload-artifact@v6
+      - name: Build frontend-prod image
+        uses: docker/build-push-action@v6
         with:
-          name: docker-images
-          path: /tmp/all-images.tar.zst
-          retention-days: 1
-
-  # Parallel test jobs (backend-e2e, frontend-e2e)
+          context: ./frontend
+          file: ./frontend/Dockerfile.prod
+          load: true
+          tags: integr8scode-frontend-prod:latest
+          cache-from: type=gha,scope=frontend-prod
+          cache-to: type=gha,mode=max,scope=frontend-prod
+
+      # ── Push all images to GHCR in parallel ────────────────────────
+      - name: Push all images to GHCR
+        if: ${{ !github.event.pull_request.head.repo.fork }}
+        env:
+          TAG: ${{ steps.tags.outputs.sha-tag }}
+          IMG: ${{ env.REGISTRY }}/${{ steps.tags.outputs.image-prefix }}
+        run: |
+          # Re-tag each locally built image as <registry>/<image-prefix>/<name>:<sha-tag>
+          docker tag integr8scode-base:latest "$IMG/base:$TAG"
+          docker tag integr8scode-backend:latest "$IMG/backend:$TAG"
+          docker tag integr8scode-coordinator:latest "$IMG/coordinator:$TAG"
+          docker tag integr8scode-k8s-worker:latest "$IMG/k8s-worker:$TAG"
+          docker tag integr8scode-pod-monitor:latest "$IMG/pod-monitor:$TAG"
+          docker tag integr8scode-result-processor:latest "$IMG/result-processor:$TAG"
+          docker tag integr8scode-saga-orchestrator:latest "$IMG/saga-orchestrator:$TAG"
+          docker tag integr8scode-event-replay:latest "$IMG/event-replay:$TAG"
+          docker tag integr8scode-dlq-processor:latest "$IMG/dlq-processor:$TAG"
+          docker tag integr8scode-cert-generator:latest "$IMG/cert-generator:$TAG"
+          docker tag integr8scode-zookeeper-certgen:latest "$IMG/zookeeper-certgen:$TAG"
+          docker tag integr8scode-frontend:latest "$IMG/frontend-dev:$TAG"  # dev build is published as frontend-dev
+          docker tag integr8scode-frontend-prod:latest "$IMG/frontend:$TAG"  # prod build is published as frontend
+
+          # Push all 13 images in parallel, recording each background job's PID by image name
+          declare -A PIDS
+          for name in base backend coordinator k8s-worker pod-monitor \
+                      result-processor saga-orchestrator event-replay \
+                      dlq-processor cert-generator zookeeper-certgen \
+                      frontend-dev frontend; do
+            docker push "$IMG/$name:$TAG" &
+            PIDS[$name]=$!  # PID of the push that was just backgrounded
+          done
+
+          FAILED=0
+          for name in "${!PIDS[@]}"; do
+            if ! wait "${PIDS[$name]}"; then  # wait on each PID separately so every failed push is reported
+              echo "::error::Failed to push $name"
+              FAILED=1
+            fi
+          done
+          [ "$FAILED" -eq 0 ] || exit 1  # fail the step if any push failed
+
+  # Parallel E2E test jobs — compose pulls from GHCR using IMAGE_TAG
   backend-e2e:
     name: Backend E2E Tests
     needs: [build-images]
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: read
     steps:
       - uses: actions/checkout@v6
 
-      - name: Cache and load Docker images
-        uses: ./.github/actions/docker-cache
+      - uses: ./.github/actions/e2e-boot
         with:
-          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
+          image-tag: ${{ needs.build-images.outputs.sha-tag }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Download built images
-        uses: actions/download-artifact@v7
+      - uses: ./.github/actions/e2e-ready
         with:
-          name: docker-images
-          path: /tmp
-
-      - name: Load built images
-        run: zstd -d -c /tmp/all-images.tar.zst | docker load
-
-      - name: Setup k3s
-        uses: ./.github/actions/k3s-setup
-
-      - name: Use test environment config
-        run: |
-          cp backend/config.test.toml backend/config.toml
-          cp backend/secrets.example.toml backend/secrets.toml
-
-      - name: Start stack
-        run: ./deploy.sh dev --wait
-
-      - name: Seed test users
-        run: docker compose exec -T backend uv run python scripts/seed_users.py
+          image-tag: ${{ needs.build-images.outputs.sha-tag }}
 
       - name: Run E2E tests
         timeout-minutes: 15
@@ -253,19 +314,11 @@ jobs:
         run: |
           mkdir -p logs
           docker compose logs --timestamps > logs/docker-compose.log 2>&1
-          docker compose logs --timestamps backend > logs/backend.log 2>&1
-          docker compose logs --timestamps mongo > logs/mongo.log 2>&1 || true
-          docker compose logs --timestamps redis > logs/redis.log 2>&1 || true
-          docker compose logs --timestamps kafka > logs/kafka.log 2>&1 || true
-          docker compose logs --timestamps zookeeper > logs/zookeeper.log 2>&1 || true
-          docker compose logs --timestamps schema-registry > logs/schema-registry.log 2>&1 || true
-          docker compose logs --timestamps coordinator > logs/coordinator.log 2>&1 || true
-          docker compose logs --timestamps k8s-worker > logs/k8s-worker.log 2>&1 || true
-          docker compose logs --timestamps pod-monitor > logs/pod-monitor.log 2>&1 || true
-          docker compose logs --timestamps result-processor > logs/result-processor.log 2>&1 || true
-          docker compose logs --timestamps saga-orchestrator > logs/saga-orchestrator.log 2>&1 || true
-          docker compose logs --timestamps event-replay > logs/event-replay.log 2>&1 || true
-          docker compose logs --timestamps dlq-processor > logs/dlq-processor.log 2>&1 || true
+          for svc in backend mongo redis kafka zookeeper schema-registry \
+                     coordinator k8s-worker pod-monitor result-processor \
+                     saga-orchestrator event-replay dlq-processor; do
+            docker compose logs --timestamps "$svc" > "logs/$svc.log" 2>&1 || true
+          done
           kubectl get events --sort-by='.metadata.creationTimestamp' -A > logs/k8s-events.log 2>&1 || true
 
       - name: Upload logs
@@ -276,12 +329,28 @@ jobs:
           path: logs/
 
   frontend-e2e:
-    name: Frontend E2E Tests
+    name: Frontend E2E (${{ matrix.shardIndex }}/${{ matrix.shardTotal }})
     needs: [build-images]
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: read
+    strategy:
+      fail-fast: false
+      matrix:
+        shardIndex: [1, 2]
+        shardTotal: [2]
     steps:
       - uses: actions/checkout@v6
 
+      # Phase 1: kick off image pull + infra + k3s in background
+      - uses: ./.github/actions/e2e-boot
+        with:
+          image-tag: ${{ needs.build-images.outputs.sha-tag }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      # Phase 2: Node + Playwright setup (overlaps with k3s boot + image pull)
       - name: Setup Node.js
         uses: actions/setup-node@v6
         with:
@@ -309,57 +378,36 @@ jobs:
         working-directory: frontend
         run: npx playwright install chromium
 
-      - name: Cache and load Docker images
-        uses: ./.github/actions/docker-cache
-        with:
-          images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
-
-      - name: Download built images
-        uses: actions/download-artifact@v7
+      # Phase 3: finalize k3s + start stack (k3s has been booting since e2e-boot)
+      - uses: ./.github/actions/e2e-ready
         with:
-          name: docker-images
-          path: /tmp
-
-      - name: Load built images
-        run: zstd -d -c /tmp/all-images.tar.zst | docker load
-
-      - name: Setup k3s
-        uses: ./.github/actions/k3s-setup
-
-      - name: Use test environment config
-        run: |
-          cp backend/config.test.toml backend/config.toml
-          cp backend/secrets.example.toml backend/secrets.toml
-
-      - name: Start stack
-        run: ./deploy.sh dev --wait
-
-      - name: Seed test users
-        run: docker compose exec -T backend uv run python scripts/seed_users.py
+          image-tag: ${{ needs.build-images.outputs.sha-tag }}
+          wait-for-frontend: 'true'
 
       - name: Run Playwright tests
         timeout-minutes: 10
         working-directory: frontend
-        run: CI=true npx playwright test
+        run: CI=true npx playwright test --shard=${{ matrix.shardIndex }}/${{ matrix.shardTotal }}
 
       - name: Upload Playwright report
         uses: actions/upload-artifact@v6
         if: always()
         with:
-          name: playwright-report
+          name: playwright-report-${{ matrix.shardIndex }}
           path: frontend/playwright-report/
 
       - name: Collect logs on failure
         if: failure()
         run: |
           mkdir -p logs
-          docker compose logs > logs/docker-compose.log 2>&1
-          docker compose logs backend > logs/backend.log 2>&1
-          docker compose logs frontend > logs/frontend.log 2>&1
+          docker compose logs --timestamps > logs/docker-compose.log 2>&1
+          for svc in backend frontend; do
+            docker compose logs --timestamps "$svc" > "logs/$svc.log" 2>&1 || true
+          done
 
       - name: Upload logs
         if: failure()
         uses: actions/upload-artifact@v6
         with:
-          name: frontend-e2e-logs
+          name: frontend-e2e-logs-${{ matrix.shardIndex }}
           path: logs/
diff --git a/deploy.sh b/deploy.sh
index f25c480f..a7dc8bec 100755
--- a/deploy.sh
+++ b/deploy.sh
@@ -56,9 +56,12 @@ show_help() {
     echo ""
     echo "Commands:"
     echo "  dev [options]      Start full stack (docker-compose)"
-    echo "                     --build             Rebuild images"
+    echo "                     --build             Rebuild images locally"
+    echo "                     --no-build          Use pre-built images only (no build fallback)"
     echo "                     --wait              Wait for services to be healthy"
     echo "                     --timeout <secs>    Health check timeout (default: 300)"
+    echo "                     --observability     Include Grafana, Jaeger, etc."
+    echo "                     --debug             Include observability + Kafdrop"
     echo "  infra [options]    Start infrastructure only (mongo, redis, kafka, etc.)"
     echo "                     --wait              Wait for services to be healthy"
     echo "                     --timeout <secs>    Health check timeout (default: 120)"
@@ -97,8 +100,10 @@ cmd_dev() {
     print_header "Starting Local Development Environment"
 
     local BUILD_FLAG=""
+    local NO_BUILD_FLAG=""
     local WAIT_FLAG=""
     local WAIT_TIMEOUT="300"
+    local PROFILE_FLAGS=""
 
     while [[ $# -gt 0 ]]; do
         case "$1" in
@@ -106,6 +111,10 @@ cmd_dev() {
                 BUILD_FLAG="--build"
                 print_info "Rebuilding images..."
                 ;;
+            --no-build)
+                NO_BUILD_FLAG="--no-build"
+                print_info "Using pre-built images (skipping build)..."
+                ;;
             --wait)
                 WAIT_FLAG="--wait"
                 ;;
@@ -113,6 +122,14 @@ cmd_dev() {
                 shift
                 WAIT_TIMEOUT="$1"
                 ;;
+            --observability)
+                PROFILE_FLAGS="--profile observability"
+                print_info "Including observability stack (Grafana, Jaeger, etc.)"
+                ;;
+            --debug)
+                PROFILE_FLAGS="--profile observability --profile debug"
+                print_info "Including observability + debug tools (Kafdrop, etc.)"
+                ;;
         esac
         shift
     done
@@ -122,7 +139,7 @@ cmd_dev() {
         WAIT_TIMEOUT_FLAG="--wait-timeout $WAIT_TIMEOUT"
     fi
 
-    docker compose --profile observability up -d $BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG
+    docker compose $PROFILE_FLAGS up -d $BUILD_FLAG $NO_BUILD_FLAG $WAIT_FLAG $WAIT_TIMEOUT_FLAG
 
     echo ""
     print_success "Development environment started!"
@@ -130,9 +147,13 @@ cmd_dev() {
     echo "Services:"
     echo "  Backend:   https://localhost:443"
     echo "  Frontend:  https://localhost:5001"
-    echo "  Kafdrop:   http://localhost:9000"
-    echo "  Jaeger:    http://localhost:16686"
-    echo "  Grafana:   http://localhost:3000"
+    if [[ "$PROFILE_FLAGS" == *"debug"* ]]; then
+        echo "  Kafdrop:   http://localhost:9000"
+    fi
+    if [[ "$PROFILE_FLAGS" == *"observability"* ]]; then
+        echo "  Jaeger:    http://localhost:16686"
+        echo "  Grafana:   http://localhost:3000"
+    fi
     echo ""
     echo "Commands:"
     echo "  ./deploy.sh logs             # View all logs"
diff --git a/docker-compose.yaml b/docker-compose.yaml
index bbbb40a1..80c215c8 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,20 +1,21 @@
 services:
   # Shared base image for all Python backend services
   base:
+    image: ghcr.io/hardmax71/integr8scode/base:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: Dockerfile.base
-    image: integr8scode-base:latest
 
   shared-ca:
     image: alpine:latest
     volumes:
       - shared_ca:/shared_ca
-    command: sh -c "mkdir -p /shared_ca && chmod 777 /shared_ca && echo 'Shared CA directory ready' && sleep 2"
+    command: sh -c "mkdir -p /shared_ca && chmod 777 /shared_ca && echo 'Shared CA directory ready'"
     networks:
       - app-network
 
   cert-generator:
+    image: ghcr.io/hardmax71/integr8scode/cert-generator:${IMAGE_TAG:-latest}
     build:
       context: ./cert-generator
       dockerfile: Dockerfile
@@ -57,10 +58,10 @@ services:
         hard: 65536
     healthcheck:
       test: echo 'db.runCommand("ping").ok' | mongosh localhost/integr8scode -u ${MONGO_ROOT_USER:-root} -p ${MONGO_ROOT_PASSWORD:-rootpassword} --authenticationDatabase admin --quiet
-      interval: 5s
+      interval: 3s
       timeout: 5s
-      retries: 10
-      start_period: 10s
+      retries: 15
+      start_period: 5s
 
   redis:
     image: redis:7-alpine
@@ -74,12 +75,13 @@ services:
       - app-network
     healthcheck:
       test: ["CMD", "redis-cli", "ping"]
-      interval: 10s
-      timeout: 5s
-      retries: 5
-      start_period: 10s
+      interval: 2s
+      timeout: 3s
+      retries: 10
+      start_period: 2s
 
   backend:
+    image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: Dockerfile
@@ -120,14 +122,14 @@ services:
     extra_hosts:
       - "host.docker.internal:host-gateway"
     healthcheck:
-      # Simpler, reliable healthcheck: curl fails non-zero for HTTP >=400 with -f
       test: ["CMD-SHELL", "curl -k -f -s https://localhost:443/api/v1/health/live >/dev/null || exit 1"]
-      interval: 3s
+      interval: 2s
       timeout: 3s
-      retries: 50
-      start_period: 10s
+      retries: 30
+      start_period: 3s
 
   frontend:
+    image: ghcr.io/hardmax71/integr8scode/frontend-dev:${IMAGE_TAG:-latest}
     container_name: frontend
     build:
       context: ./frontend
@@ -136,7 +138,7 @@ services:
       cert-generator:
         condition: service_completed_successfully
       backend:
-        condition: service_healthy
+        condition: service_started
     volumes:
       - ./frontend:/app
       - /app/node_modules
@@ -151,10 +153,10 @@ services:
       - NODE_EXTRA_CA_CERTS=/shared_ca/mkcert-ca.pem
     healthcheck:
       test: ["CMD-SHELL", "curl -k -f -s https://localhost:5001 >/dev/null || exit 1"]
-      interval: 3s
+      interval: 2s
       timeout: 3s
       retries: 30
-      start_period: 10s
+      start_period: 3s
 
 
   grafana:
@@ -176,6 +178,7 @@ services:
   # Kafka Infrastructure for Event-Driven Design
   # Certificate generator for Zookeeper/Kafka SSL
   zookeeper-certgen:
+    image: ghcr.io/hardmax71/integr8scode/zookeeper-certgen:${IMAGE_TAG:-latest}
     build:
       context: ./backend/zookeeper
       dockerfile: Dockerfile.certgen
@@ -258,10 +261,10 @@ services:
         hard: 65536
     healthcheck:
       test: ["CMD-SHELL", "echo ruok | nc localhost 2181 | grep imok"]
-      interval: 5s
+      interval: 3s
       timeout: 5s
-      retries: 10
-      start_period: 10s
+      retries: 15
+      start_period: 5s
 
   kafka:
     image: confluentinc/cp-kafka:7.8.2
@@ -319,10 +322,10 @@ services:
         hard: 65536
     healthcheck:
       test: ["CMD-SHELL", "kafka-broker-api-versions --bootstrap-server localhost:9092"]
-      interval: 5s
+      interval: 3s
       timeout: 10s
-      retries: 12
-      start_period: 15s
+      retries: 15
+      start_period: 3s
 
   schema-registry:
     image: confluentinc/cp-schema-registry:7.8.2
@@ -340,14 +343,15 @@ services:
       - app-network
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8081/config"]
-      interval: 5s
+      interval: 3s
       timeout: 5s
-      retries: 10
-      start_period: 10s
+      retries: 15
+      start_period: 5s
 
   kafdrop:
     image: obsidiandynamics/kafdrop:3.31.0
     container_name: kafdrop
+    profiles: ["debug"]
     depends_on:
       - kafka
       - schema-registry
@@ -362,6 +366,7 @@ services:
 
   # Kafka topic initialization
   kafka-init:
+    image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: Dockerfile
@@ -388,6 +393,7 @@ services:
 
   # Seed default users (runs once after mongo is ready)
   user-seed:
+    image: ghcr.io/hardmax71/integr8scode/backend:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: Dockerfile
@@ -412,6 +418,7 @@ services:
 
   # Event-driven workers
   coordinator:
+    image: ghcr.io/hardmax71/integr8scode/coordinator:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.coordinator
@@ -436,6 +443,7 @@ services:
     restart: unless-stopped
 
   k8s-worker:
+    image: ghcr.io/hardmax71/integr8scode/k8s-worker:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.k8s_worker
@@ -463,6 +471,7 @@ services:
     restart: unless-stopped
 
   pod-monitor:
+    image: ghcr.io/hardmax71/integr8scode/pod-monitor:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.pod_monitor
@@ -488,6 +497,7 @@ services:
     restart: unless-stopped
 
   result-processor:
+    image: ghcr.io/hardmax71/integr8scode/result-processor:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.result_processor
@@ -515,6 +525,7 @@ services:
     restart: unless-stopped
 
   saga-orchestrator:
+    image: ghcr.io/hardmax71/integr8scode/saga-orchestrator:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.saga_orchestrator
@@ -560,6 +571,7 @@ services:
 
   # Event replay service
   event-replay:
+    image: ghcr.io/hardmax71/integr8scode/event-replay:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.event_replay
@@ -586,6 +598,7 @@ services:
 
   # DLQ Processor Service
   dlq-processor:
+    image: ghcr.io/hardmax71/integr8scode/dlq-processor:${IMAGE_TAG:-latest}
     build:
       context: ./backend
       dockerfile: workers/Dockerfile.dlq_processor
diff --git a/docs/operations/cicd.md b/docs/operations/cicd.md
index 54ff0130..0db2fee6 100644
--- a/docs/operations/cicd.md
+++ b/docs/operations/cicd.md
@@ -1,41 +1,40 @@
 # CI/CD Pipeline
 
-The project uses GitHub Actions to automate code quality checks, security scanning, testing, and documentation
-deployment. Every push to `main` or `dev` and every pull request triggers the pipeline, with workflows running in
-parallel to provide fast feedback.
+The project uses GitHub Actions to automate code quality checks, security scanning, testing, image publishing, and
+documentation deployment. The pipeline is split across several workflow files that trigger independently based on path
+filters, so only relevant checks run for each change.
 
 ## Pipeline overview
 
 ```mermaid
 graph LR
-    subgraph "Code Quality"
+    subgraph "Code Quality (lightweight)"
         Ruff["Ruff Linting"]
         MyPy["MyPy Type Check"]
-        ESLint["ESLint + TypeScript"]
+        ESLint["ESLint + Svelte Check"]
     end
 
     subgraph "Security"
         Bandit["Bandit SAST"]
+        SBOM["SBOM & Grype"]
     end
 
-    subgraph "Docker Build & Scan"
-        Base["Build Base"]
-        Backend["Build Backend"]
-        Frontend["Build Frontend"]
-        ScanBE["Scan Backend"]
-        ScanFE["Scan Frontend"]
-        Base --> Backend
-        Base --> Frontend
-        Backend --> ScanBE
-        Frontend --> ScanFE
-    end
-
-    subgraph "Testing (stack-tests.yml)"
+    subgraph "Stack Tests"
         UnitBE["Backend Unit"]
         UnitFE["Frontend Unit"]
-        Stack["Stack Tests"]
-        UnitBE --> Stack
-        UnitFE --> Stack
+        Build["Build & Push Images"]
+        E2E_BE["Backend E2E"]
+        E2E_FE["Frontend E2E"]
+        UnitBE --> Build
+        UnitFE --> Build
+        Build --> E2E_BE
+        Build --> E2E_FE
+    end
+
+    subgraph "Docker Scan & Promote"
+        Scan["Trivy Scan (12 images)"]
+        Promote["Promote SHA → latest"]
+        Scan --> Promote
     end
 
     subgraph "Documentation"
@@ -43,143 +42,293 @@ graph LR
         Pages["GitHub Pages"]
     end
 
-    Push["Push / PR"] --> Ruff
-    Push --> MyPy
-    Push --> ESLint
-    Push --> Bandit
-    Push --> Base
-    Push --> UnitBE
-    Push --> UnitFE
-    Push --> Docs
+    Push["Push / PR"] --> Ruff & MyPy & ESLint & Bandit & SBOM & UnitBE & UnitFE & Docs
+    Build -->|main, all tests pass| Scan
     Docs -->|main only| Pages
 ```
 
-All workflows trigger on pushes to `main` and `dev` branches, pull requests against those branches, and can be triggered
-manually via `workflow_dispatch`. Path filters ensure workflows only run when relevant files change.
-
-## Linting and type checking
-
-Three lightweight workflows run first since they catch obvious issues quickly.
-
-**Backend (Python):**
-- [Ruff](https://docs.astral.sh/ruff/) checks for style violations, import ordering, and common bugs
-- [mypy](https://mypy.readthedocs.io/) with strict settings catches type mismatches and missing return types
-
-**Frontend (TypeScript):**
-- ESLint checks for code quality issues
-- TypeScript compiler (`tsc --noEmit`) verifies type correctness
-
-Both use dependency caching to skip reinstallation when lockfiles haven't changed.
-
-## Security scanning
-
-The security workflow uses [Bandit](https://bandit.readthedocs.io/) to perform static analysis on Python source files,
-flagging issues like hardcoded credentials, SQL injection patterns, and unsafe deserialization. It excludes the test
-directory and reports only medium-severity and above findings. Container-level vulnerability scanning with Trivy runs
-as part of the Docker workflow.
+The two heavyweight workflows are **Stack Tests** (builds images, runs all tests) and **Docker Scan & Promote**
+(scans images with Trivy and promotes to `latest`). They're connected: Docker Scan & Promote triggers automatically
+after Stack Tests succeeds on `main`, forming a build-test-scan-promote pipeline where the `latest` tag only moves
+forward when everything passes.
 
-## Docker build and scan
-
-The Docker workflow is structured as multiple jobs with dependencies, enabling parallel execution and early failure
-detection. If any job fails, dependent jobs are skipped immediately.
-
-```mermaid
-graph TD
-    A[build-base] --> B[build-backend]
-    A --> C[build-frontend]
-    B --> D[scan-backend]
-    C --> E[scan-frontend]
-    D --> F[summary]
-    E --> F
-
-    style A fill:#e1f5fe
-    style B fill:#fff3e0
-    style C fill:#fff3e0
-    style D fill:#ffebee
-    style E fill:#ffebee
-    style F fill:#e8f5e9
-```
+## Workflow files
 
-| Job              | Depends On       | Purpose                                              |
-|------------------|------------------|------------------------------------------------------|
-| `build-base`     | -                | Build shared base image with Python and dependencies |
-| `build-backend`  | `build-base`     | Build backend image using base as build context      |
-| `build-frontend` | `build-base`     | Build frontend image (runs parallel with backend)    |
-| `scan-backend`   | `build-backend`  | Trivy vulnerability scan on backend image            |
-| `scan-frontend`  | `build-frontend` | Trivy vulnerability scan on frontend image           |
-| `summary`        | All scans        | Generate summary (main branch only)                  |
+| Workflow                | File                                         | Trigger                                       | Purpose                                    |
+|-------------------------|----------------------------------------------|-----------------------------------------------|--------------------------------------------|
+| Stack Tests             | `.github/workflows/stack-tests.yml`          | Push/PR to `main`, tags `v*`                   | Unit tests, image build, E2E tests         |
+| Docker Scan & Promote   | `.github/workflows/docker.yml`               | After Stack Tests completes on `main`          | Trivy scan + promote SHA tag to `latest`   |
+| SBOM & Supply Chain     | `.github/workflows/sbom-compliance.yml`      | Push/PR to `main`, weekly schedule             | SPDX SBOM generation + Grype vulnerability scan |
+| Ruff Linting            | `.github/workflows/ruff.yml`                 | Push/PR to `main`                              | Python code style and import checks        |
+| MyPy Type Checking      | `.github/workflows/mypy.yml`                 | Push/PR to `main`                              | Python static type analysis                |
+| Frontend CI             | `.github/workflows/frontend-ci.yml`          | Push/PR to `main` (frontend changes)           | ESLint + Svelte type check                 |
+| Security Scanning       | `.github/workflows/security.yml`             | Push/PR to `main`                              | Bandit SAST                                |
+| Documentation           | `.github/workflows/docs.yml`                 | Push/PR (`docs/`, `mkdocs.yml`)                | MkDocs build and GitHub Pages deploy       |
 
-### Base image
+## Composite actions
 
-The base image (`Dockerfile.base`) contains Python, system dependencies, and all pip packages. It
-uses [uv](https://docs.astral.sh/uv/) to install dependencies from the lockfile with `uv sync --locked --no-dev`,
-ensuring reproducible builds without development tools.
+Shared steps are extracted into reusable composite actions under `.github/actions/`. This eliminates duplication between
+the backend and frontend E2E jobs, which both need k3s and the full docker compose stack but set it up differently.
 
-### Security scanning
+| Action                  | File                                         | Purpose                                    |
+|-------------------------|----------------------------------------------|--------------------------------------------|
+| E2E Boot                | `.github/actions/e2e-boot/action.yml`        | GHCR login, background image pull + infra pre-warm, k3s install |
+| E2E Ready               | `.github/actions/e2e-ready/action.yml`       | Finalize k3s, start compose stack, health checks, seed users |
 
-After each image builds, [Trivy](https://trivy.dev/) scans it for known vulnerabilities in OS packages and Python
-dependencies. The scan fails if it finds any critical or high severity issues with available fixes.
+The split is intentional. Frontend E2E needs to install Node.js and Playwright browsers _between_ boot and ready,
+overlapping that work with k3s installation to save wall-clock time. Backend E2E calls them back-to-back since it has
+no setup to overlap.
 
-## Stack tests (unified testing)
+## Stack Tests (the main workflow)
 
-The `stack-tests.yml` workflow consolidates all testing that requires infrastructure into a single job, avoiding
-redundant stack setup across multiple jobs.
+This is the core testing workflow. It builds all 13 container images, pushes them to GHCR with immutable SHA-based
+tags, then runs E2E tests on separate runners that pull images from the registry.
 
 ```mermaid
 graph TD
-    subgraph "Parallel (fast)"
-        A[Backend Unit Tests]
-        B[Frontend Unit Tests]
-    end
-
-    subgraph "Build"
-        C[Build Images]
+    subgraph "Phase 1: Fast feedback"
+        A["Backend Unit Tests"]
+        B["Frontend Unit Tests"]
     end
 
-    subgraph "Backend E2E (own runner)"
-        D1[Setup k3s + Stack]
-        E[Backend E2E Tests]
-        D1 --> E
+    subgraph "Phase 2: Build"
+        C["Build & Push 13 Images to GHCR"]
     end
 
-    subgraph "Frontend E2E (own runner)"
-        D2[Setup k3s + Stack]
-        F[Frontend E2E Tests]
-        D2 --> F
+    subgraph "Phase 3: E2E (parallel runners)"
+        D["Backend E2E<br/>(k3s + full stack)"]
+        E["Frontend E2E Shard 1/2<br/>(k3s + Playwright)"]
+        F["Frontend E2E Shard 2/2<br/>(k3s + Playwright)"]
     end
 
     A --> C
     B --> C
-    C --> D1
-    C --> D2
+    C --> D & E & F
 
     style A fill:#e8f5e9
     style B fill:#e8f5e9
     style C fill:#e1f5fe
-    style D1 fill:#e1f5fe
-    style D2 fill:#e1f5fe
+    style D fill:#fff3e0
     style E fill:#fff3e0
     style F fill:#fff3e0
 ```
 
-### Test execution order
+### Phase 1: Unit tests
+
+Backend and frontend unit tests run in parallel. They need no infrastructure and complete quickly. If either fails,
+the image build is skipped entirely.
 
-1. **Unit tests (parallel)**: Backend and frontend unit tests run simultaneously. They require no infrastructure and
-   complete quickly (~1-2 min each).
+### Phase 2: Build and push
+
+All 13 images are built on a single runner and pushed to GHCR with an immutable `sha-<7chars>` tag:
+
+| Image                | Source                                      |
+|----------------------|---------------------------------------------|
+| `base`               | `backend/Dockerfile.base`                   |
+| `backend`            | `backend/Dockerfile`                        |
+| `coordinator`        | `backend/workers/Dockerfile.coordinator`    |
+| `k8s-worker`         | `backend/workers/Dockerfile.k8s_worker`     |
+| `pod-monitor`        | `backend/workers/Dockerfile.pod_monitor`    |
+| `result-processor`   | `backend/workers/Dockerfile.result_processor` |
+| `saga-orchestrator`  | `backend/workers/Dockerfile.saga_orchestrator` |
+| `event-replay`       | `backend/workers/Dockerfile.event_replay`   |
+| `dlq-processor`      | `backend/workers/Dockerfile.dlq_processor`  |
+| `cert-generator`     | `cert-generator/Dockerfile`                 |
+| `zookeeper-certgen`  | `backend/zookeeper/Dockerfile.certgen`      |
+| `frontend-dev`       | `frontend/Dockerfile`                       |
+| `frontend`           | `frontend/Dockerfile.prod`                  |
+
+Of these 13 images, 12 are scanned by Trivy and promoted to `latest` in the
+[Docker Scan & Promote](#docker-scan--promote) workflow. The `frontend-dev` image is excluded — it's the Vite dev
+server build used only for E2E tests in CI and is never deployed to production.
+
+The base image is cached separately as a zstd-compressed tarball since its dependencies rarely change. Worker images
+depend on it via `--build-context base=docker-image://integr8scode-base:latest`. Utility and frontend images use GHA
+layer caching.
+
+All 13 images are pushed to GHCR in parallel, with each push tracked by PID so individual failures are reported:
+
+```bash
+declare -A PIDS
+for name in base backend coordinator k8s-worker ...; do
+  docker push "$IMG/$name:$TAG" &
+  PIDS[$name]=$!
+done
+FAILED=0
+for name in "${!PIDS[@]}"; do
+  if ! wait "${PIDS[$name]}"; then
+    echo "::error::Failed to push $name"
+    FAILED=1
+  fi
+done
+[ "$FAILED" -eq 0 ] || exit 1
+```
 
-2. **Image build**: After unit tests pass, all Docker images are built with GHA layer caching.
+Fork PRs skip the GHCR push (no write access), so E2E tests only run for non-fork PRs.
 
-3. **E2E tests (parallel)**: Backend and frontend E2E tests run in parallel on separate runners, each with its own
-   isolated stack (k3s + docker compose):
-    - Backend E2E tests (pytest with k8s)
-    - Frontend E2E tests (Playwright)
+### Phase 3: E2E tests
+
+Backend and frontend E2E tests run on separate runners. Each runner provisions its own k3s cluster and docker compose
+stack, pulling pre-built images from GHCR.
+
+#### E2E Boot (`.github/actions/e2e-boot`)
+
+This action kicks off three slow tasks that can overlap:
+
+1. **GHCR login** using `docker/login-action@v3`
+2. **Background image pull + infra pre-warm** — pulls all compose images then starts infrastructure services
+   (mongo, redis, kafka, zookeeper, schema-registry) in a background `nohup` process. The exit status is persisted
+   to `/tmp/infra-pull.exit` so the next action can check for failures.
+3. **k3s install** — downloads and installs a pinned k3s version with SHA256 checksum verification (see
+   [supply-chain hardening](#supply-chain-hardening) below)
+
+#### E2E Ready (`.github/actions/e2e-ready`)
+
+This action finalizes the environment after boot tasks complete:
+
+1. **Finalize k3s** — copies kubeconfig, rewrites the API server address to `host.docker.internal` so containers
+   inside docker compose can reach the k3s API server, creates the `integr8scode` namespace
+2. **Start cert-generator** in the background
+3. **Copy test config** — uses `config.test.toml` and `secrets.example.toml`
+4. **Wait for image pull and infra** — blocks until the background pull completes and checks the exit code from
+   `/tmp/infra-pull.exit`, failing fast if the background process had errors
+5. **Start compose stack** with `docker compose up -d --no-build`
+6. **Health checks** — waits for backend (`/api/v1/health/live`) and optionally frontend (`https://localhost:5001`)
+7. **Seed test users** via `scripts/seed_users.py`
+
+#### Frontend E2E sharding
+
+Frontend E2E tests use Playwright with 2 shards running in parallel on separate runners. Between `e2e-boot` and
+`e2e-ready`, each shard installs Node.js dependencies and Playwright browsers (with caching), overlapping that work
+with k3s booting in the background.
+
+```text
+e2e-boot (GHCR login + pull + k3s install)
+    |
+    ├── npm ci + playwright install (overlapped with k3s)
+    |
+e2e-ready (finalize k3s + start stack + health check)
+    |
+    └── npx playwright test --shard=N/2
+```
 
 ### Coverage reporting
 
-Each test suite reports coverage to [Codecov](https://codecov.io/):
-- `backend-unit` flag for unit tests
-- `backend-e2e` flag for E2E tests
-- `frontend-unit` flag for frontend unit tests
+Each test suite reports coverage to [Codecov](https://codecov.io/) with separate flags:
+
+- `backend-unit` — backend unit tests
+- `backend-e2e` — backend E2E tests
+- `frontend-unit` — frontend unit tests (Vitest with `lcov` output)
+
+### Log collection on failure
+
+When E2E tests fail, logs are collected automatically and uploaded as artifacts:
+
+- All docker compose service logs with timestamps
+- Individual service logs for each worker
+- Kubernetes events sorted by timestamp (backend E2E only)
+
+## Docker Scan & Promote
+
+This workflow implements the promotion model: the `latest` tag is never set during the build. Only this workflow
+sets it, and only after all tests and security scans pass.
+
+```mermaid
+graph LR
+    ST["Stack Tests<br/>(main, success)"] -->|workflow_run trigger| Scan
+    Scan["Trivy Scan<br/>(12 images in parallel)"] --> Promote["crane copy<br/>sha-xxx → latest"]
+    Promote --> Summary["Step Summary"]
+```
+
+### Trigger
+
+Runs automatically when `Stack Tests` completes successfully on `main`. Can also be triggered manually via
+`workflow_dispatch` with an optional SHA input to promote a specific commit.
+
+### Scan
+
+Uses [Trivy](https://trivy.dev/) (pinned at `v0.68.2`) to scan all 12 deployed images in parallel via matrix strategy.
+Scans for `CRITICAL` and `HIGH` severity vulnerabilities with unfixed issues ignored. Results are uploaded as SARIF
+files to GitHub's Security tab.
+
+### Promote
+
+Uses [crane](https://github.com/google/go-containerregistry/blob/main/cmd/crane/README.md) to copy manifests at the
+registry level (`crane copy sha-tag latest`), avoiding any rebuild or re-push. This is a fast, atomic operation that
+simply re-tags existing image manifests.
+
+## SBOM & Supply Chain Security
+
+The `sbom-compliance.yml` workflow generates [SPDX](https://spdx.dev/) Software Bills of Materials for both backend
+(Python) and frontend (JavaScript) components. It runs on every push/PR to `main` and weekly on a schedule.
+
+For each component:
+
+1. **Generate SBOM** using [anchore/sbom-action](https://github.com/anchore/sbom-action) — produces an SPDX JSON file
+   listing all direct and transitive dependencies
+2. **Scan SBOM** using [anchore/scan-action](https://github.com/anchore/scan-action) (Grype) — checks for known
+   vulnerabilities with a `high` severity cutoff
+3. **Upload** — SBOM artifacts are retained for 5 days; vulnerability results are uploaded as SARIF to GitHub's
+   Security tab
+
+## Supply-chain hardening
+
+### k3s version pinning and checksum verification
+
+The k3s installation in CI is hardened against supply-chain attacks:
+
+1. **Pinned version** — `K3S_VERSION` is set as a workflow-level env var (`v1.32.11+k3s1`), not fetched dynamically
+2. **Source pinning** — the install script is fetched from the k3s GitHub repository at the exact tagged version
+   (e.g., `https://raw.githubusercontent.com/k3s-io/k3s/v1.32.11%2Bk3s1/install.sh`), not from the `get.k3s.io` CDN
+3. **SHA256 verification** — the install script is verified against a known checksum before execution:
+
+```bash
+K3S_TAG=$(echo "$K3S_VERSION" | sed 's/+/%2B/g')
+curl -sfL "https://raw.githubusercontent.com/k3s-io/k3s/${K3S_TAG}/install.sh" -o /tmp/k3s-install.sh
+echo "$K3S_INSTALL_SHA256  /tmp/k3s-install.sh" | sha256sum -c -
+chmod +x /tmp/k3s-install.sh
+INSTALL_K3S_VERSION="$K3S_VERSION" ... /tmp/k3s-install.sh
+```
+
+This avoids the common `curl | sh` anti-pattern, where a compromised CDN or MITM could inject malicious code.
+
+### GHCR image tags
+
+Images are tagged with `sha-<7chars>` (immutable, tied to a specific commit) during build. The `latest` tag is only
+applied by the Docker Scan & Promote workflow after all tests and security scans pass. This means:
+
+- Every E2E test runs against exactly the images built from that commit
+- `latest` is never stale or untested
+- Any commit's images can be pulled by their SHA tag for debugging
+
+### Dependency pinning
+
+All GitHub Actions are pinned to major versions (e.g., `actions/checkout@v6`, `docker/build-push-action@v6`). The Trivy
+action is pinned to a specific release (`aquasecurity/trivy-action@0.33.1`) for scan reproducibility.
+
+## Linting and type checking
+
+Three lightweight workflows run independently since they catch obvious issues quickly.
+
+**Backend (Python):**
+
+- [Ruff](https://docs.astral.sh/ruff/) checks for style violations, import ordering, and common bugs
+- [mypy](https://mypy.readthedocs.io/) with strict settings catches type mismatches and missing return types
+
+**Frontend (TypeScript/Svelte):**
+
+- ESLint checks for code quality issues
+- `svelte-check` verifies TypeScript types and Svelte component correctness
+
+Backend and frontend workflows both use dependency caching ([uv](https://docs.astral.sh/uv/) for Python, npm for
+Node.js) to skip reinstallation when lockfiles haven't changed.
+
+## Security scanning
+
+The `security.yml` workflow uses [Bandit](https://bandit.readthedocs.io/) to perform static analysis on Python source
+files, flagging issues like hardcoded credentials, SQL injection patterns, and unsafe deserialization. It excludes the
+test directory and reports only medium-severity and above findings. Container-level vulnerability scanning with Trivy
+runs as part of the [Docker Scan & Promote](#docker-scan--promote) workflow.
 
 ## Documentation
 
@@ -189,6 +338,47 @@ the [Material theme](https://squidfunk.github.io/mkdocs-material/). It triggers
 
 On pushes to main, the workflow deploys the built site to GitHub Pages.
 
+## Build optimizations
+
+### Docker layer caching
+
+All image builds use [docker/build-push-action](https://github.com/docker/build-push-action) with GitHub Actions
+cache. Each service has its own cache scope, preventing pollution between unrelated builds:
+
+```yaml
+- name: Build cert-generator image
+  uses: docker/build-push-action@v6
+  with:
+    context: ./cert-generator
+    file: ./cert-generator/Dockerfile
+    load: true
+    tags: integr8scode-cert-generator:latest
+    cache-from: type=gha,scope=cert-generator
+    cache-to: type=gha,mode=max,scope=cert-generator
+```
+
+### Base image caching
+
+The base image (Python + all pip dependencies) changes infrequently, so it's cached as a zstd-compressed tarball keyed
+on `Dockerfile.base`, `pyproject.toml`, and `uv.lock`. On cache hit the image is loaded directly with `docker load`,
+skipping the entire build.
+
+### Background infra pre-warm
+
+The `e2e-boot` action pulls all docker compose images and starts infrastructure services _in the background_ while k3s
+installs. This overlaps the network-bound image pulls with the k3s download and installation, saving several minutes
+per E2E job.
+
+### Frontend Playwright caching
+
+Playwright browsers are cached by `package-lock.json` hash. On cache hit, only system dependencies are installed
+(`playwright install-deps chromium`), skipping the browser download.
+
+### Parallel image push
+
+All 13 images are pushed to GHCR concurrently using background processes with PID tracking. Each push failure is
+reported individually via `::error::` annotations.
+
 ## Running locally
 
 You can run most checks locally before pushing.
@@ -197,10 +387,10 @@ You can run most checks locally before pushing.
 cd backend
 
 # Linting
-uv run ruff check .
+uv run ruff check . --config pyproject.toml
 
 # Type checking
-uv run mypy .
+uv run mypy --config-file pyproject.toml --strict .
 
 # Security scan
 uv tool run bandit -r . -x tests/ -ll
@@ -216,76 +406,23 @@ cd frontend
 npm run lint
 
 # Type checking
-npx tsc --noEmit
+npm run check
 
 # Unit tests
 npm run test
 ```
 
-For E2E tests, use the same deployment as CI:
+For E2E tests, use the deployment script to bring up the full stack:
 
 ```bash
-# Start full stack (requires k8s configured locally)
-./deploy.sh dev
+# Start full stack (requires k8s configured locally)
+./deploy.sh dev --wait
 
-# Run tests inside the running backend container
+# Run backend E2E tests inside the running container
 docker compose exec -T backend uv run pytest tests/e2e -v
 
 # Run frontend E2E tests
 cd frontend && npx playwright test
 ```
 
-Or use `./deploy.sh test` which handles everything automatically.
-
-## Build optimizations
-
-The CI pipeline employs several caching strategies to minimize build times.
-
-### Docker layer caching
-
-All image builds use [docker/build-push-action](https://github.com/docker/build-push-action) with GitHub Actions cache:
-
-```yaml
-- name: Build base image
-  uses: docker/build-push-action@v6
-  with:
-    context: ./backend
-    file: ./backend/Dockerfile.base
-    load: true
-    tags: integr8scode-base:latest
-    cache-from: type=gha,scope=backend-base
-    cache-to: type=gha,mode=max,scope=backend-base
-```
-
-Each service has its own cache scope (`backend-base`, `backend`, `frontend`, `cert-generator`), preventing cache
-pollution between unrelated builds.
-
-### Infrastructure image caching
-
-A reusable action at `.github/actions/docker-cache` handles infrastructure images (MongoDB, Redis, Kafka, Schema
-Registry). It stores pulled images as zstd-compressed tarballs in the GitHub Actions cache, saving ~30 seconds per run
-and avoiding Docker Hub rate limits.
-
-### k3s setup action
-
-A reusable composite action at `.github/actions/k3s-setup` handles Kubernetes setup:
-- Installs k3s with traefik disabled
-- Creates the `integr8scode` namespace
-- Generates a kubeconfig accessible from Docker containers (via `host.docker.internal`)
-
-This eliminates copy-paste across workflows and ensures consistent k8s setup.
-
-## Workflow files
-
-| Workflow           | File                                 | Purpose                            |
-|--------------------|--------------------------------------|------------------------------------|
-| Ruff Linting       | `.github/workflows/ruff.yml`         | Python code style and import checks |
-| MyPy Type Checking | `.github/workflows/mypy.yml`         | Python static type analysis        |
-| Frontend CI        | `.github/workflows/frontend-ci.yml`  | TypeScript lint and type check     |
-| Security Scanning  | `.github/workflows/security.yml`     | Bandit SAST                        |
-| Docker Build & Scan| `.github/workflows/docker.yml`       | Image build and Trivy scan         |
-| Stack Tests        | `.github/workflows/stack-tests.yml`  | All unit and E2E tests               |
-| Documentation      | `.github/workflows/docs.yml`         | MkDocs build and deploy            |
-
-All workflows use [uv](https://docs.astral.sh/uv/) for Python dependency management and npm for Node.js, with caching
-enabled for both.
+Or use `./deploy.sh test` which handles stack setup, testing, and teardown automatically.