From bd168e5c9cbe9e4f6adc8458d531fd0d9792830b Mon Sep 17 00:00:00 2001 From: Maxwell Date: Sat, 27 Jun 2026 01:27:40 +0200 Subject: [PATCH 1/2] fix(observability): add Docker healthchecks for Prometheus and APISIX, document cross-stack DNS naming - Add healthcheck to APISIX gateway (port 9091, prometheus metrics endpoint) - Add healthcheck to Prometheus (port 9090, /-/ready endpoint) - Document cross-stack DNS convention in prometheus.yml header - Update README with cross-stack target naming note - Regenerate stacks (healthcheck blocks now in spanning output) Ref: #503 --- README.md | 1 + apisix/api-gateway/docker-compose.yml | 5 +++++ observability/prometheus/config/prometheus.yml | 5 +++++ observability/prometheus/docker-compose.yml | 5 +++++ stacks/README.md | 2 +- stacks/infrastructure.yml | 11 +++++++++++ stacks/observability.yml | 11 +++++++++++ 7 files changed, 39 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1dcd2f9..a7938b5 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@ The observability stack provides full monitoring capabilities through metrics, l - Loki - Traefik - APISIX Gateway + - Traefik and APISIX are cross-stack targets resolved via Swarm's `<stack>_<service>` DNS naming on the shared overlay network. 
#### Logs diff --git a/apisix/api-gateway/docker-compose.yml b/apisix/api-gateway/docker-compose.yml index 5f2ee0e..adfcdd0 100644 --- a/apisix/api-gateway/docker-compose.yml +++ b/apisix/api-gateway/docker-compose.yml @@ -19,6 +19,11 @@ services: - ./config/apisix.yaml:/usr/local/apisix/conf/config.yaml:ro - cache:/tmp/apisix-cache/ restart: unless-stopped + healthcheck: + test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://localhost:9091/apisix/prometheus/metrics'] + interval: 20s + timeout: 5s + retries: 3 labels: - "traefik.enable=${TRAEFIK_ENABLE}" - "traefik.http.routers.apisix.rule=Host(`${HOST}`)" diff --git a/observability/prometheus/config/prometheus.yml b/observability/prometheus/config/prometheus.yml index 54b5941..fe65fd4 100644 --- a/observability/prometheus/config/prometheus.yml +++ b/observability/prometheus/config/prometheus.yml @@ -8,6 +8,11 @@ global: rule_files: - 'alert.rules' +# DNS naming convention: +# Same-stack targets use short names (e.g., 'tempo:3200') +# Cross-stack targets use <stack>_<service> (e.g., 'infrastructure_traefik:8080') +# All stacks share the 'traefik-public' overlay network. + scrape_configs: # Prometheus self-monitoring - job_name: 'prometheus' diff --git a/observability/prometheus/docker-compose.yml b/observability/prometheus/docker-compose.yml index aad0628..bbb5c74 100644 --- a/observability/prometheus/docker-compose.yml +++ b/observability/prometheus/docker-compose.yml @@ -30,6 +30,11 @@ services: networks: - default restart: unless-stopped + healthcheck: + test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://localhost:9090/-/ready'] + interval: 30s + timeout: 5s + retries: 3 logging: options: max-size: "10m" diff --git a/stacks/README.md b/stacks/README.md index f7979d9..d7f3835 100644 --- a/stacks/README.md +++ b/stacks/README.md @@ -129,7 +129,7 @@ python3 tools/render_compose.py -i stacks/infrastructure.yml -o /tmp/check.rende - Ensure each service folder has a `.env` available. 
For local development, copy from `.env.example`; for production, use `./stackctl.sh secrets deploy` (see [Managing Secrets](../docs/Managing%20Secrets.md)). - APISIX dashboard uses `apisix/api-dashboard/config/conf.yaml` (generated from `conf.example.yml`). -- Consider adding healthchecks for critical dependencies to improve startup reliability. +- Healthchecks have been added for Prometheus and APISIX. Consider adding them for other services as needed. ### Resource caps & logging diff --git a/stacks/infrastructure.yml b/stacks/infrastructure.yml index b589a9e..8b7b2cc 100644 --- a/stacks/infrastructure.yml +++ b/stacks/infrastructure.yml @@ -62,6 +62,17 @@ services: volumes: - ./apisix/api-gateway/config/apisix.yaml:/usr/local/apisix/conf/config.yaml:ro - cache:/tmp/apisix-cache/ + healthcheck: + test: + - CMD + - wget + - --no-verbose + - --tries=1 + - --spider + - http://localhost:9091/apisix/prometheus/metrics + interval: 20s + timeout: 5s + retries: 3 labels: - traefik.enable=${TRAEFIK_ENABLE} - traefik.http.routers.apisix.rule=Host(`${HOST}`) diff --git a/stacks/observability.yml b/stacks/observability.yml index bb392df..2411c62 100644 --- a/stacks/observability.yml +++ b/stacks/observability.yml @@ -195,6 +195,17 @@ services: - traefik.http.services.prometheus.loadbalancer.server.port=${PORT} networks: - default + healthcheck: + test: + - CMD + - wget + - --no-verbose + - --tries=1 + - --spider + - http://localhost:9090/-/ready + interval: 30s + timeout: 5s + retries: 3 logging: options: max-size: 10m From d1fdd2cbc31000c3b2a70361ba5985f86464b0dd Mon Sep 17 00:00:00 2001 From: Maxwell Date: Sat, 27 Jun 2026 01:29:07 +0200 Subject: [PATCH 2/2] style(compose): break healthcheck test arrays across multiple lines Fix yamllint line-length violations (119 > 80, 101 > 80 characters). Convert single-line flow-style arrays to block sequence format. 
Ref: #503 --- apisix/api-gateway/docker-compose.yml | 8 +++++++- observability/prometheus/docker-compose.yml | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/apisix/api-gateway/docker-compose.yml b/apisix/api-gateway/docker-compose.yml index adfcdd0..62c99ad 100644 --- a/apisix/api-gateway/docker-compose.yml +++ b/apisix/api-gateway/docker-compose.yml @@ -20,7 +20,13 @@ services: - cache:/tmp/apisix-cache/ restart: unless-stopped healthcheck: - test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://localhost:9091/apisix/prometheus/metrics'] + test: + - CMD + - wget + - --no-verbose + - --tries=1 + - --spider + - http://localhost:9091/apisix/prometheus/metrics interval: 20s timeout: 5s retries: 3 diff --git a/observability/prometheus/docker-compose.yml b/observability/prometheus/docker-compose.yml index bbb5c74..50a5947 100644 --- a/observability/prometheus/docker-compose.yml +++ b/observability/prometheus/docker-compose.yml @@ -31,7 +31,13 @@ services: - default restart: unless-stopped healthcheck: - test: ['CMD', 'wget', '--no-verbose', '--tries=1', '--spider', 'http://localhost:9090/-/ready'] + test: + - CMD + - wget + - --no-verbose + - --tries=1 + - --spider + - http://localhost:9090/-/ready interval: 30s timeout: 5s retries: 3