diff --git a/README.md b/README.md index 1dcd2f9..a7938b5 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@ The observability stack provides full monitoring capabilities through metrics, l - Loki - Traefik - APISIX Gateway + - Traefik and APISIX are cross-stack targets resolved via Swarm's `<stack>_<service>` DNS naming on the shared overlay network. #### Logs diff --git a/apisix/api-gateway/docker-compose.yml b/apisix/api-gateway/docker-compose.yml index 5f2ee0e..62c99ad 100644 --- a/apisix/api-gateway/docker-compose.yml +++ b/apisix/api-gateway/docker-compose.yml @@ -19,6 +19,17 @@ services: - ./config/apisix.yaml:/usr/local/apisix/conf/config.yaml:ro - cache:/tmp/apisix-cache/ restart: unless-stopped + healthcheck: + test: + - CMD + - wget + - --no-verbose + - --tries=1 + - --spider + - http://localhost:9091/apisix/prometheus/metrics + interval: 20s + timeout: 5s + retries: 3 labels: - "traefik.enable=${TRAEFIK_ENABLE}" - "traefik.http.routers.apisix.rule=Host(`${HOST}`)" diff --git a/observability/prometheus/config/prometheus.yml b/observability/prometheus/config/prometheus.yml index 54b5941..fe65fd4 100644 --- a/observability/prometheus/config/prometheus.yml +++ b/observability/prometheus/config/prometheus.yml @@ -8,6 +8,11 @@ global: rule_files: - 'alert.rules' +# DNS naming convention: +# Same-stack targets use short names (e.g., 'tempo:3200') +# Cross-stack targets use <stack>_<service> (e.g., 'infrastructure_traefik:8080') +# All stacks share the 'traefik-public' overlay network. 
+ scrape_configs: # Prometheus self-monitoring - job_name: 'prometheus' diff --git a/observability/prometheus/docker-compose.yml b/observability/prometheus/docker-compose.yml index aad0628..50a5947 100644 --- a/observability/prometheus/docker-compose.yml +++ b/observability/prometheus/docker-compose.yml @@ -30,6 +30,17 @@ services: networks: - default restart: unless-stopped + healthcheck: + test: + - CMD + - wget + - --no-verbose + - --tries=1 + - --spider + - http://localhost:9090/-/ready + interval: 30s + timeout: 5s + retries: 3 logging: options: max-size: "10m" diff --git a/stacks/README.md b/stacks/README.md index f7979d9..d7f3835 100644 --- a/stacks/README.md +++ b/stacks/README.md @@ -129,7 +129,7 @@ python3 tools/render_compose.py -i stacks/infrastructure.yml -o /tmp/check.rende - Ensure each service folder has a `.env` available. For local development, copy from `.env.example`; for production, use `./stackctl.sh secrets deploy` (see [Managing Secrets](../docs/Managing%20Secrets.md)). - APISIX dashboard uses `apisix/api-dashboard/config/conf.yaml` (generated from `conf.example.yml`). -- Consider adding healthchecks for critical dependencies to improve startup reliability. +- Healthchecks have been added for Prometheus and APISIX. Consider adding them for other services as needed. 
### Resource caps & logging diff --git a/stacks/infrastructure.yml b/stacks/infrastructure.yml index b589a9e..8b7b2cc 100644 --- a/stacks/infrastructure.yml +++ b/stacks/infrastructure.yml @@ -62,6 +62,17 @@ services: volumes: - ./apisix/api-gateway/config/apisix.yaml:/usr/local/apisix/conf/config.yaml:ro - cache:/tmp/apisix-cache/ + healthcheck: + test: + - CMD + - wget + - --no-verbose + - --tries=1 + - --spider + - http://localhost:9091/apisix/prometheus/metrics + interval: 20s + timeout: 5s + retries: 3 labels: - traefik.enable=${TRAEFIK_ENABLE} - traefik.http.routers.apisix.rule=Host(`${HOST}`) diff --git a/stacks/observability.yml b/stacks/observability.yml index bb392df..2411c62 100644 --- a/stacks/observability.yml +++ b/stacks/observability.yml @@ -195,6 +195,17 @@ services: - traefik.http.services.prometheus.loadbalancer.server.port=${PORT} networks: - default + healthcheck: + test: + - CMD + - wget + - --no-verbose + - --tries=1 + - --spider + - http://localhost:9090/-/ready + interval: 30s + timeout: 5s + retries: 3 logging: options: max-size: 10m