Skip to content

Commit d3b4077

Browse files
authored
Add /health endpoint for service monitoring (#97)
* Add /health endpoint for service monitoring
1 parent be28362 commit d3b4077

4 files changed

Lines changed: 299 additions & 0 deletions

File tree

conf/api.yaml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,43 @@ paths:
3838
'303':
3939
description: Redirect to actual address of Login service which performs auth up to its capabilities
4040

41+
/health:
42+
get:
43+
summary: Service health check
44+
description: Service health and dependency status check
45+
responses:
46+
'200':
47+
description: Service is healthy
48+
content:
49+
application/json:
50+
schema:
51+
type: object
52+
properties:
53+
status:
54+
type: string
55+
example: ok
56+
uptime_seconds:
57+
type: integer
58+
example: 12345
59+
'503':
60+
description: Service is degraded
61+
content:
62+
application/json:
63+
schema:
64+
type: object
65+
properties:
66+
status:
67+
type: string
68+
example: degraded
69+
failures:
70+
type: object
71+
additionalProperties:
72+
type: string
73+
example:
74+
eventbridge: client not initialized
75+
kafka: producer not initialized
76+
postgres: host not configured
77+
4178
/topics:
4279
get:
4380
summary: Get a list of topics

src/event_gate_lambda.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
from src.handlers.handler_token import HandlerToken
2828
from src.handlers.handler_topic import HandlerTopic
29+
from src.handlers.handler_health import HandlerHealth
2930
from src.utils.constants import SSL_CA_BUNDLE_KEY
3031
from src.utils.utils import build_error_response
3132
from src.writers import writer_eventbridge, writer_kafka, writer_postgres
@@ -85,6 +86,9 @@
8586
# Initialize topic handler and load topic schemas
8687
handler_topic = HandlerTopic(CONF_DIR, ACCESS, handler_token).load_topic_schemas()
8788

89+
# Initialize health handler
90+
handler_health = HandlerHealth()
91+
8892

8993
def get_api() -> Dict[str, Any]:
9094
"""Return the OpenAPI specification text."""
@@ -108,6 +112,8 @@ def lambda_handler(event: Dict[str, Any], _context: Any = None) -> Dict[str, Any
108112
return get_api()
109113
if resource == "/token":
110114
return handler_token.get_token_provider_info()
115+
if resource == "/health":
116+
return handler_health.get_health()
111117
if resource == "/topics":
112118
return handler_topic.get_topics_list()
113119
if resource == "/topics/{topic_name}":

src/handlers/handler_health.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#
2+
# Copyright 2025 ABSA Group Limited
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
"""
18+
This module provides the HandlerHealth class for service health monitoring.
19+
"""
20+
import json
21+
import logging
22+
import os
23+
from datetime import datetime, timezone
24+
from typing import Dict, Any
25+
26+
from src.writers import writer_eventbridge, writer_kafka, writer_postgres
27+
28+
# Log level comes from the LOG_LEVEL environment variable and falls back to INFO.
log_level = os.environ.get("LOG_LEVEL", "INFO")

# Module-scoped logger used by the health handler.
logger = logging.getLogger(__name__)
logger.setLevel(log_level)
31+
32+
33+
class HandlerHealth:
    """
    Report service health from the initialization state of the event writers.

    The handler remembers when it was created so that healthy responses can
    include how long this instance has existed.
    """

    def __init__(self):
        # Timezone-aware creation timestamp; uptime is measured from this point.
        self.start_time: datetime = datetime.now(timezone.utc)

    def get_health(self) -> Dict[str, Any]:
        """
        Inspect writer state and build an API Gateway style health response.

        Kafka is always checked. EventBridge is checked only when an event bus
        ARN is set, and PostgreSQL only when a database name is set. For
        PostgreSQL only the first missing setting is reported.

        Returns:
            Dict[str, Any]: Response with ``statusCode``, ``headers`` and a JSON ``body``.
                - 200: no failures; body holds ``status`` "ok" and ``uptime_seconds``
                - 503: at least one failure; body holds ``status`` "degraded" and ``failures``
        """
        logger.debug("Handling GET Health")

        failures: Dict[str, str] = {}

        # Kafka: a producer object must be present.
        kafka_producer = writer_kafka.STATE.get("producer")
        if kafka_producer is None:
            failures["kafka"] = "producer not initialized"

        # EventBridge: only relevant when an event bus ARN is set.
        bridge_state = writer_eventbridge.STATE
        bus_arn = bridge_state.get("event_bus_arn")
        bridge_client = bridge_state.get("client")
        if bus_arn and bridge_client is None:
            failures["eventbridge"] = "client not initialized"

        # PostgreSQL: only relevant when a database name is set.
        pg_settings = writer_postgres.POSTGRES
        if pg_settings.get("database"):
            # Checked in this order; the first empty setting wins.
            required_settings = (
                ("host", "host not configured"),
                ("user", "user not configured"),
                ("password", "password not configured"),
                ("port", "port not configured"),
            )
            for setting, message in required_settings:
                if not pg_settings.get(setting):
                    failures["postgres"] = message
                    break

        elapsed = datetime.now(timezone.utc) - self.start_time
        uptime_seconds = int(elapsed.total_seconds())

        if failures:
            logger.debug("Health check degraded: %s", failures)
            status_code = 503
            payload: Dict[str, Any] = {"status": "degraded", "failures": failures}
        else:
            logger.debug("Health check passed")
            status_code = 200
            payload = {"status": "ok", "uptime_seconds": uptime_seconds}

        return {
            "statusCode": status_code,
            "headers": {"Content-Type": "application/json"},
            "body": json.dumps(payload),
        }
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
#
2+
# Copyright 2025 ABSA Group Limited
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
import json
18+
from unittest.mock import MagicMock, patch
19+
20+
from src.handlers.handler_health import HandlerHealth
21+
22+
### get_health()
23+
24+
25+
## Minimal healthy state (just kafka)
26+
def test_get_health_minimal_kafka_healthy():
    """A Kafka producer alone is enough for 200/ok when the optional writers are switched off."""
    handler = HandlerHealth()

    kafka_patch = patch("src.handlers.handler_health.writer_kafka.STATE", {"producer": MagicMock()})
    eventbridge_patch = patch(
        "src.handlers.handler_health.writer_eventbridge.STATE",
        {"client": None, "event_bus_arn": ""},
    )
    postgres_patch = patch("src.handlers.handler_health.writer_postgres.POSTGRES", {"database": ""})

    with kafka_patch, eventbridge_patch, postgres_patch:
        result = handler.get_health()

    assert result["statusCode"] == 200
    payload = json.loads(result["body"])
    assert payload["status"] == "ok"
    assert "uptime_seconds" in payload
41+
42+
43+
## Healthy state with all writers enabled
44+
def test_get_health_all_writers_enabled_and_healthy():
    """With every writer enabled and fully configured, the check reports 200/ok."""
    handler = HandlerHealth()
    pg_settings = {
        "database": "db",
        "host": "localhost",
        "user": "user",
        "password": "pass",
        "port": "5432",
    }

    kafka_patch = patch("src.handlers.handler_health.writer_kafka.STATE", {"producer": MagicMock()})
    eventbridge_patch = patch(
        "src.handlers.handler_health.writer_eventbridge.STATE",
        {"client": MagicMock(), "event_bus_arn": "arn"},
    )
    postgres_patch = patch("src.handlers.handler_health.writer_postgres.POSTGRES", pg_settings)

    with kafka_patch, eventbridge_patch, postgres_patch:
        result = handler.get_health()

    assert result["statusCode"] == 200
    payload = json.loads(result["body"])
    assert payload["status"] == "ok"
    assert "uptime_seconds" in payload
60+
61+
62+
## Degraded state with all writers enabled
63+
def test_get_health_kafka_not_initialized():
    """
    Health check returns 503 and names every writer when all three are unhealthy.

    Kafka has no producer, EventBridge has a bus ARN but no client, and PostgreSQL
    has a database name but no connection settings. The exact failure messages are
    pinned so that a wrong or swapped message is caught, not just a missing key.
    """
    handler = HandlerHealth()
    postgres_config = {"database": "db", "host": "", "user": "", "password": "", "port": ""}

    with (
        patch("src.handlers.handler_health.writer_kafka.STATE", {"producer": None}),
        patch(
            "src.handlers.handler_health.writer_eventbridge.STATE",
            {"client": None, "event_bus_arn": "arn:aws:events:us-east-1:123:event-bus/bus"},
        ),
        patch("src.handlers.handler_health.writer_postgres.POSTGRES", postgres_config),
    ):
        response = handler.get_health()

    assert response["statusCode"] == 503
    body = json.loads(response["body"])
    assert body["status"] == "degraded"
    # All PostgreSQL settings are empty; host is checked first, so it is the one reported.
    assert body["failures"] == {
        "kafka": "producer not initialized",
        "eventbridge": "client not initialized",
        "postgres": "host not configured",
    }
84+
85+
86+
## Healthy when eventbridge is disabled
87+
def test_get_health_eventbridge_disabled():
    """An empty event_bus_arn disables the EventBridge check, so a missing client still yields 200."""
    handler = HandlerHealth()
    pg_settings = {
        "database": "db",
        "host": "localhost",
        "user": "user",
        "password": "pass",
        "port": "5432",
    }

    kafka_patch = patch("src.handlers.handler_health.writer_kafka.STATE", {"producer": MagicMock()})
    eventbridge_patch = patch(
        "src.handlers.handler_health.writer_eventbridge.STATE",
        {"client": None, "event_bus_arn": ""},
    )
    postgres_patch = patch("src.handlers.handler_health.writer_postgres.POSTGRES", pg_settings)

    with kafka_patch, eventbridge_patch, postgres_patch:
        result = handler.get_health()

    assert result["statusCode"] == 200
100+
101+
102+
## Healthy when postgres is disabled
103+
def test_get_health_postgres_disabled():
    """An empty database name disables the PostgreSQL check, so the response is 200."""
    handler = HandlerHealth()

    kafka_patch = patch("src.handlers.handler_health.writer_kafka.STATE", {"producer": MagicMock()})
    eventbridge_patch = patch(
        "src.handlers.handler_health.writer_eventbridge.STATE",
        {"client": MagicMock(), "event_bus_arn": "arn"},
    )
    postgres_patch = patch("src.handlers.handler_health.writer_postgres.POSTGRES", {"database": ""})

    with kafka_patch, eventbridge_patch, postgres_patch:
        result = handler.get_health()

    assert result["statusCode"] == 200
115+
116+
117+
## Degraded state - postgres host not configured
118+
def test_get_health_postgres_host_not_configured():
    """An enabled PostgreSQL writer with an empty host yields 503 and a host-specific message."""
    handler = HandlerHealth()
    pg_settings = {
        "database": "db",
        "host": "",
        "user": "user",
        "password": "pass",
        "port": "5432",
    }

    kafka_patch = patch("src.handlers.handler_health.writer_kafka.STATE", {"producer": MagicMock()})
    eventbridge_patch = patch(
        "src.handlers.handler_health.writer_eventbridge.STATE",
        {"client": MagicMock(), "event_bus_arn": "arn"},
    )
    postgres_patch = patch("src.handlers.handler_health.writer_postgres.POSTGRES", pg_settings)

    with kafka_patch, eventbridge_patch, postgres_patch:
        result = handler.get_health()

    assert result["statusCode"] == 503
    payload = json.loads(result["body"])
    assert payload["failures"]["postgres"] == "host not configured"
133+
134+
135+
## Uptime calculation
136+
def test_get_health_uptime_is_positive():
    """
    Verify uptime_seconds is present in a healthy response as a non-negative integer.

    The handler is created immediately before the check, so the truncated uptime
    is normally 0; the assertion is therefore ``>= 0`` rather than strictly positive.
    """
    handler = HandlerHealth()
    postgres_config = {"database": "db", "host": "localhost", "user": "user", "password": "pass", "port": "5432"}

    with (
        patch("src.handlers.handler_health.writer_kafka.STATE", {"producer": MagicMock()}),
        patch("src.handlers.handler_health.writer_eventbridge.STATE", {"client": MagicMock(), "event_bus_arn": "arn"}),
        patch("src.handlers.handler_health.writer_postgres.POSTGRES", postgres_config),
    ):
        response = handler.get_health()

    # uptime_seconds is only part of the healthy (200) body, so confirm that branch was taken.
    assert response["statusCode"] == 200
    body = json.loads(response["body"])
    assert "uptime_seconds" in body
    assert isinstance(body["uptime_seconds"], int)
    assert body["uptime_seconds"] >= 0
152+
153+
154+
## Integration test with event_gate_module
155+
def test_health_endpoint_integration(event_gate_module, make_event):
    """Route a /health request through lambda_handler and expect a healthy response."""
    health_event = make_event("/health")
    result = event_gate_module.lambda_handler(health_event)

    # 200 is expected because conftest is said to mock the writers as initialized
    assert result["statusCode"] == 200
    payload = json.loads(result["body"])
    assert payload["status"] == "ok"
    assert "uptime_seconds" in payload

0 commit comments

Comments
 (0)