# alerts.yaml
---
# PrometheusRule for GitOps alerts.
#
# Defines three rule groups:
#   - argocd:            Argo CD application sync / health alerts
#   - argo-rollouts:     rollout phase alerts
#   - deployment-health: HTTP error-rate and latency alerts
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: gitops-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus Operator's rule
    # selector matches on -- confirm against the Prometheus resource.
    release: prometheus
spec:
  groups:
    - name: argocd
      rules:
        # An application has reported sync_status="OutOfSync" continuously
        # for 15 minutes.
        - alert: ArgoCDAppOutOfSync
          expr: argocd_app_info{sync_status="OutOfSync"} == 1
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "ArgoCD app {{ $labels.name }} is out of sync"
            description: "Application {{ $labels.name }} in project {{ $labels.project }} has been out of sync for more than 15 minutes."

        # An application has reported a health_status of Degraded or Missing
        # continuously for 5 minutes.
        - alert: ArgoCDAppHealthDegraded
          expr: argocd_app_info{health_status=~"Degraded|Missing"} == 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "ArgoCD app {{ $labels.name }} is degraded"
            description: "Application {{ $labels.name }} health status is {{ $labels.health_status }}."

        # The phase="Failed" sync counter increased within the last hour.
        # There is no `for:` clause, so the alert fires as soon as the
        # expression is true and stays active while the increase remains
        # inside the 1h window.
        - alert: ArgoCDSyncFailed
          expr: increase(argocd_app_sync_total{phase="Failed"}[1h]) > 0
          labels:
            severity: critical
          annotations:
            summary: "ArgoCD sync failed for {{ $labels.name }}"
            description: "Application {{ $labels.name }} sync has failed in the last hour."

    # NOTE(review): the metric name kube_rollout_status_phase and the
    # `rollout` label used below are not defined in this file -- confirm they
    # match what the rollout metrics exporter actually emits.
    - name: argo-rollouts
      rules:
        # A rollout has been in phase="Paused" continuously for 30 minutes.
        - alert: RolloutStalled
          expr: kube_rollout_status_phase{phase="Paused"} == 1
          for: 30m
          labels:
            severity: warning
          annotations:
            summary: "Rollout {{ $labels.rollout }} is stalled"
            description: "Rollout {{ $labels.rollout }} in namespace {{ $labels.namespace }} has been paused for more than 30 minutes."

        # A rollout has been in phase="Degraded" continuously for 5 minutes.
        - alert: RolloutFailed
          expr: kube_rollout_status_phase{phase="Degraded"} == 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Rollout {{ $labels.rollout }} failed"
            description: "Rollout {{ $labels.rollout }} in namespace {{ $labels.namespace }} is in degraded state."

    - name: deployment-health
      rules:
        # Ratio of the 5xx request rate to the total request rate, computed
        # per (namespace, service) over 5m windows, has stayed above 0.01
        # (1%) for 5 minutes.
        - alert: HighErrorRate
          expr: |
            sum(rate(http_requests_total{status=~"5.."}[5m])) by (namespace, service)
            /
            sum(rate(http_requests_total[5m])) by (namespace, service)
            > 0.01
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "High error rate for {{ $labels.service }}"
            description: "Service {{ $labels.service }} in {{ $labels.namespace }} has error rate above 1%."

        # 0.99 quantile of request duration, estimated from histogram buckets
        # aggregated per (namespace, service), has stayed above 0.5 for
        # 5 minutes. `le` must remain in the grouping for histogram_quantile.
        - alert: HighLatency
          expr: |
            histogram_quantile(0.99,
              sum(rate(http_request_duration_seconds_bucket[5m])) by (namespace, service, le)
            ) > 0.5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High latency for {{ $labels.service }}"
            description: "Service {{ $labels.service }} P99 latency is above 500ms."