Skip to content

Commit 7d1e497

Browse files
committed
improvement(helm): postgres startupProbe + otel-collector NetworkPolicy
- add startupProbe defaults for both the postgresql and copilot-postgresql StatefulSets to shield liveness from slow first boot (pgvector init, WAL replay)
- render a dedicated NetworkPolicy for the otel-collector when telemetry.enabled=true (OTLP ingress from app/realtime/copilot; DNS + HTTPS egress for forwarding to external observability backends)
- document why copilot and copilot-postgresql intentionally do NOT ship dedicated NetworkPolicies (the Redis URL is unknowable at render time)
- add a regression test that pins the otel-collector NetworkPolicy at documentIndex 3
1 parent 72648ac commit 7d1e497

5 files changed

Lines changed: 128 additions & 2 deletions

File tree

helm/sim/templates/networkpolicy.yaml

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,4 +303,89 @@ spec:
303303
- protocol: TCP
304304
port: 443
305305
{{- end }}
306+
307+
{{- if .Values.telemetry.enabled }}
308+
---
309+
# Network Policy for OpenTelemetry Collector
310+
apiVersion: networking.k8s.io/v1
311+
kind: NetworkPolicy
312+
metadata:
313+
name: {{ include "sim.fullname" . }}-otel-collector
314+
namespace: {{ .Release.Namespace }}
315+
labels:
316+
{{- include "sim.labels" . | nindent 4 }}
317+
app.kubernetes.io/component: telemetry
318+
spec:
319+
podSelector:
320+
matchLabels:
321+
{{- include "sim.selectorLabels" . | nindent 6 }}
322+
app.kubernetes.io/component: telemetry
323+
policyTypes:
324+
- Ingress
325+
- Egress
326+
ingress:
327+
# OTLP from app
328+
- from:
329+
- podSelector:
330+
matchLabels:
331+
{{- include "sim.app.selectorLabels" . | nindent 10 }}
332+
ports:
333+
- protocol: TCP
334+
port: 4317
335+
- protocol: TCP
336+
port: 4318
337+
# OTLP from realtime
338+
{{- if .Values.realtime.enabled }}
339+
- from:
340+
- podSelector:
341+
matchLabels:
342+
{{- include "sim.realtime.selectorLabels" . | nindent 10 }}
343+
ports:
344+
- protocol: TCP
345+
port: 4317
346+
- protocol: TCP
347+
port: 4318
348+
{{- end }}
349+
# OTLP from copilot
350+
{{- if .Values.copilot.enabled }}
351+
- from:
352+
- podSelector:
353+
matchLabels:
354+
{{- include "sim.selectorLabels" . | nindent 10 }}
355+
app.kubernetes.io/component: copilot
356+
ports:
357+
- protocol: TCP
358+
port: 4317
359+
- protocol: TCP
360+
port: 4318
361+
{{- end }}
362+
egress:
363+
# DNS
364+
- to: []
365+
ports:
366+
- protocol: UDP
367+
port: 53
368+
- protocol: TCP
369+
port: 53
370+
# HTTPS for forwarding to external observability backends (Datadog, Honeycomb, etc.)
371+
- to:
372+
- ipBlock:
373+
cidr: 0.0.0.0/0
374+
except:
375+
{{- range (default (list "169.254.169.254/32" "169.254.170.2/32") .Values.networkPolicy.egressExceptCidrs) }}
376+
- {{ . | quote }}
377+
{{- end }}
378+
ports:
379+
- protocol: TCP
380+
port: 443
381+
{{- end }}
382+
383+
{{- /*
384+
Copilot + copilot-postgresql intentionally do NOT ship dedicated NetworkPolicies.
385+
Copilot requires REDIS_URL (external Redis on a non-443 port), and the chart
386+
cannot know the user's Redis host/port at render time — a policy limited to the
387+
chart's default egress rules would silently block Redis on most installs. Users running networkPolicy.enabled=true
388+
with copilot enabled should add their own NPs (or extend networkPolicy.egress
389+
with the appropriate egress rules).
390+
*/}}
306391
{{- end }}

helm/sim/templates/statefulset-copilot-postgres.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,10 @@ spec:
115115
envFrom:
116116
- secretRef:
117117
name: {{ include "sim.fullname" . }}-copilot-postgresql-secret
118+
{{- if .Values.copilot.postgresql.startupProbe }}
119+
startupProbe:
120+
{{- toYaml .Values.copilot.postgresql.startupProbe | nindent 12 }}
121+
{{- end }}
118122
{{- if .Values.copilot.postgresql.livenessProbe }}
119123
livenessProbe:
120124
{{- toYaml .Values.copilot.postgresql.livenessProbe | nindent 12 }}

helm/sim/templates/statefulset-postgresql.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ spec:
140140
name: {{ include "sim.fullname" . }}-postgresql-env
141141
- secretRef:
142142
name: {{ include "sim.postgresqlSecretName" . }}
143+
{{- if .Values.postgresql.startupProbe }}
144+
startupProbe:
145+
{{- toYaml .Values.postgresql.startupProbe | nindent 12 }}
146+
{{- end }}
143147
{{- if .Values.postgresql.livenessProbe }}
144148
livenessProbe:
145149
{{- toYaml .Values.postgresql.livenessProbe | nindent 12 }}

helm/sim/tests/networkpolicy_test.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,19 @@ tests:
128128
- protocol: TCP
129129
port: 3000
130130

131+
- it: telemetry collector NetworkPolicy renders when telemetry.enabled=true
132+
set:
133+
<<: *defaults
134+
telemetry.enabled: true
135+
documentIndex: 3
136+
asserts:
137+
- equal:
138+
path: kind
139+
value: NetworkPolicy
140+
- equal:
141+
path: metadata.name
142+
value: t-sim-otel-collector
143+
131144
- it: networkPolicy.egress (custom rules) are appended to both app and realtime NetworkPolicies
132145
set:
133146
<<: *defaults

helm/sim/values.yaml

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -622,12 +622,22 @@ postgresql:
622622
targetPort: 5432
623623

624624
# Health checks
625+
# startupProbe shields liveness from slow first-boot scenarios (pgvector
626+
# extension init, WAL replay after a crash on a large data dir). Gives
627+
# postgres up to 150s (failureThreshold 30 * periodSeconds 5s) to start; liveness and readiness probes are held off until it succeeds.
628+
startupProbe:
629+
exec:
630+
command: ["pg_isready", "-U", "postgres", "-d", "sim"]
631+
periodSeconds: 5
632+
failureThreshold: 30
633+
timeoutSeconds: 5
634+
625635
livenessProbe:
626636
exec:
627637
command: ["pg_isready", "-U", "postgres", "-d", "sim"]
628638
initialDelaySeconds: 10
629639
periodSeconds: 5
630-
640+
631641
readinessProbe:
632642
exec:
633643
command: ["pg_isready", "-U", "postgres", "-d", "sim"]
@@ -1440,14 +1450,24 @@ copilot:
14401450
targetPort: 5432
14411451

14421452
# Health checks
1453+
# startupProbe shields liveness from slow first-boot scenarios (pgvector
1454+
# extension init, WAL replay after a crash). Gives postgres up to 150s
1455+
# (failureThreshold 30 * periodSeconds 5s) to start; liveness and readiness probes are held off until it succeeds.
1456+
startupProbe:
1457+
exec:
1458+
command: ["pg_isready", "-U", "copilot", "-d", "copilot"]
1459+
periodSeconds: 5
1460+
failureThreshold: 30
1461+
timeoutSeconds: 5
1462+
14431463
livenessProbe:
14441464
exec:
14451465
command: ["pg_isready", "-U", "copilot", "-d", "copilot"]
14461466
initialDelaySeconds: 10
14471467
periodSeconds: 5
14481468
timeoutSeconds: 5
14491469
failureThreshold: 10
1450-
1470+
14511471
readinessProbe:
14521472
exec:
14531473
command: ["pg_isready", "-U", "copilot", "-d", "copilot"]

0 commit comments

Comments
 (0)