---
# TFO-Agent DaemonSet
# Runs one agent pod per node for per-node OS metrics (node_exporter).
# Kubernetes cluster state is collected separately by deployment-k8s.yaml.

apiVersion: apps/v1
kind: DaemonSet
@@ -30,16 +31,40 @@ spec:
3031 prometheus.io/path : " /metrics"
3132 spec :
# Pod identity: run as the tfo-agent service account and mount its API token.
serviceAccountName: tfo-agent
automountServiceAccountToken: true
# Seconds the kubelet waits after SIGTERM before force-killing the pod.
terminationGracePeriodSeconds: 30
3436
# Tolerate all taints so the agent runs on every node (including control-plane).
# A toleration with only `operator: Exists` (no key) matches any taint.
tolerations:
  - operator: Exists
3840
# Run on Linux nodes only (hard requirement via nodeSelector, not a preference).
nodeSelector:
  kubernetes.io/os: linux
4244
# Wait for tfo-backend to be ready before starting the agent.
# This prevents heartbeat/auth failures on cluster cold-start when the agent
# pod comes up before the backend service is accepting requests.
initContainers:
  - name: wait-for-backend
    image: busybox:1.36
    command:
      - sh
      - -c
      # Poll the backend health endpoint every 5 seconds until it answers.
      # `-T 5` bounds each attempt so a stalled connection cannot block the
      # retry loop; output goes to stdout only, so a read-only root
      # filesystem is sufficient.
      - |
        until wget -q -T 5 -O- http://tfo-backend.telemetryflow.svc.cluster.local:3100/api/v2/health >/dev/null 2>&1; do
          echo "Waiting for tfo-backend..."; sleep 5
        done
        echo "tfo-backend is ready."
    # Unprivileged helper: non-root UID, read-only rootfs, no capabilities.
    securityContext:
      runAsNonRoot: true
      runAsUser: 65534
      readOnlyRootFilesystem: true
      allowPrivilegeEscalation: false
      capabilities:
        drop:
          - ALL
67+
4368 containers :
4469 - name : tfo-agent
4570 image : telemetryflow/telemetryflow-agent:latest
5681 protocol : TCP
5782
5883 env :
59- # Inject node name and pod info via Downward API
84+ # Downward API — node and pod identity
6085 - name : NODE_NAME
6186 valueFrom :
6287 fieldRef :
@@ -86,19 +111,48 @@ spec:
86111 valueFrom :
87112 fieldRef :
88113 fieldPath : status.podIP
# TFO Platform endpoint — must point to the backend REST API (NOT the OTLP collector port).
# The agent uses this URL for: heartbeat, cluster auto-registration, and K8s state sync.
# Format: http://<backend-service>:<port>/api/v2
- name: TELEMETRYFLOW_ENDPOINT
  value: "http://tfo-backend.telemetryflow.svc.cluster.local:3100/api/v2"

# API credentials from Secret (required: `optional: false` on both keys).
# Create secret: kubectl create secret generic tfo-agent-credentials \
#   --from-literal=api-key-id=tfk_xxx \
#   --from-literal=api-key-secret=tfs_xxx \
#   -n telemetryflow
- name: TELEMETRYFLOW_API_KEY_ID
  valueFrom:
    secretKeyRef:
      name: tfo-agent-credentials
      key: api-key-id
      optional: false
- name: TELEMETRYFLOW_API_KEY_SECRET
  valueFrom:
    secretKeyRef:
      name: tfo-agent-credentials
      key: api-key-secret
      optional: false

# Per-workload collector toggles (override YAML config).
# Values are quoted because env var values must be strings.
- name: TELEMETRYFLOW_NODE_EXPORTER_ENABLED
  value: "true"
- name: TELEMETRYFLOW_K8S_ENABLED
  value: "false"  # K8s state handled by tfo-agent-k8s Deployment (deployment-k8s.yaml)

# Prometheus server for liveness/readiness probes
- name: TELEMETRYFLOW_PROMETHEUS_ENABLED
  value: "true"
- name: TELEMETRYFLOW_PROMETHEUS_PORT
  value: "8888"

# Cluster and environment tags for OTEL resource attributes
- name: CLUSTER_NAME
  value: ""  # set to your cluster name; when left empty it is auto-detected from hostname
- name: ENVIRONMENT
  value: "production"
102156
103157 resources :
104158 requests :
@@ -110,27 +164,29 @@ spec:
110164
# Liveness: HTTP GET /metrics on port 8888, the value this manifest assigns to
# TELEMETRYFLOW_PROMETHEUS_PORT. Checked every 30s after a 15s initial delay;
# the container is restarted after 3 consecutive failures.
livenessProbe:
  httpGet:
    path: /metrics
    port: 8888
  initialDelaySeconds: 15
  periodSeconds: 30
  timeoutSeconds: 5
  failureThreshold: 3
118173
# Readiness: HTTP GET /metrics on port 8888, the value this manifest assigns to
# TELEMETRYFLOW_PROMETHEUS_PORT. Checked every 10s after a 5s initial delay;
# the pod is marked not-ready after 3 consecutive failures.
readinessProbe:
  httpGet:
    path: /metrics
    port: 8888
  initialDelaySeconds: 5
  periodSeconds: 10
  timeoutSeconds: 3
  failureThreshold: 3
126182
127183 volumeMounts :
128184 - name : config
129185 mountPath : /etc/tfo-agent
130186 readOnly : true
131187 - name : buffer
132188 mountPath : /var/lib/tfo-agent/buffer
133- # Host filesystem for system metrics
189+ # Host filesystem for node_exporter metrics
134190 - name : proc
135191 mountPath : /host/proc
136192 readOnly : true
@@ -143,12 +199,15 @@ spec:
143199 mountPropagation : HostToContainer
144200
# Agent container runs as root with a read-only root filesystem and a single
# added capability; everything else is dropped.
securityContext:
  runAsUser: 0  # required to read /proc and /sys for node metrics
  runAsGroup: 0
  readOnlyRootFilesystem: true
  allowPrivilegeEscalation: false
  capabilities:
    drop:
      - ALL
    add:
      - SYS_PTRACE  # process inspection for node metrics
152211
153212 volumes :
154213 - name : config
0 commit comments