Skip to content

Commit e6649ae

Browse files
committed
fix: Config TFO-Agent & Deploy kubernetes manifest
1 parent 5f53cc2 commit e6649ae

6 files changed

Lines changed: 437 additions & 61 deletions

File tree

configs/tfo-agent.default.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -374,12 +374,12 @@ exporter:
374374

375375
# Traces export
376376
traces:
377-
enabled: false # Enable when distributed tracing is needed
377+
enabled: true
378378
# endpoint: "" # Full URL override (e.g., http://tfo-collector:4318/v2/traces)
379379

380380
# Logs export
381381
logs:
382-
enabled: false # Enable when log forwarding is needed
382+
enabled: true
383383
# endpoint: "" # Full URL override (e.g., http://tfo-collector:4318/v2/logs)
384384

385385
# =============================================================================

configs/tfo-agent.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -375,12 +375,12 @@ exporter:
375375

376376
# Traces export
377377
traces:
378-
enabled: false # Enable when distributed tracing is needed
378+
enabled: true
379379
# endpoint: "" # Full URL override (e.g., http://tfo-collector:4318/v2/traces)
380380

381381
# Logs export
382382
logs:
383-
enabled: false # Enable when log forwarding is needed
383+
enabled: true
384384
# endpoint: "" # Full URL override (e.g., http://tfo-collector:4318/v2/logs)
385385

386386
# =============================================================================

deploy/kubernetes/configmap.yaml

Lines changed: 93 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
---
2-
# TFO-Agent ConfigMap
3-
# Kubernetes-enabled configuration
2+
# TFO-Agent ConfigMap (Node DaemonSet)
3+
# Per-node OS metrics: node_exporter ON, kubernetes collector OFF.
4+
# Auth and endpoint are injected via env vars (TELEMETRYFLOW_API_KEY_ID/SECRET/ENDPOINT).
45

56
apiVersion: v1
67
kind: ConfigMap
@@ -12,84 +13,138 @@ metadata:
1213
app.kubernetes.io/component: monitoring
1314
data:
1415
tfo-agent.yaml: |
15-
telemetryflow:
16-
api_key_id: ""
17-
api_key_secret: ""
18-
endpoint: "tfo-backend.telemetryflow.svc.cluster.local:4317"
19-
protocol: grpc
20-
tls:
21-
enabled: false
22-
2316
agent:
24-
name: "TFO-Agent (Kubernetes)"
17+
description: "TFO Agent - ${NODE_NAME}"
2518
tags:
26-
environment: production
27-
deployment: kubernetes
19+
environment: "${ENVIRONMENT}"
20+
cluster: "${CLUSTER_NAME}"
2821
2922
heartbeat:
3023
interval: 60s
24+
timeout: 10s
3125
include_system_info: true
3226
3327
collectors:
34-
system:
28+
node_exporter:
3529
enabled: true
3630
interval: 15s
3731
cpu: true
3832
memory: true
39-
disk: true
33+
disk_io: true
34+
filesystem: true
4035
network: true
36+
load_avg: true
37+
thermal: false
38+
textfile: false
39+
conntrack: false
40+
psi: false
41+
vmstat: false
42+
sockstat: false
43+
44+
# kubernetes is handled by the separate K8s Collector Deployment (tfo-agent-k8s)
45+
kubernetes:
46+
enabled: false
47+
48+
ebpf:
49+
enabled: false
50+
51+
prometheus_server:
52+
enabled: true
53+
port: 8888
54+
path: /metrics
55+
56+
exporter:
57+
otlp:
58+
enabled: true
59+
batch_size: 100
60+
flush_interval: 10s
61+
compression: gzip
62+
63+
buffer:
64+
enabled: true
65+
max_size_mb: 100
66+
path: /var/lib/tfo-agent/buffer
67+
flush_interval: 30s
68+
69+
logging:
70+
level: info
71+
format: json
72+
73+
---
74+
# TFO-Agent K8s Collector ConfigMap
75+
# Single-replica Deployment: kubernetes collector ON, node_exporter OFF.
76+
# Connects to TFO Platform backend REST API (not the OTLP Collector port).
77+
# Auth and endpoint are injected via env vars.
78+
79+
apiVersion: v1
80+
kind: ConfigMap
81+
metadata:
82+
name: tfo-agent-k8s-config
83+
namespace: telemetryflow
84+
labels:
85+
app.kubernetes.io/name: tfo-agent
86+
app.kubernetes.io/component: k8s-collector
87+
data:
88+
tfo-agent.yaml: |
89+
agent:
90+
description: "TFO K8s Collector - ${CLUSTER_NAME}"
91+
tags:
92+
environment: "${ENVIRONMENT}"
93+
cluster: "${CLUSTER_NAME}"
94+
95+
heartbeat:
96+
interval: 60s
97+
timeout: 10s
98+
99+
collectors:
100+
node_exporter:
101+
enabled: false
41102
42103
kubernetes:
43104
enabled: true
44105
interval: 30s
45-
kubeconfig: ""
106+
kubeconfig: "" # empty = in-cluster ServiceAccount auto-detection
46107
context: ""
47108
namespaces: []
48109
exclude_namespaces:
49110
- kube-system
111+
- kube-public
112+
- kube-node-lease
50113
nodes: true
51114
pods: true
52115
deployments: true
53116
namespaces_collect: true
54117
storage: true
55118
services: true
56119
workloads: true
57-
metrics_api: true
120+
events: true
121+
resource_counts: true
122+
network: true # Kubelet /stats/summary (requires nodes/proxy RBAC)
123+
metrics_api: true # CPU/Memory usage from metrics-server (set false if not installed)
58124
sync_to_backend: true
59125
sync_interval: 60s
60-
cluster_name: ""
61-
cluster_provider: ""
126+
cluster_name: "" # auto-detected from CLUSTER_NAME env or hostname
127+
cluster_provider: "" # auto-detected from env/filesystem heuristics
128+
# cluster_id is auto-registered on startup (find-or-create)
129+
# Set TELEMETRYFLOW_K8S_CLUSTER_ID env var to skip auto-registration
130+
131+
ebpf:
132+
enabled: false
62133
63134
prometheus_server:
64135
enabled: true
65136
port: 8888
66137
path: /metrics
67-
include_go_metrics: true
68-
include_process_metrics: true
69-
metric_prefix: tfo
70-
read_timeout: 10s
71-
write_timeout: 10s
72138
73139
exporter:
74140
otlp:
75-
enabled: true
76-
endpoint_version: v2
77-
batch_size: 100
78-
flush_interval: 10s
79-
compression: gzip
80-
metrics:
81-
enabled: true
82-
traces:
83-
enabled: false
84-
logs:
85-
enabled: false
141+
enabled: false # K8s state syncs directly to backend REST API, not via OTLP
86142
87143
buffer:
88144
enabled: true
89-
max_size_mb: 100
145+
max_size_mb: 50
90146
path: /var/lib/tfo-agent/buffer
91-
max_age: 24h
92-
flush_interval: 5s
147+
flush_interval: 30s
93148
94149
logging:
95150
level: info

deploy/kubernetes/daemonset.yaml

Lines changed: 74 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
---
22
# TFO-Agent DaemonSet
3-
# Runs one agent pod per node for comprehensive monitoring
3+
# Runs one agent pod per node for per-node OS metrics (node_exporter).
4+
# Kubernetes cluster state is collected separately by deployment-k8s.yaml.
45

56
apiVersion: apps/v1
67
kind: DaemonSet
@@ -30,16 +31,40 @@ spec:
3031
prometheus.io/path: "/metrics"
3132
spec:
3233
serviceAccountName: tfo-agent
34+
automountServiceAccountToken: true
3335
terminationGracePeriodSeconds: 30
3436

3537
# Tolerate all taints to run on every node (including control-plane)
3638
tolerations:
3739
- operator: Exists
3840

39-
# Node affinity: prefer Linux nodes
41+
# Run on Linux nodes only
4042
nodeSelector:
4143
kubernetes.io/os: linux
4244

45+
# Wait for tfo-backend to be ready before starting the agent.
46+
# This prevents heartbeat/auth failures on cluster cold-start when the agent
47+
# pod comes up before the backend service is accepting requests.
48+
initContainers:
49+
- name: wait-for-backend
50+
image: busybox:1.36
51+
command:
52+
- sh
53+
- -c
54+
- |
55+
until wget -qO- http://tfo-backend.telemetryflow.svc.cluster.local:3100/api/v2/health >/dev/null 2>&1; do
56+
echo "Waiting for tfo-backend..."; sleep 5
57+
done
58+
echo "tfo-backend is ready."
59+
securityContext:
60+
runAsNonRoot: true
61+
runAsUser: 65534
62+
readOnlyRootFilesystem: true
63+
allowPrivilegeEscalation: false
64+
capabilities:
65+
drop:
66+
- ALL
67+
4368
containers:
4469
- name: tfo-agent
4570
image: telemetryflow/telemetryflow-agent:latest
@@ -56,7 +81,7 @@ spec:
5681
protocol: TCP
5782

5883
env:
59-
# Inject node name and pod info via Downward API
84+
# Downward API — node and pod identity
6085
- name: NODE_NAME
6186
valueFrom:
6287
fieldRef:
@@ -86,19 +111,48 @@ spec:
86111
valueFrom:
87112
fieldRef:
88113
fieldPath: status.podIP
89-
# API credentials from secret
90-
- name: TFAGENT_TELEMETRYFLOW_API_KEY_ID
114+
115+
# TFO Platform endpoint — must point to backend REST API (NOT the OTLP collector port)
116+
# The agent uses this URL for: heartbeat, cluster auto-registration, and K8s state sync.
117+
# Format: http://<backend-service>:<port>/api/v2
118+
- name: TELEMETRYFLOW_ENDPOINT
119+
value: "http://tfo-backend.telemetryflow.svc.cluster.local:3100/api/v2"
120+
121+
# API credentials from Secret
122+
# Create secret: kubectl create secret generic tfo-agent-credentials \
123+
# --from-literal=api-key-id=tfk_xxx \
124+
# --from-literal=api-key-secret=tfs_xxx \
125+
# -n telemetryflow
126+
- name: TELEMETRYFLOW_API_KEY_ID
91127
valueFrom:
92128
secretKeyRef:
93129
name: tfo-agent-credentials
94130
key: api-key-id
95-
optional: true
96-
- name: TFAGENT_TELEMETRYFLOW_API_KEY_SECRET
131+
optional: false
132+
- name: TELEMETRYFLOW_API_KEY_SECRET
97133
valueFrom:
98134
secretKeyRef:
99135
name: tfo-agent-credentials
100136
key: api-key-secret
101-
optional: true
137+
optional: false
138+
139+
# Per-workload collector toggles (override YAML config)
140+
- name: TELEMETRYFLOW_NODE_EXPORTER_ENABLED
141+
value: "true"
142+
- name: TELEMETRYFLOW_K8S_ENABLED
143+
value: "false" # K8s state handled by tfo-agent-k8s Deployment (deployment-k8s.yaml)
144+
145+
# Prometheus server — its /metrics endpoint on port 8888 is the target of the liveness/readiness probes
146+
- name: TELEMETRYFLOW_PROMETHEUS_ENABLED
147+
value: "true"
148+
- name: TELEMETRYFLOW_PROMETHEUS_PORT
149+
value: "8888"
150+
151+
# Cluster and environment tags for OTEL resource attributes
152+
- name: CLUSTER_NAME
153+
value: "" # set to your cluster name; when left empty, it is auto-detected from the hostname
154+
- name: ENVIRONMENT
155+
value: "production"
102156

103157
resources:
104158
requests:
@@ -110,27 +164,29 @@ spec:
110164

111165
livenessProbe:
112166
httpGet:
113-
path: /ready
114-
port: metrics
167+
path: /metrics
168+
port: 8888
115169
initialDelaySeconds: 15
116170
periodSeconds: 30
117171
timeoutSeconds: 5
172+
failureThreshold: 3
118173

119174
readinessProbe:
120175
httpGet:
121-
path: /ready
122-
port: metrics
176+
path: /metrics
177+
port: 8888
123178
initialDelaySeconds: 5
124179
periodSeconds: 10
125180
timeoutSeconds: 3
181+
failureThreshold: 3
126182

127183
volumeMounts:
128184
- name: config
129185
mountPath: /etc/tfo-agent
130186
readOnly: true
131187
- name: buffer
132188
mountPath: /var/lib/tfo-agent/buffer
133-
# Host filesystem for system metrics
189+
# Host filesystem for node_exporter metrics
134190
- name: proc
135191
mountPath: /host/proc
136192
readOnly: true
@@ -143,12 +199,15 @@ spec:
143199
mountPropagation: HostToContainer
144200

145201
securityContext:
202+
runAsUser: 0 # required to read /proc and /sys for node metrics
203+
runAsGroup: 0
146204
readOnlyRootFilesystem: true
147-
runAsNonRoot: true
148-
runAsUser: 65534
205+
allowPrivilegeEscalation: false
149206
capabilities:
150207
drop:
151208
- ALL
209+
add:
210+
- SYS_PTRACE # process inspection for node metrics
152211

153212
volumes:
154213
- name: config

0 commit comments

Comments
 (0)