From 25809931415aea4416ef4e562dd0d839f21c323f Mon Sep 17 00:00:00 2001 From: lucaconsalvi Date: Thu, 26 Feb 2026 10:58:44 +0100 Subject: [PATCH 1/2] Add shift-week-kepler demo files and configurations --- .claude/commands/tnf-power.md | 199 +++++++ deploy/Makefile | 12 +- deploy/openshift-clusters/kepler.yml | 105 ++++ .../roles/kepler/defaults/main.yml | 26 + .../roles/kepler/tasks/grafana.yml | 354 +++++++++++ .../roles/kepler/tasks/main.yml | 295 +++++++++ .../roles/kepler/tasks/monitoring.yml | 94 +++ .../kepler/templates/kepler-daemonset.yaml.j2 | 111 ++++ .../templates/tnf-power-dashboard-cm.yaml.j2 | 560 ++++++++++++++++++ .../scripts/deploy-kepler.sh | 63 ++ .../scripts/remove-kepler.sh | 51 ++ docs/kepler/KEPLER-ARCHITECTURE.md | 319 ++++++++++ docs/kepler/KEPLER-PRESENTATION.md | 362 +++++++++++ docs/kepler/README.md | 361 +++++++++++ 14 files changed, 2911 insertions(+), 1 deletion(-) create mode 100644 .claude/commands/tnf-power.md create mode 100644 deploy/openshift-clusters/kepler.yml create mode 100644 deploy/openshift-clusters/roles/kepler/defaults/main.yml create mode 100644 deploy/openshift-clusters/roles/kepler/tasks/grafana.yml create mode 100644 deploy/openshift-clusters/roles/kepler/tasks/main.yml create mode 100644 deploy/openshift-clusters/roles/kepler/tasks/monitoring.yml create mode 100644 deploy/openshift-clusters/roles/kepler/templates/kepler-daemonset.yaml.j2 create mode 100644 deploy/openshift-clusters/roles/kepler/templates/tnf-power-dashboard-cm.yaml.j2 create mode 100755 deploy/openshift-clusters/scripts/deploy-kepler.sh create mode 100755 deploy/openshift-clusters/scripts/remove-kepler.sh create mode 100644 docs/kepler/KEPLER-ARCHITECTURE.md create mode 100644 docs/kepler/KEPLER-PRESENTATION.md create mode 100644 docs/kepler/README.md diff --git a/.claude/commands/tnf-power.md b/.claude/commands/tnf-power.md new file mode 100644 index 00000000..6b991960 --- /dev/null +++ b/.claude/commands/tnf-power.md @@ -0,0 +1,199 @@ +--- 
+description: Show TNF cluster power consumption from Kepler metrics +--- + +You are generating a power consumption report for a TNF (Two Nodes with Fencing) cluster using Kepler metrics. + +## Step 0: Setup Cluster Access + +**IMPORTANT**: Before running any `oc` commands, you MUST ensure KUBECONFIG is set. + +First, check if cluster access works: +```bash +oc get nodes 2>&1 | head -3 +``` + +If you get an error about missing config, look for and source the proxy.env file: + +```bash +# Find proxy.env in the repository +PROXY_ENV=$(find . -name "proxy.env" -type f 2>/dev/null | head -1) +if [ -n "$PROXY_ENV" ]; then + echo "Found: $PROXY_ENV" + source "$PROXY_ENV" + echo "KUBECONFIG=$KUBECONFIG" +fi +``` + +If proxy.env doesn't exist, check common locations: +- `deploy/openshift-clusters/proxy.env` +- Look for KUBECONFIG in the dev-scripts directory + +Only proceed to Step 1 after `oc get nodes` works successfully. + +## Prerequisites + +Before running queries, verify: +1. Kepler is deployed: `oc get pods -n kepler` +2. User workload monitoring is running: `oc get pods -n openshift-user-workload-monitoring` +3. 
KUBECONFIG is set (handled in Step 0) + +## Query Steps + +### Step 1: Check Kepler Status + +First, verify Kepler is running and metrics are being scraped: + +```bash +# Check Kepler pods +oc get pods -n kepler -l app.kubernetes.io/name=kepler-exporter + +# Check if metrics are being scraped (should return 2 targets for TNF) +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s http://localhost:9090/api/v1/targets 2>/dev/null | \ + jq '[.data.activeTargets[] | select(.labels.job == "kepler-exporter")] | length' +``` + +### Step 2: Check Power Measurement Mode + +Determine if we're getting real or estimated power: + +```bash +# Check if RAPL is available (real power) or not (estimated) +POD=$(oc get pods -n kepler -l app.kubernetes.io/name=kepler-exporter -o jsonpath='{.items[0].metadata.name}') +RAPL_CHECK=$(oc exec -n kepler $POD -- ls /sys/class/powercap/intel-rapl 2>/dev/null || echo "NOT_FOUND") +if [ -z "$RAPL_CHECK" ] || [ "$RAPL_CHECK" = "NOT_FOUND" ]; then + echo "Mode: ESTIMATED (VMs - no RAPL hardware access)" +else + echo "Mode: REAL (Bare metal - RAPL available)" +fi +``` + +### Step 3: Query Power Metrics + +Run these queries against the user workload Prometheus: + +```bash +# Total cluster power (watts) - sum of node CPU power +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=sum(kepler_node_cpu_watts)' | \ + jq -r '.data.result[0].value[1] // "0"' + +# Power by node (using node CPU watts) +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=sum%20by%20(instance)(kepler_node_cpu_watts)' | \ + jq -r '.data.result[] | "\(.metric.instance): \(.value[1])W"' + +# Top 10 containers by power +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 
'http://localhost:9090/api/v1/query?query=topk(10,kepler_container_cpu_watts)' | \ + jq -r '.data.result[] | "\(.metric.container_name): \(.value[1])W"' + +# Power over time using joules (rate gives watts) +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=sum(rate(kepler_node_cpu_joules_total[5m]))' | \ + jq -r '.data.result[0].value[1] // "0"' +``` + +**Note on metrics**: +- `kepler_node_cpu_watts` - Instantaneous CPU power per node +- `kepler_container_cpu_watts` - Instantaneous CPU power per container +- `kepler_node_cpu_joules_total` - Cumulative energy (use rate() for watts) + +In **estimation mode** (VMs without RAPL), values will be very small (microwatts to milliwatts) because they're based on CPU activity models, not real power measurements. On **bare metal with RAPL**, expect realistic values (tens to hundreds of watts). + +### Step 4: Get Kepler Build Info + +```bash +# Kepler version and configuration +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=kepler_build_info' | \ + jq -r '.data.result[0].metric | "Version: \(.version), Branch: \(.branch)"' +``` + +## Output Format + +Present the results in this format: + +``` +## TNF Cluster Power Report + +**Measurement Mode**: [REAL/ESTIMATED] +**Kepler Version**: [version] +**Report Time**: [current time] + +### Cluster Summary +| Metric | Value | +|--------|-------| +| Total Power | XX.X W | +| Nodes Monitored | 2 | + +### Power by Node +| Node | Power (W) | +|------|-----------| +| master-0 (192.168.111.20) | XX.X | +| master-1 (192.168.111.21) | XX.X | + +### TNF Control Plane Overhead +| Component | Power (W) | +|-----------|-----------| +| openshift-etcd | XX.X | +| openshift-kube-apiserver | XX.X | +| openshift-machine-config-operator | XX.X | + +### Top Namespaces by Power +| Namespace | Power (W) | 
+|-----------|-----------| +| namespace-1 | XX.X | +| namespace-2 | XX.X | +| ... | ... | + +--- +*Note: [If ESTIMATED mode] Power values are ML-based estimates. +On production bare metal TNF clusters, real RAPL measurements are used.* +``` + +## Error Handling + +If Kepler is not deployed: +``` +Kepler is not deployed on this cluster. + +To deploy Kepler power monitoring: + cd deploy/openshift-clusters + ansible-playbook kepler.yml -i inventory.ini + +Or using make: + cd deploy && make deploy-kepler +``` + +If metrics are not available: +``` +Kepler pods are running but no metrics found in Prometheus. + +Check: +1. ServiceMonitor exists: oc get servicemonitor -n kepler +2. User workload monitoring is enabled +3. Wait a few minutes for metrics to be scraped + +Troubleshooting: + oc logs -n kepler -l app.kubernetes.io/name=kepler-exporter --tail=50 +``` + +## Grafana Dashboard Link + +After showing the report, remind the user about the Grafana dashboard: + +``` +For detailed visualizations, access the Grafana dashboard: + +Via port-forward (recommended for dev environments): + oc port-forward -n grafana svc/grafana 3000:3000 + Open: http://localhost:3000 + +Via route (if exposed): + URL: https://grafana-grafana.apps. 
+ +Credentials: admin / admin +Dashboard: TNF Power Monitoring +``` diff --git a/deploy/Makefile b/deploy/Makefile index 1e8e8ecf..9c2d808f 100644 --- a/deploy/Makefile +++ b/deploy/Makefile @@ -81,6 +81,12 @@ patch-nodes: get-tnf-logs: @./openshift-clusters/scripts/get-tnf-logs.sh +deploy-kepler: + @./openshift-clusters/scripts/deploy-kepler.sh + +remove-kepler: + @./openshift-clusters/scripts/remove-kepler.sh + help: @echo "Available commands:" @echo "" @@ -98,7 +104,7 @@ help: @echo "Instance Utils:" @echo " ssh - SSH into the EC2 instance" @echo " info - Display instance information" - @echo " inventory - Update inventory.ini with current instance IP" + @echo " inventory - Update inventory.ini with current instance IP" @echo "" @echo "OpenShift Cluster Deployment:" @echo " fencing-ipi - Deploy fencing IPI cluster (non-interactive)" @@ -118,4 +124,8 @@ help: @echo "" @echo "Cluster Utilities:" @echo " get-tnf-logs - Collect pacemaker and etcd logs from cluster nodes" + @echo "" + @echo "Power Monitoring:" + @echo " deploy-kepler - Deploy Kepler power monitoring (v0.11.3)" + @echo " remove-kepler - Remove Kepler power monitoring from cluster" diff --git a/deploy/openshift-clusters/kepler.yml b/deploy/openshift-clusters/kepler.yml new file mode 100644 index 00000000..9f68d669 --- /dev/null +++ b/deploy/openshift-clusters/kepler.yml @@ -0,0 +1,105 @@ +--- +# Kepler Power Monitoring Deployment for TNF Clusters +# +# This playbook deploys Kepler power monitoring with Grafana dashboards +# on TNF (Two Nodes with Fencing) clusters. 
+# +# Usage: +# ansible-playbook kepler.yml -i inventory.ini +# +# To remove Kepler: +# ansible-playbook kepler.yml -i inventory.ini -e kepler_state=absent +# +# Options: +# -e grafana_enabled=false Skip Grafana deployment +# -e kepler_state=absent Remove Kepler from cluster + +- name: Deploy Kepler Power Monitoring on TNF Cluster + hosts: localhost + connection: local + gather_facts: no + + vars: + # Load proxy configuration if proxy.env exists + proxy_env_file: "./proxy.env" + + pre_tasks: + - name: Check if proxy.env file exists + ansible.builtin.stat: + path: "{{ proxy_env_file }}" + register: proxy_env_stat + + - name: Source proxy.env and extract environment variables + ansible.builtin.shell: | + source {{ proxy_env_file }} && env | grep -E '^(KUBECONFIG|HTTP_PROXY|HTTPS_PROXY|NO_PROXY)=' + register: proxy_env_vars + when: proxy_env_stat.stat.exists + failed_when: false + changed_when: false + + - name: Parse environment variables from proxy.env + ansible.builtin.set_fact: + proxy_vars: "{{ proxy_vars | default({}) | combine({item.split('=')[0]: item.split('=')[1:]|join('=')}) }}" + loop: "{{ proxy_env_vars.stdout_lines | default([]) }}" + when: + - proxy_env_stat.stat.exists + - proxy_env_vars.stdout_lines is defined + + - name: Set proxy variables for role + ansible.builtin.set_fact: + proxy_kubeconfig: "{{ proxy_vars.KUBECONFIG | default(lookup('env', 'KUBECONFIG')) }}" + proxy_http_proxy: "{{ proxy_vars.HTTP_PROXY | default('') }}" + proxy_https_proxy: "{{ proxy_vars.HTTPS_PROXY | default('') }}" + proxy_no_proxy: "{{ proxy_vars.NO_PROXY | default('') }}" + proxy_k8s_auth_proxy: "{{ proxy_vars.HTTP_PROXY | default('') }}" + when: proxy_env_stat.stat.exists + + - name: Use environment KUBECONFIG if no proxy.env + ansible.builtin.set_fact: + proxy_kubeconfig: "{{ lookup('env', 'KUBECONFIG') }}" + when: not proxy_env_stat.stat.exists + + - name: Verify cluster access + ansible.builtin.shell: | + oc get namespace default -o name + environment: + KUBECONFIG: 
"{{ proxy_kubeconfig }}"
+        HTTP_PROXY: "{{ proxy_http_proxy | default('') }}"
+        HTTPS_PROXY: "{{ proxy_https_proxy | default('') }}"
+        NO_PROXY: "{{ proxy_no_proxy | default('') }}"
+      register: cluster_check
+      changed_when: false
+      failed_when: cluster_check.rc != 0
+
+    - name: Display deployment information
+      ansible.builtin.debug:
+        msg: |
+          Kepler Power Monitoring Deployment
+          -----------------------------------
+          State: {{ kepler_state | default('present') }}
+          Grafana: {{ grafana_enabled | default(true) }}
+          KUBECONFIG: {{ proxy_kubeconfig | default('not set') }}
+
+  roles:
+    - role: kepler
+
+  post_tasks:
+    - name: Deployment summary
+      ansible.builtin.debug:
+        msg: |
+          {% if kepler_state | default('present') == 'present' %}
+          Kepler Power Monitoring deployed successfully!
+
+          Next steps:
+          1. Wait a few minutes for metrics to be collected
+          2. Access Grafana dashboard (if enabled):
+             oc get route grafana -n grafana -o jsonpath='{.spec.host}'
+          3. Query metrics in OpenShift Console:
+             Observe -> Metrics -> kepler_node_cpu_joules_total
+
+          Useful PromQL queries (rate() of a joules counter is already watts):
+          - Total power: sum(rate(kepler_node_cpu_joules_total[5m]))
+          - Power by node: sum by (instance) (rate(kepler_node_cpu_joules_total[5m]))
+          {% else %}
+          Kepler Power Monitoring has been removed from the cluster.
+ {% endif %} diff --git a/deploy/openshift-clusters/roles/kepler/defaults/main.yml b/deploy/openshift-clusters/roles/kepler/defaults/main.yml new file mode 100644 index 00000000..5500423a --- /dev/null +++ b/deploy/openshift-clusters/roles/kepler/defaults/main.yml @@ -0,0 +1,26 @@ +--- +# Default variables for the kepler role +# These variables can be overridden when calling the role + +# Kepler configuration +kepler_namespace: "kepler" +kepler_image: "quay.io/sustainable_computing_io/kepler:v0.11.3" +kepler_port: 9188 + +# Grafana configuration +grafana_enabled: true +grafana_namespace: "grafana" +grafana_image: "docker.io/grafana/grafana:10.4.1" + +# User workload monitoring (required for ServiceMonitor) +enable_user_workload_monitoring: true + +# Scrape interval for Kepler metrics +kepler_scrape_interval: "30s" + +# State for idempotent operations (present/absent) +kepler_state: "present" + +# Wait timeouts (in retries, each retry is 10 seconds) +operator_ready_retries: 30 +daemonset_ready_retries: 30 diff --git a/deploy/openshift-clusters/roles/kepler/tasks/grafana.yml b/deploy/openshift-clusters/roles/kepler/tasks/grafana.yml new file mode 100644 index 00000000..97eb9ffa --- /dev/null +++ b/deploy/openshift-clusters/roles/kepler/tasks/grafana.yml @@ -0,0 +1,354 @@ +--- +# Deploy Grafana using oc commands (no OLM required) +# For viewing Kepler power metrics +# +# Authentication flow: +# 1. Create ServiceAccount "grafana" with cluster-monitoring-view role +# 2. Create a token Secret for the ServiceAccount +# 3. 
Use the token as Bearer auth header when querying Thanos Querier +# +# Data flow: +# Grafana -> Thanos Querier (Bearer token) -> Prometheus -> Kepler metrics + +- name: Create Grafana namespace + ansible.builtin.shell: | + oc create namespace {{ grafana_namespace }} --dry-run=client -o yaml | oc apply -f - + environment: + KUBECONFIG: "{{ kepler_kubeconfig }}" + HTTP_PROXY: "{{ proxy_http_proxy | default('') }}" + HTTPS_PROXY: "{{ proxy_https_proxy | default('') }}" + NO_PROXY: "{{ proxy_no_proxy | default('') }}" + register: ns_result + changed_when: "'created' in ns_result.stdout or 'configured' in ns_result.stdout" + +- name: Create ServiceAccount for Grafana Prometheus access + ansible.builtin.shell: | + cat </dev/null) + if [ -n "$TOKEN" ]; then + echo "$TOKEN" + exit 0 + fi + sleep 5 + done + echo "TIMEOUT" + exit 1 + environment: + KUBECONFIG: "{{ kepler_kubeconfig }}" + HTTP_PROXY: "{{ proxy_http_proxy | default('') }}" + HTTPS_PROXY: "{{ proxy_https_proxy | default('') }}" + NO_PROXY: "{{ proxy_no_proxy | default('') }}" + register: token_wait + changed_when: false + failed_when: "'TIMEOUT' in token_wait.stdout" + +- name: Set Grafana SA token fact + set_fact: + grafana_sa_token: "{{ token_wait.stdout | b64decode }}" + +- name: Create Grafana datasource ConfigMap + ansible.builtin.shell: | + cat </dev/null || echo "0") + if [ "$READY" -ge 1 ]; then + echo "READY" + exit 0 + fi + sleep 10 + done + echo "TIMEOUT" + exit 1 + environment: + KUBECONFIG: "{{ kepler_kubeconfig }}" + HTTP_PROXY: "{{ proxy_http_proxy | default('') }}" + HTTPS_PROXY: "{{ proxy_https_proxy | default('') }}" + NO_PROXY: "{{ proxy_no_proxy | default('') }}" + register: grafana_ready + changed_when: false + failed_when: "'TIMEOUT' in grafana_ready.stdout" + +- name: Get Grafana route URL + ansible.builtin.shell: | + oc get route grafana -n {{ grafana_namespace }} -o jsonpath='{.spec.host}' + environment: + KUBECONFIG: "{{ kepler_kubeconfig }}" + HTTP_PROXY: "{{ proxy_http_proxy | 
default('') }}" + HTTPS_PROXY: "{{ proxy_https_proxy | default('') }}" + NO_PROXY: "{{ proxy_no_proxy | default('') }}" + register: grafana_route + changed_when: false + +- name: Display Grafana access information + ansible.builtin.debug: + msg: | + Grafana deployed successfully! + + Access URL: https://{{ grafana_route.stdout }} + Dashboard: TNF Power Monitoring + Power Data Mode: {{ 'REAL (bare metal RAPL)' if (kepler_rapl_available | default(false) | bool) else 'ESTIMATED (VM/virtualized - no RAPL)' }} diff --git a/deploy/openshift-clusters/roles/kepler/tasks/main.yml b/deploy/openshift-clusters/roles/kepler/tasks/main.yml new file mode 100644 index 00000000..f9d19f07 --- /dev/null +++ b/deploy/openshift-clusters/roles/kepler/tasks/main.yml @@ -0,0 +1,295 @@ +--- +# Main tasks for kepler role +# Deploys Kepler power monitoring on TNF clusters using oc commands +# (Consistent with other roles - no kubernetes.core dependency) + +- name: Set KUBECONFIG environment + ansible.builtin.set_fact: + kepler_kubeconfig: "{{ proxy_kubeconfig | default(lookup('env', 'KUBECONFIG')) }}" + +- name: Set common environment for all oc commands + ansible.builtin.set_fact: + kepler_env: + KUBECONFIG: "{{ kepler_kubeconfig }}" + HTTP_PROXY: "{{ proxy_http_proxy | default('') }}" + HTTPS_PROXY: "{{ proxy_https_proxy | default('') }}" + NO_PROXY: "{{ proxy_no_proxy | default('') }}" + +- name: Deploy Kepler exporter + when: kepler_state == "present" + environment: + KUBECONFIG: "{{ kepler_kubeconfig }}" + HTTP_PROXY: "{{ proxy_http_proxy | default('') }}" + HTTPS_PROXY: "{{ proxy_https_proxy | default('') }}" + NO_PROXY: "{{ proxy_no_proxy | default('') }}" + block: + - name: Create Kepler namespace + ansible.builtin.shell: | + oc create namespace {{ kepler_namespace }} --dry-run=client -o yaml | oc apply -f - + register: ns_result + changed_when: "'created' in ns_result.stdout or 'configured' in ns_result.stdout" + + - name: Wait for namespace to be fully initialized + 
ansible.builtin.shell: | + for i in $(seq 1 30); do + ANNOTATION=$(oc get namespace {{ kepler_namespace }} -o jsonpath='{.metadata.annotations.openshift\.io/sa\.scc\.uid-range}' 2>/dev/null) + if [ -n "$ANNOTATION" ]; then + echo "READY: $ANNOTATION" + exit 0 + fi + sleep 2 + done + echo "TIMEOUT" + exit 1 + register: ns_ready + changed_when: false + failed_when: "'TIMEOUT' in ns_ready.stdout" + + - name: Create Kepler ServiceAccount + ansible.builtin.shell: | + cat </dev/null || echo "0") + if [ "$READY" -ge 1 ]; then + echo "READY:$READY" + exit 0 + fi + sleep 10 + done + echo "TIMEOUT" + exit 1 + register: ds_ready + changed_when: false + failed_when: "'TIMEOUT' in ds_ready.stdout" + + - name: Kepler exporter deployed successfully + ansible.builtin.debug: + msg: "Kepler exporter DaemonSet is running ({{ ds_ready.stdout }})" + + - name: Check if nodes have RAPL (bare metal detection) + ansible.builtin.shell: | + POD=$(oc get pods -n {{ kepler_namespace }} -l app.kubernetes.io/name=kepler-exporter -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [ -z "$POD" ]; then + echo "NO_POD" + exit 0 + fi + RAPL_CHECK=$(oc exec -n {{ kepler_namespace }} $POD -- ls /sys/class/powercap/intel-rapl 2>/dev/null || echo "NOT_FOUND") + if [ "$RAPL_CHECK" = "NOT_FOUND" ] || [ -z "$RAPL_CHECK" ]; then + echo "NO_RAPL" + else + echo "RAPL_AVAILABLE" + fi + register: rapl_check_result + changed_when: false + failed_when: false + + - name: Set RAPL availability fact + ansible.builtin.set_fact: + kepler_rapl_available: "{{ rapl_check_result.stdout | trim == 'RAPL_AVAILABLE' }}" + + - name: Display bare metal power monitoring status + ansible.builtin.debug: + msg: | + KEPLER POWER MONITORING - BARE METAL + Intel RAPL detected: Real hardware power measurement available + Kepler is reading actual power consumption from CPU registers. 
+ when: kepler_rapl_available | bool + + - name: Display VM/virtualized environment warning + ansible.builtin.debug: + msg: | + KEPLER POWER MONITORING - VIRTUALIZED MODE + WARNING: Intel RAPL not detected - nodes appear to be VMs + Kepler is using fake-cpu-meter for simulated power values. + Power values shown are ESTIMATES based on CPU cycles. + when: not (kepler_rapl_available | bool) + +- name: Configure monitoring integration + include_tasks: monitoring.yml + when: kepler_state == "present" + +- name: Deploy Grafana and dashboards + include_tasks: grafana.yml + when: + - kepler_state == "present" + - grafana_enabled | bool + +- name: Remove Kepler deployment + when: kepler_state == "absent" + environment: + KUBECONFIG: "{{ kepler_kubeconfig }}" + HTTP_PROXY: "{{ proxy_http_proxy | default('') }}" + HTTPS_PROXY: "{{ proxy_https_proxy | default('') }}" + NO_PROXY: "{{ proxy_no_proxy | default('') }}" + block: + - name: Remove Kepler DaemonSet + ansible.builtin.shell: | + oc delete daemonset kepler-exporter -n {{ kepler_namespace }} --ignore-not-found=true + changed_when: true + ignore_errors: true + + - name: Remove Kepler Service + ansible.builtin.shell: | + oc delete service kepler-exporter -n {{ kepler_namespace }} --ignore-not-found=true + changed_when: true + ignore_errors: true + + - name: Remove Kepler SA from privileged SCC + ansible.builtin.shell: | + oc adm policy remove-scc-from-user privileged -z kepler-sa -n {{ kepler_namespace }} || true + ignore_errors: true + changed_when: true + + - name: Remove Kepler ClusterRoleBinding + ansible.builtin.shell: | + oc delete clusterrolebinding kepler-clusterrole-binding --ignore-not-found=true + changed_when: true + ignore_errors: true + + - name: Remove Kepler ClusterRole + ansible.builtin.shell: | + oc delete clusterrole kepler-clusterrole --ignore-not-found=true + changed_when: true + ignore_errors: true + + - name: Remove Kepler namespace + ansible.builtin.shell: | + oc delete namespace {{ kepler_namespace 
}} --ignore-not-found=true + changed_when: true + ignore_errors: true + + - name: Kepler removed + ansible.builtin.debug: + msg: "Kepler power monitoring has been removed from the cluster" diff --git a/deploy/openshift-clusters/roles/kepler/tasks/monitoring.yml b/deploy/openshift-clusters/roles/kepler/tasks/monitoring.yml new file mode 100644 index 00000000..ace83e16 --- /dev/null +++ b/deploy/openshift-clusters/roles/kepler/tasks/monitoring.yml @@ -0,0 +1,94 @@ +--- +# Configure OpenShift monitoring integration for Kepler +# +# IMPORTANT: ServiceMonitor must be in the KEPLER namespace, NOT openshift-monitoring. +# User workload monitoring only discovers ServiceMonitors in the same namespace +# as the workload. Placing it in openshift-monitoring will NOT work. +# +# Data flow: +# Kepler Service (kepler ns) <- ServiceMonitor (kepler ns) <- Prometheus (user-workload-monitoring) + +- name: Enable user workload monitoring + ansible.builtin.shell: | + cat </dev/null || echo "0") + if [ "$READY" -ge 1 ]; then + echo "READY" + exit 0 + fi + sleep 10 + done + echo "TIMEOUT" + exit 1 + environment: + KUBECONFIG: "{{ kepler_kubeconfig }}" + HTTP_PROXY: "{{ proxy_http_proxy | default('') }}" + HTTPS_PROXY: "{{ proxy_https_proxy | default('') }}" + NO_PROXY: "{{ proxy_no_proxy | default('') }}" + register: uwm_ready + changed_when: false + failed_when: "'TIMEOUT' in uwm_ready.stdout" + when: enable_user_workload_monitoring | bool + +- name: Create ServiceMonitor for Kepler metrics + ansible.builtin.shell: | + cat < Metrics + - Query: kepler_build_info + + To verify metrics are being scraped: + oc -n openshift-user-workload-monitoring exec -c prometheus prometheus-user-workload-0 -- \ + curl -s 'http://localhost:9090/api/v1/query?query=up{job="kepler-exporter"}' | jq diff --git a/deploy/openshift-clusters/roles/kepler/templates/kepler-daemonset.yaml.j2 b/deploy/openshift-clusters/roles/kepler/templates/kepler-daemonset.yaml.j2 new file mode 100644 index 00000000..1c12df5b --- 
/dev/null +++ b/deploy/openshift-clusters/roles/kepler/templates/kepler-daemonset.yaml.j2 @@ -0,0 +1,111 @@ +# Kepler DaemonSet - Power monitoring via eBPF +# +# Kepler uses eBPF to probe CPU performance counters and kernel tracepoints. +# This requires privileged access to the host: +# - hostNetwork: true -> Access host network stack +# - hostPID: true -> Access host process namespace +# - privileged: true -> Required for eBPF and RAPL access +# +# Volume mounts provide access to kernel interfaces: +# - /lib/modules -> Kernel modules for eBPF +# - /sys/kernel/tracing -> eBPF tracing interface +# - /sys/kernel/debug -> Kernel debug interface +# - /proc -> Process information +# - /usr/src/kernels -> Kernel headers +# +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kepler-exporter + namespace: {{ kepler_namespace }} + labels: + app.kubernetes.io/name: kepler-exporter +spec: + selector: + matchLabels: + app.kubernetes.io/name: kepler-exporter + template: + metadata: + labels: + app.kubernetes.io/name: kepler-exporter + spec: + serviceAccountName: kepler-sa + # Required for eBPF access to host + hostNetwork: true + hostPID: true + nodeSelector: + kubernetes.io/os: linux + # Tolerations to run on TNF control-plane nodes + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Exists + containers: + - name: kepler-exporter + image: {{ kepler_image }} + imagePullPolicy: Always + securityContext: + privileged: true + runAsUser: 0 + args: + - --config.file=/etc/kepler/kepler.yaml + - --web.listen-address=:{{ kepler_port }} + ports: + - containerPort: {{ kepler_port }} + name: http + protocol: TCP + envFrom: + - configMapRef: + name: kepler-cfm + resources: + requests: + cpu: 100m + memory: 400Mi + limits: + cpu: 500m + memory: 800Mi + volumeMounts: + - name: kepler-config + mountPath: /etc/kepler + readOnly: true + - name: lib-modules + mountPath: 
/lib/modules + readOnly: true + - name: tracing + mountPath: /sys/kernel/tracing + readOnly: true + - name: kernel-src + mountPath: /usr/src/kernels + readOnly: true + - name: proc + mountPath: /proc + - name: kernel-debug + mountPath: /sys/kernel/debug + readOnly: true + volumes: + - name: kepler-config + configMap: + name: kepler-config + - name: lib-modules + hostPath: + path: /lib/modules + type: Directory + - name: tracing + hostPath: + path: /sys/kernel/tracing + type: Directory + - name: kernel-src + hostPath: + path: /usr/src/kernels + type: Directory + - name: proc + hostPath: + path: /proc + type: Directory + - name: kernel-debug + hostPath: + path: /sys/kernel/debug + type: Directory diff --git a/deploy/openshift-clusters/roles/kepler/templates/tnf-power-dashboard-cm.yaml.j2 b/deploy/openshift-clusters/roles/kepler/templates/tnf-power-dashboard-cm.yaml.j2 new file mode 100644 index 00000000..f4d3f2ca --- /dev/null +++ b/deploy/openshift-clusters/roles/kepler/templates/tnf-power-dashboard-cm.yaml.j2 @@ -0,0 +1,560 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: tnf-power-dashboard + namespace: {{ grafana_namespace }} +data: + tnf-power.json: | + { + "annotations": { + "list": [ + { + "builtIn": 0, + "datasource": "Prometheus", + "enable": true, + "expr": "changes(up{job=\"kepler-exporter\"}[2m]) > 0", + "hide": false, + "iconColor": "red", + "name": "Node State Change", + "step": "60s" + }, + { + "builtIn": 0, + "datasource": "Prometheus", + "enable": true, + "expr": "up{job=\"kepler-exporter\"} == 0", + "hide": false, + "iconColor": "dark-red", + "name": "Node Down", + "step": "60s" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": 
"yellow", "value": 100},
+                {"color": "red", "value": 200}
+              ]
+            },
+            "unit": "watt"
+          }
+        },
+        "gridPos": {"h": 6, "w": 6, "x": 0, "y": 0},
+        "id": 1,
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "justifyMode": "auto",
+          "orientation": "auto",
+          "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
+          "textMode": "auto"
+        },
+        "targets": [
+          {
+            "expr": "sum(rate(kepler_node_cpu_joules_total[5m]))",
+            "legendFormat": "Total Power",
+            "refId": "A"
+          }
+        ],
+        "title": "Total Cluster Power",
+        "type": "stat"
+      },
+      {
+        "datasource": "Prometheus",
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "decimals": 2,
+            "mappings": [],
+            "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+            "unit": "watt"
+          }
+        },
+        "gridPos": {"h": 6, "w": 6, "x": 6, "y": 0},
+        "id": 2,
+        "options": {
+          "colorMode": "value",
+          "graphMode": "none",
+          "justifyMode": "auto",
+          "orientation": "horizontal",
+          "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
+          "textMode": "auto"
+        },
+        "targets": [
+          {
+            "expr": "label_replace(sum by (instance) (rate(kepler_node_cpu_joules_total[5m])), \"node\", \"$1\", \"instance\", \"(.*):[0-9]+\")",
+            "legendFormat": "{{ '{{node}}' }}",
+            "refId": "A"
+          }
+        ],
+        "title": "Power by Node",
+        "type": "stat"
+      },
+      {
+        "datasource": "Prometheus",
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "thresholds"},
+            "decimals": 2,
+            "mappings": [],
+            "thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]},
+            "unit": "watt"
+          }
+        },
+        "gridPos": {"h": 6, "w": 4, "x": 12, "y": 0},
+        "id": 3,
+        "options": {
+          "colorMode": "value",
+          "graphMode": "area",
+          "justifyMode": "auto",
+          "orientation": "auto",
+          "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
+          "textMode": "auto"
+        },
+        "targets": [
+          {
+            "expr": 
"sum(kepler_container_cpu_watts{container_name=~\".*apiserver.*|.*machine-config.*|master-.*|.*etcd.*\"})", + "legendFormat": "HA Overhead", + "refId": "A" + } + ], + "title": "HA Components Power", + "description": "Power consumed by etcd, MCO, and kube-apiserver (core HA components). On VMs without RAPL, values may be near-zero.", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "decimals": 2, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 10}, + {"color": "red", "value": 50} + ]}, + "unit": "currencyUSD" + } + }, + "gridPos": {"h": 6, "w": 4, "x": 16, "y": 0}, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "targets": [ + { + "expr": "sum(kepler_node_cpu_watts) / 1000 * 24 * 30 * $cost_per_kwh", + "legendFormat": "Monthly Cost", + "refId": "A" + } + ], + "title": "Est. Monthly Cost", + "description": "Estimated monthly electricity cost based on current power draw. 
Note: On VMs without RAPL, Kepler uses estimation which may show near-zero values.",
+        "type": "stat"
+      },
+      {
+        "datasource": "Prometheus",
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "mappings": [],
+            "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+            "unit": "short"
+          }
+        },
+        "gridPos": {"h": 6, "w": 4, "x": 20, "y": 0},
+        "id": 4,
+        "options": {
+          "colorMode": "value",
+          "graphMode": "none",
+          "justifyMode": "auto",
+          "orientation": "auto",
+          "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
+          "textMode": "auto"
+        },
+        "targets": [
+          {
+            "expr": "count(up{job=\"kepler-exporter\"} == 1)",
+            "legendFormat": "Nodes Up",
+            "refId": "A"
+          }
+        ],
+        "title": "Nodes Online",
+        "description": "Number of nodes currently being monitored (2 = healthy TNF, 1 = degraded mode)",
+        "type": "stat"
+      },
+      {
+        "datasource": "Prometheus",
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "axisBorderShow": false,
+              "axisCenteredZero": false,
+              "axisColorMode": "text",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 20,
+              "gradientMode": "none",
+              "lineInterpolation": "smooth",
+              "lineWidth": 2,
+              "pointSize": 5,
+              "showPoints": "auto",
+              "spanNulls": false,
+              "stacking": {"group": "A", "mode": "none"},
+              "thresholdsStyle": {"mode": "off"}
+            },
+            "decimals": 2,
+            "mappings": [],
+            "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+            "unit": "watt"
+          }
+        },
+        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 6},
+        "id": 5,
+        "options": {
+          "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true},
+          "tooltip": {"mode": "multi", "sort": "desc"}
+        },
+        "targets": [
+          {
+            "expr": "sum(rate(kepler_node_cpu_joules_total[5m]))",
+            "legendFormat": "Total Cluster",
+            "refId": "A"
+          },
+          {
+            "expr": "label_replace(sum by (instance) (rate(kepler_node_cpu_joules_total[5m])), \"node\", \"$1\", \"instance\", \"(.*):[0-9]+\")",
+            "legendFormat": "{{ '{{node}}' }}",
+            "refId": "B"
+          }
+        ],
+        "title": "Power Over Time",
+        "type": "timeseries"
+      },
+      {
+        "datasource": "Prometheus",
+        "fieldConfig": {
+          "defaults": {
+            "color": {"mode": "palette-classic"},
+            "custom": {
+              "axisBorderShow": false,
+              "axisCenteredZero": false,
+              "axisGridShow": true,
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "fillOpacity": 80,
+              "gradientMode": "none",
+              "lineWidth": 1,
+              "scaleDistribution": {"type": "linear"},
+              "thresholdsStyle": {"mode": "off"}
+            },
+            "decimals": 2,
+            "mappings": [],
+            "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+            "unit": "watt"
+          }
+        },
+        "gridPos": {"h": 10, "w": 12, "x": 0, "y": 14},
+        "id": 6,
+        "options": {
+          "displayMode": "gradient",
+          "minVizHeight": 10,
+          "minVizWidth": 0,
+          "orientation": "horizontal",
+          "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
+          "showUnfilled": true,
+          "valueMode": "color"
+        },
+        "targets": [
+          {
+            "expr": "label_replace(label_replace(topk(10, sum by (container_name) (rate(kepler_container_cpu_joules_total{container_name!~\".*apiserver.*|.*machine-config.*|master-.*|.*etcd.*|.*controller-manager.*\"}[5m]))), \"short_name\", \"$1\", \"container_name\", \"(.*)\"), \"short_name\", \"$1\", \"container_name\", \"(.+)-[a-z0-9]+-[a-z0-9]+$\")",
+            "legendFormat": "{{ '{{short_name}}' }}",
+            "refId": "A",
+            "instant": true
+          }
+        ],
+        "title": "Top 10 Workloads by Power",
+        "description": "Shows the top 10 non-control-plane workloads by power consumption. 
Excludes HA components (apiserver, etcd, MCO, controller-manager) which are shown in the Control Plane panel.", + "type": "bargauge" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "lineWidth": 1, + "scaleDistribution": {"type": "linear"}, + "thresholdsStyle": {"mode": "off"} + }, + "decimals": 2, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "watt" + } + }, + "gridPos": {"h": 10, "w": 12, "x": 12, "y": 14}, + "id": 7, + "options": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "showUnfilled": true, + "valueMode": "color" + }, + "targets": [ + { + "expr": "label_replace(label_replace(topk(10, sum by (container_name) (rate(kepler_container_cpu_joules_total{container_name=~\".*apiserver.*|.*machine-config.*|master-.*|.*etcd.*|.*controller-manager.*\"}[5m])) * 60), \"short_name\", \"$1\", \"container_name\", \"(.*)\"), \"short_name\", \"$1\", \"container_name\", \"(.+)-[a-z0-9]+-[a-z0-9]+$\")", + "legendFormat": "{{ '{{short_name}}' }}", + "refId": "A", + "instant": true + } + ], + "title": "TNF Control Plane Power Breakdown", + "description": "Power consumption of TNF high-availability components: kube-apiserver, openshift-apiserver, etcd, machine-config-operator, and controller-manager. 
These are critical for cluster HA operation.", + "type": "bargauge" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "decimals": 1, + "mappings": [ + {"options": {"from": -5, "to": 5, "result": {"color": "green", "text": "Balanced"}}, "type": "range"}, + {"options": {"from": 5, "to": 20, "result": {"color": "yellow", "text": "Slight Imbalance"}}, "type": "range"}, + {"options": {"from": -20, "to": -5, "result": {"color": "yellow", "text": "Slight Imbalance"}}, "type": "range"}, + {"options": {"from": 20, "to": 100, "result": {"color": "red", "text": "Imbalanced"}}, "type": "range"}, + {"options": {"from": -100, "to": -20, "result": {"color": "red", "text": "Imbalanced"}}, "type": "range"} + ], + "thresholds": {"mode": "absolute", "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 5}, + {"color": "red", "value": 20} + ]}, + "unit": "percent" + } + }, + "gridPos": {"h": 6, "w": 8, "x": 0, "y": 24}, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "((max(sum by (instance) (rate(kepler_node_cpu_joules_total[5m]))) - min(sum by (instance) (rate(kepler_node_cpu_joules_total[5m])))) / (avg(sum by (instance) (rate(kepler_node_cpu_joules_total[5m]))) + 0.0001)) * 100", + "legendFormat": "Node Imbalance", + "refId": "A" + } + ], + "title": "Node Power Balance", + "description": "Power imbalance between nodes. 0% = perfectly balanced. High values indicate one node is doing significantly more work. 
Important for TNF load distribution.", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "decimals": 2, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 0.5}, + {"color": "red", "value": 1} + ]}, + "unit": "watt" + } + }, + "gridPos": {"h": 6, "w": 8, "x": 8, "y": 24}, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "targets": [ + { + "expr": "sum(rate(kepler_container_cpu_joules_total{container_name=~\".*etcd.*\"}[5m])) * 60", + "legendFormat": "etcd Power", + "refId": "A" + } + ], + "title": "etcd Power", + "description": "Power consumed by etcd - the critical distributed key-value store for TNF cluster state. High values may indicate heavy cluster activity or potential issues.", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "decimals": 1, + "mappings": [ + {"options": {"from": -100, "to": -5, "result": {"color": "green", "text": "Decreasing"}}, "type": "range"}, + {"options": {"from": -5, "to": 5, "result": {"color": "blue", "text": "Stable"}}, "type": "range"}, + {"options": {"from": 5, "to": 100, "result": {"color": "orange", "text": "Increasing"}}, "type": "range"} + ], + "thresholds": {"mode": "absolute", "steps": [ + {"color": "green", "value": null}, + {"color": "blue", "value": -5}, + {"color": "orange", "value": 5} + ]}, + "unit": "percent" + } + }, + "gridPos": {"h": 6, "w": 8, "x": 16, "y": 24}, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "value_and_name" + }, + 
"targets": [ + { + "expr": "((sum(rate(kepler_node_cpu_joules_total[5m])) - sum(rate(kepler_node_cpu_joules_total[30m]))) / (sum(rate(kepler_node_cpu_joules_total[30m])) + 0.0001)) * 100", + "legendFormat": "Trend", + "refId": "A" + } + ], + "title": "Power Trend (1h)", + "description": "Power consumption trend comparing recent 5 minutes vs last 30 minutes. Positive = increasing power usage, Negative = decreasing.", + "type": "stat" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Power (W)", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "decimals": 2, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "watt" + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "CPU Usage"}, + "properties": [ + {"id": "custom.axisPlacement", "value": "right"}, + {"id": "custom.axisLabel", "value": "CPU (cores)"}, + {"id": "unit", "value": "short"}, + {"id": "color", "value": {"fixedColor": "orange", "mode": "fixed"}} + ] + } + ] + }, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 30}, + "id": 13, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "expr": "sum(rate(kepler_node_cpu_joules_total[5m])) * 60", + "legendFormat": "Power", + "refId": "A" + }, + { + "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m]))", + "legendFormat": "CPU Usage", + "refId": "B" + } + ], + "title": "CPU vs Power Correlation", + 
"description": "Shows the relationship between CPU usage and power consumption. Helps understand power efficiency - ideally they should correlate closely.", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["kepler", "power", "tnf", "two-node"], + "templating": { + "list": [ + { + "current": {"selected": false, "text": "0.12", "value": "0.12"}, + "description": "Electricity cost per kWh in your currency (e.g., USD, EUR)", + "hide": 0, + "label": "Cost per kWh ($)", + "name": "cost_per_kwh", + "options": [ + {"selected": false, "text": "0.08", "value": "0.08"}, + {"selected": true, "text": "0.12", "value": "0.12"}, + {"selected": false, "text": "0.15", "value": "0.15"}, + {"selected": false, "text": "0.20", "value": "0.20"}, + {"selected": false, "text": "0.25", "value": "0.25"}, + {"selected": false, "text": "0.30", "value": "0.30"} + ], + "query": "0.08,0.12,0.15,0.20,0.25,0.30", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": {"from": "now-1h", "to": "now"}, + "timepicker": {}, + "timezone": "browser", + "title": "TNF Power Monitoring", + "uid": "tnf-power-monitoring", + "version": 1 + } diff --git a/deploy/openshift-clusters/scripts/deploy-kepler.sh b/deploy/openshift-clusters/scripts/deploy-kepler.sh new file mode 100755 index 00000000..1da27eab --- /dev/null +++ b/deploy/openshift-clusters/scripts/deploy-kepler.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Get the directory where this script is located +SCRIPT_DIR=$(dirname "$0") +# Get the deploy directory (two levels up from scripts) +DEPLOY_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +set -o nounset +set -o errexit +set -o pipefail + +# Check if inventory.ini exists in the openshift-clusters directory +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/inventory.ini" ]]; then + echo "Error: inventory.ini not found in ${DEPLOY_DIR}/openshift-clusters/" + echo "Please ensure the inventory file is properly configured." 
+ exit 1 +fi + +# Parse optional parameters +GRAFANA_ENABLED="${1:-true}" + +echo "Deploying Kepler power monitoring on TNF cluster..." +echo "" +echo "Options:" +echo " Grafana dashboard: ${GRAFANA_ENABLED}" +echo "" + +# Navigate to the openshift-clusters directory +cd "${DEPLOY_DIR}/openshift-clusters" + +# Run the Kepler deployment playbook +if ansible-playbook kepler.yml -i inventory.ini -e "grafana_enabled=${GRAFANA_ENABLED}"; then + echo "" + echo "==============================================" + echo "Kepler power monitoring deployed successfully!" + echo "==============================================" + echo "" + echo "Access Grafana dashboard:" + echo " 1. Set up port-forward:" + echo " source proxy.env" + echo " oc port-forward -n grafana svc/grafana 3000:3000" + echo "" + echo " 2. Open in browser: http://localhost:3000" + echo " Dashboard: TNF Power Monitoring" + echo "" + echo "Query metrics via CLI:" + echo " oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \\" + echo " curl -s 'http://localhost:9090/api/v1/query?query=sum(kepler_node_cpu_watts)'" + echo "" + echo "Remove Kepler:" + echo " make remove-kepler" + echo "" +else + echo "" + echo "Error: Kepler deployment failed!" + echo "" + echo "Troubleshooting:" + echo " 1. Check cluster access: oc get nodes" + echo " 2. Check KUBECONFIG is set or proxy.env is sourced" + echo " 3. View Kepler pods: oc get pods -n kepler" + echo "" + exit 1 +fi diff --git a/deploy/openshift-clusters/scripts/remove-kepler.sh b/deploy/openshift-clusters/scripts/remove-kepler.sh new file mode 100755 index 00000000..f409e55a --- /dev/null +++ b/deploy/openshift-clusters/scripts/remove-kepler.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Get the directory where this script is located +SCRIPT_DIR=$(dirname "$0") +# Get the deploy directory (two levels up from scripts) +DEPLOY_DIR="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" + +set -o nounset +set -o errexit +set -o pipefail + +# Check if inventory.ini exists in the openshift-clusters directory +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/inventory.ini" ]]; then + echo "Error: inventory.ini not found in ${DEPLOY_DIR}/openshift-clusters/" + echo "Please ensure the inventory file is properly configured." + exit 1 +fi + +echo "Removing Kepler power monitoring from TNF cluster..." +echo "" + +# Navigate to the openshift-clusters directory +cd "${DEPLOY_DIR}/openshift-clusters" + +# Run the Kepler removal playbook +if ansible-playbook kepler.yml -i inventory.ini -e "kepler_state=absent"; then + echo "" + echo "==========================================" + echo "Kepler power monitoring removed successfully!" + echo "==========================================" + echo "" + echo "The following resources have been cleaned up:" + echo " - Kepler DaemonSet and Service" + echo " - Kepler namespace" + echo " - Grafana deployment and namespace" + echo " - ServiceMonitor and RBAC resources" + echo "" + echo "To redeploy Kepler:" + echo " make deploy-kepler" + echo "" +else + echo "" + echo "Error: Kepler removal failed!" + echo "" + echo "Troubleshooting:" + echo " 1. Check cluster access: oc get nodes" + echo " 2. Manually check namespaces: oc get ns | grep -E 'kepler|grafana'" + echo " 3. Force delete if stuck: oc delete ns kepler grafana --force" + echo "" + exit 1 +fi diff --git a/docs/kepler/KEPLER-ARCHITECTURE.md b/docs/kepler/KEPLER-ARCHITECTURE.md new file mode 100644 index 00000000..0f88885c --- /dev/null +++ b/docs/kepler/KEPLER-ARCHITECTURE.md @@ -0,0 +1,319 @@ +# Kepler Architecture & File Guide + +This document explains each Kepler file, what it does, and how they work together. 
+ +--- + +## Architecture Overview + +``` +User runs: make deploy-kepler + │ + ▼ + deploy-kepler.sh (wrapper script) + │ + ▼ + kepler.yml (main playbook) + │ + ▼ + ┌───────────┴───────────┐ + │ roles/kepler/ │ + │ │ + │ defaults/main.yml │ ← Variables + │ │ │ + │ ▼ │ + │ tasks/main.yml │ ← Deploy Kepler DaemonSet + │ │ │ + │ ▼ │ + │ tasks/monitoring.yml │ ← Create ServiceMonitor + │ │ │ + │ ▼ │ + │ tasks/grafana.yml │ ← Deploy Grafana + Dashboard + │ │ + │ templates/*.j2 │ ← Kubernetes manifests + └───────────────────────┘ + │ + ▼ + TNF Cluster with Kepler running +``` + +--- + +## 1. Entry Points + +### `scripts/deploy-kepler.sh` + +**Purpose:** Wrapper script that users call to deploy Kepler. + +**Flow:** +1. Check inventory.ini exists +2. Run: `ansible-playbook kepler.yml -i inventory.ini` +3. Print success message with Grafana access instructions + +**Usage:** `make deploy-kepler` or `./scripts/deploy-kepler.sh` + +--- + +### `scripts/remove-kepler.sh` + +**Purpose:** Wrapper script to remove Kepler from cluster. + +**Flow:** Runs `ansible-playbook kepler.yml -i inventory.ini -e kepler_state=absent` + +--- + +### `kepler.yml` + +**Purpose:** Main Ansible playbook - orchestrates the entire deployment. + +**Flow:** +1. **PRE-TASKS:** Setup cluster access + - Check if proxy.env exists + - Extract KUBECONFIG from proxy.env + - Verify cluster access with: `oc get namespace default` + +2. **ROLES:** Call the kepler role + - Triggers `roles/kepler/tasks/main.yml` + +3. **POST-TASKS:** Print summary with useful commands + +**Key variables passed to role:** +- `kepler_state`: "present" (deploy) or "absent" (remove) +- `grafana_enabled`: true/false +- `proxy_kubeconfig`: Path to kubeconfig + +--- + +## 2. Ansible Role + +### `roles/kepler/defaults/main.yml` + +**Purpose:** Defines default configuration values. 
+ +| Variable | Default | Description | +|----------|---------|-------------| +| `kepler_namespace` | `kepler` | Namespace for Kepler components | +| `kepler_image` | `quay.io/.../kepler:v0.11.3` | Kepler container image | +| `kepler_port` | `9188` | Metrics endpoint port | +| `grafana_enabled` | `true` | Deploy Grafana dashboard | +| `grafana_namespace` | `grafana` | Namespace for Grafana | +| `grafana_image` | `grafana/grafana:10.4.1` | Grafana container image | +| `kepler_scrape_interval` | `30s` | Prometheus scrape interval | +| `kepler_state` | `present` | Deploy or remove (`absent`) | +| `operator_ready_retries` | `30` | Timeout retries (30 × 10s = 5min) | + +**Override:** Pass `-e variable=value` to ansible-playbook + +--- + +### `roles/kepler/tasks/main.yml` + +**Purpose:** Deploys Kepler exporter DaemonSet on all nodes. + +**Flow when `kepler_state == "present"`:** + +1. Set KUBECONFIG environment +2. Create namespace `kepler` +3. Create ServiceAccount `kepler-sa` +4. Create ClusterRole (get/list/watch: nodes, pods, namespaces) +5. Create ClusterRoleBinding +6. Add kepler-sa to privileged SCC (required for eBPF) +7. Create ConfigMap `kepler-cfm` (environment settings) +8. Create ConfigMap `kepler-config` (kepler.yaml with fake-cpu-meter for VMs) +9. Generate DaemonSet from template (`kepler-daemonset.yaml.j2`) +10. Create Kepler Service (port 9188) +11. Wait for DaemonSet ready (up to 5 min) +12. Check RAPL availability (`/sys/class/powercap/intel-rapl/`) +13. Display mode: REAL (RAPL) or ESTIMATED (VM) +14. Include `monitoring.yml` +15. Include `grafana.yml` (if enabled) + +**Flow when `kepler_state == "absent"`:** +- Delete all resources in reverse order + +--- + +### `roles/kepler/tasks/monitoring.yml` + +**Purpose:** Configures OpenShift to scrape Kepler metrics. + +**Flow:** + +1. 
**Enable user workload monitoring** + - Creates ConfigMap `cluster-monitoring-config` in `openshift-monitoring` + - Sets `enableUserWorkload: true` + - This starts `prometheus-user-workload` pods + +2. **Wait for monitoring ready** + - Checks prometheus-operator deployment + +3. **Create ServiceMonitor in kepler namespace** + ```yaml + ServiceMonitor: kepler + namespace: kepler # IMPORTANT - must match workload namespace + spec: + endpoints: + - port: http + interval: 30s + selector: + matchLabels: + app.kubernetes.io/name: kepler-exporter + ``` + +**Why namespace matters:** ServiceMonitor must be in `kepler` namespace (same as workload) for user-workload-monitoring to discover it. NOT in `openshift-monitoring`. + +--- + +### `roles/kepler/tasks/grafana.yml` + +**Purpose:** Deploys Grafana with pre-configured TNF power dashboard. + +**Flow:** + +1. Create namespace `grafana` +2. Create ServiceAccount `grafana` +3. Create ClusterRoleBinding to `cluster-monitoring-view` role +4. Create ServiceAccount token Secret +5. Wait for token, decode it → `grafana_sa_token` +6. Create datasource ConfigMap (Thanos Querier with Bearer auth) +7. Create dashboard provisioning ConfigMap +8. Generate dashboard from template (`tnf-power-dashboard-cm.yaml.j2`) +9. Deploy Grafana Deployment: + - Anonymous auth enabled (Admin role) + - Mounts datasources and dashboards +10. Create Service (port 3000) +11. Create Route (HTTPS edge termination) +12. Wait for Grafana ready +13. Print access URL and RAPL mode + +--- + +## 3. Templates + +### `templates/kepler-daemonset.yaml.j2` + +**Purpose:** Kubernetes DaemonSet manifest for Kepler. 
+ +**Key configuration:** + +| Setting | Value | Why | +|---------|-------|-----| +| `privileged: true` | Required | eBPF access | +| `hostNetwork: true` | Required | Host network access | +| `hostPID: true` | Required | Host process access | +| `runAsUser: 0` | Required | Root for kernel access | + +**Tolerations:** Runs on control-plane nodes (master/control-plane) + +**Volume mounts:** +- `/lib/modules` - Kernel modules +- `/sys/kernel/tracing` - eBPF tracing +- `/sys/kernel/debug` - Kernel debug +- `/usr/src/kernels` - Kernel source +- `/proc` - Process info + +--- + +### `templates/tnf-power-dashboard-cm.yaml.j2` + +**Purpose:** Grafana dashboard JSON as ConfigMap. + +**Dashboard panels (12 total):** + +| Row | Panels | +|-----|--------| +| 1 | Total Cluster Power, Power by Node, HA Components, Est. Monthly Cost, Nodes Online | +| 2 | Power Over Time (full-width graph) | +| 3 | Top 10 Workloads, TNF Control Plane Breakdown | +| 4 | Node Power Balance, etcd Power, Power Trend (1h) | +| 5 | CPU vs Power Correlation | + +**Annotations:** +- Red line: Node down event +- Node state change markers + +**Variables:** +- `cost_per_kwh`: Dropdown for electricity cost + +**Key PromQL queries:** + +| Metric | Query | +|--------|-------| +| Total Power | `sum(rate(kepler_node_cpu_joules_total[5m])) * 60` | +| Power by Node | `sum by (instance) (rate(kepler_node_cpu_joules_total[5m])) * 60` | +| Node Balance | `((max(...) - min(...)) / avg(...)) * 100` | +| etcd Power | `sum(rate(kepler_container_cpu_joules_total{container_name=~".*etcd.*"}[5m])) * 60` | + +--- + +## 4. Claude Skill + +### `.claude/commands/tnf-power.md` + +**Purpose:** Quick power reports via `/tnf-power` command. + +**Flow:** +1. Check KUBECONFIG / source proxy.env +2. Verify Kepler pods running +3. Check RAPL availability (REAL vs ESTIMATED) +4. Query Prometheus for metrics +5. Format and display power report +6. 
Show Grafana access instructions + +--- + +## Data Flow + +``` +┌─────────────────────────────────────────────────────────────┐ +│ TNF Cluster │ +│ │ +│ master-0 master-1 │ +│ ┌──────────┐ ┌──────────┐ │ +│ │ Kepler │ Reads CPU counters │ Kepler │ │ +│ │ Pod │ via eBPF (or RAPL │ Pod │ │ +│ │ :9188 │ on bare metal) │ :9188 │ │ +│ └────┬─────┘ └────┬─────┘ │ +│ │ │ │ +│ └──────────────┬───────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────┐ │ +│ │ ServiceMonitor │ │ +│ │ (kepler namespace) │ │ +│ └───────────┬────────────┘ │ +│ │ "Scrape :9188 every 30s" │ +│ ▼ │ +│ ┌────────────────────────┐ │ +│ │ Prometheus │ │ +│ │ (user-workload- │ │ +│ │ monitoring) │ │ +│ └───────────┬────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────┐ │ +│ │ Thanos Querier │ │ +│ │ (openshift-monitoring) │ │ +│ └───────────┬────────────┘ │ +│ │ Bearer token auth │ +│ ▼ │ +│ ┌────────────────────────┐ │ +│ │ Grafana │ │ +│ │ "TNF Power Monitoring" │ │ +│ └────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +--- + +## Power Measurement Modes + +| Mode | Environment | Source | Accuracy | +|------|-------------|--------|----------| +| **REAL** | Bare metal | Intel RAPL registers | High (actual watts) | +| **ESTIMATED** | VMs | fake-cpu-meter | Low (proportional to CPU activity) | + +The deployment automatically detects which mode is available by checking `/sys/class/powercap/intel-rapl/`. + +**Note:** For accurate power readings, deploy on bare metal with Intel CPUs (RAPL support) or servers with Redfish-enabled BMC (Kepler v0.11.0+). 
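
The monthly-cost arithmetic used by the dashboard's "Est. Monthly Cost" panel can be sanity-checked offline. A minimal sketch, assuming a hypothetical 85 W average cluster draw and the dashboard's default rate of $0.12/kWh (both figures are placeholders, not measurements):

```shell
# Hypothetical inputs: average cluster draw in watts and electricity price.
WATTS=85
COST_PER_KWH=0.12

# Same formula as the "Est. Monthly Cost" panel:
# kWh/month = watts / 1000 * 24 hours * 30 days, then multiply by the price.
MONTHLY_KWH=$(awk -v w="$WATTS" 'BEGIN { printf "%.1f", w / 1000 * 24 * 30 }')
MONTHLY_COST=$(awk -v kwh="$MONTHLY_KWH" -v p="$COST_PER_KWH" 'BEGIN { printf "%.2f", kwh * p }')

echo "${MONTHLY_KWH} kWh/month -> \$${MONTHLY_COST} estimated"
```

Substituting the live value of `sum(kepler_node_cpu_watts)` for `WATTS` gives a real estimate on bare metal; on VMs without RAPL the watts gauge is itself estimated, so the resulting cost is only indicative.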
diff --git a/docs/kepler/KEPLER-PRESENTATION.md b/docs/kepler/KEPLER-PRESENTATION.md
new file mode 100644
index 00000000..987efb77
--- /dev/null
+++ b/docs/kepler/KEPLER-PRESENTATION.md
@@ -0,0 +1,362 @@
+# Kepler Power Monitoring for TNF Clusters
+
+## Presentation Guide
+
+This document provides a comprehensive overview of Kepler power monitoring integration with TNF (Two Nodes with Fencing) OpenShift clusters.
+
+---
+
+## 1. Introduction
+
+### What is Kepler?
+
+**Kepler** (Kubernetes-based Efficient Power Level Exporter) is an open-source project that uses eBPF to probe CPU performance counters and Linux kernel tracepoints to calculate per-workload energy consumption.
+
+- **Project**: https://github.com/sustainable-computing-io/kepler
+- **Maintained by**: Sustainable Computing IO (Red Hat contributors)
+- **Purpose**: Expose power consumption metrics as Prometheus metrics
+
+### Why Power Monitoring for TNF?
+
+TNF clusters run high-availability components that consume resources:
+
+| Component | Purpose | Power Impact |
+|-----------|---------|--------------|
+| **etcd** | Distributed key-value store | Continuous disk I/O and network sync |
+| **Pacemaker/Corosync** | Cluster resource manager | Heartbeat monitoring, quorum checks |
+| **MCO** | Machine Config Operator | Node configuration management |
+| **kube-apiserver** | Kubernetes API | Request processing |
+
+Understanding power consumption helps with:
+- **Cost estimation**: Calculate electricity costs for running TNF clusters
+- **Sustainability reporting**: Track carbon footprint
+- **Capacity planning**: Understand resource overhead of HA components
+- **Optimization**: Identify power-hungry workloads
+
+---
+
+## 2. 
Architecture + +### Component Overview + +``` +┌──────────────────────────────────────────────────────┐ +│ TNF Cluster │ +│ │ +│ ┌─────────────┐ ┌─────────────┐ │ +│ │ master-0 │ │ master-1 │ │ +│ │ │ │ │ │ +│ │ ┌─────────┐ │ │ ┌─────────┐ │ │ +│ │ │ Kepler │ │ │ │ Kepler │ │ │ +│ │ │DaemonSet│ │ │ │DaemonSet│ │ │ +│ │ └────┬────┘ │ │ └────┬────┘ │ │ +│ │ │:9188 │ │ │:9188 │ │ +│ └──────┼──────┘ └──────┼──────┘ │ +│ │ │ │ +│ └──────────────┬───────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────┐ │ +│ │ ServiceMonitor │ │ +│ │ (kepler namespace) │ │ +│ └────────────┬─────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────┐ │ +│ │ Prometheus │ │ +│ │(user-workload-monitoring)│ │ +│ └────────────┬─────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────┐ │ +│ │ Thanos Querier │ │ +│ └────────────┬─────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────┐ │ +│ │ Grafana │ │ +│ │ (TNF Power Dashboard) │ │ +│ └──────────────────────────┘ │ +│ │ +└──────────────────────────────────────────────────────┘ +``` + +### Data Flow + +1. **Kepler DaemonSet** runs on each node, collects power metrics using eBPF +2. **ServiceMonitor** tells Prometheus where to scrape metrics +3. **Prometheus** (user-workload-monitoring) scrapes metrics every 30 seconds +4. **Thanos Querier** federates data from all Prometheus instances +5. **Grafana** queries Thanos and displays dashboards + +### Namespaces + +| Namespace | Components | +|-----------|------------| +| `kepler` | Kepler DaemonSet, Service, ServiceMonitor | +| `grafana` | Grafana Deployment, Service, Route | +| `openshift-user-workload-monitoring` | Prometheus for user workloads | +| `openshift-monitoring` | Thanos Querier, platform monitoring | + +--- + +## 3. 
Power Measurement Modes + +### Bare Metal (RAPL) + +On physical servers with Intel/AMD CPUs, Kepler reads **RAPL** (Running Average Power Limit) registers: + +- **Source**: Hardware MSR (Model-Specific Registers) +- **Accuracy**: High - actual power consumption in watts +- **Metrics**: CPU package power, DRAM power, core power + +```bash +# Check if RAPL is available +ls /sys/class/powercap/intel-rapl/ +``` + +### Virtual Machines (Estimation) + +On VMs, RAPL is not accessible. Kepler uses **fake-cpu-meter** mode: + +- **Source**: CPU utilization via fake-cpu-meter +- **Accuracy**: Estimated - based on CPU activity patterns +- **Metrics**: Estimated power values (may show near-zero on idle VMs) + +The TNF Ansible role automatically detects the environment and displays: +- "REAL (bare metal RAPL)" - when RAPL is available +- "ESTIMATED (VM/virtualized - no RAPL)" - when running on VMs + +--- + +## 4. Deployment + +### Deployment Approach: Direct vs OLM + +We chose **direct deployment** over **OLM (Operator Lifecycle Manager)** for this integration. 
+ +| Approach | How it Works | +|----------|--------------| +| **OLM** | Install operator from OperatorHub → Operator deploys workloads via Custom Resources | +| **Direct** | Ansible applies DaemonSet, Deployment, ConfigMaps directly via `oc apply` | + +**Why Direct Deployment?** + +| Consideration | OLM | Direct (chosen) | +|---------------|-----|-----------------| +| **Simplicity** | Requires Subscription, Operator, CR | Single playbook, no dependencies | +| **Offline/Air-gapped** | Needs OperatorHub catalog mirroring | Works with any registry access | +| **Control** | Operator manages resources | Full control over manifests | +| **Debugging** | Operator abstracts deployment | Direct visibility into all resources | +| **Dev/Test focus** | Suited for production lifecycle | Suited for dev/test environments | + +For TNF development and testing environments, direct deployment provides: +- Faster iteration and debugging +- No external catalog dependencies +- Consistent behavior across environments +- Easier customization of Kepler and Grafana settings + +### Prerequisites + +- TNF OpenShift cluster running (4.20+) +- `oc` CLI access to the cluster +- Ansible installed locally + +### One-Command Deployment + +From the `deploy/` directory: + +```bash +# Deploy Kepler + Grafana +make deploy-kepler + +# Remove Kepler +make remove-kepler +``` + +### Manual Deployment + +```bash +cd deploy/openshift-clusters +source proxy.env +ansible-playbook kepler.yml +``` + +### What Gets Deployed + +1. **Kepler namespace** with: + - ServiceAccount with privileged SCC + - ClusterRole/ClusterRoleBinding for node/pod access + - ConfigMap with Kepler settings (fake-cpu-meter enabled for VMs) + - DaemonSet running Kepler on all nodes + - Service exposing port 9188 + - ServiceMonitor for Prometheus scraping + +2. 
**Grafana namespace** with:
+   - ServiceAccount with cluster-monitoring-view permissions
+   - Datasource ConfigMap (Thanos Querier with Bearer auth)
+   - Dashboard ConfigMap (TNF Power Monitoring)
+   - Deployment, Service, Route
+
+---
+
+## 5. Grafana Dashboard
+
+### Accessing Grafana
+
+**Via Port-Forward** (recommended for external access):
+
+```bash
+ssh -L 3000:localhost:3002 ec2-user@ \
+  "export KUBECONFIG=~/openshift-metal3/dev-scripts/ocp/ostest/auth/kubeconfig && \
+   oc port-forward -n grafana svc/grafana 3002:3000"
+```
+
+Then open: http://localhost:3000
+
+### Dashboard Panels
+
+| Panel | Description | Query |
+|-------|-------------|-------|
+| **Total Cluster Power** | Sum of all nodes' CPU power | `sum(rate(kepler_node_cpu_joules_total[5m])) * 60` |
+| **Power by Node** | Power consumption per node | `sum by (instance) (rate(kepler_node_cpu_joules_total[5m])) * 60` |
+| **HA Components Power** | Power used by etcd, MCO, apiserver | `sum(kepler_container_cpu_watts{container_name=~".*apiserver.*\|.*etcd.*"})` |
+| **Est. Monthly Cost** | Estimated electricity cost | `sum(kepler_node_cpu_watts) / 1000 * 24 * 30 * $cost_per_kwh` |
+| **Nodes Online** | Number of healthy nodes | `count(up{job="kepler-exporter"} == 1)` |
+| **Power Over Time** | Time series of power consumption | Line graph with cluster and per-node data |
+| **Top 10 Workloads** | Highest power-consuming workloads | `topk(10, sum by (container_name) (...))` |
+| **TNF Control Plane** | Breakdown of control plane components | Bar chart of apiserver, etcd, MCO, etc. |
+
+### Dashboard Features
+
+- **Cost variable**: Dropdown to set electricity cost ($/kWh)
+- **Fencing annotations**: Red markers when node state changes (fence events)
+- **Auto-refresh**: Updates every 30 seconds
+
+---
+
+## 6. 
Useful Commands + +### Verify Deployment + +```bash +# Check Kepler pods +oc get pods -n kepler + +# Check Grafana +oc get pods -n grafana + +# Check metrics are being scraped +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=kepler_build_info' | jq +``` + +### Query Metrics Directly + +```bash +# Total power +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=sum(kepler_node_cpu_watts)' | jq + +# Power by node +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=sum%20by%20(instance)%20(kepler_node_cpu_watts)' | jq +``` + +### Check RAPL Availability + +```bash +# From a Kepler pod +POD=$(oc get pods -n kepler -l app.kubernetes.io/name=kepler-exporter -o jsonpath='{.items[0].metadata.name}') +oc exec -n kepler $POD -- ls /sys/class/powercap/ 2>/dev/null || echo "No RAPL (VM mode)" +``` + +--- + +## 7. Limitations and Considerations + +### VM Limitations + +| Aspect | Bare Metal | VM | +|--------|------------|-----| +| Power Source | RAPL hardware | Estimation model | +| Accuracy | High (real watts) | Low (estimated) | +| Values | Meaningful (10-200W) | Near-zero (0.001W) | +| Use Case | Production monitoring | Development/testing | + +### Known Limitations + +1. **VM power values are simulated**: On VMs without RAPL, Kepler uses fake-cpu-meter which shows very low values proportional to CPU activity +2. **Node names hardcoded**: Dashboard uses `label_replace` with IPs (192.168.111.20/21) - adjust for different clusters +3. **No namespace breakdown**: Kepler's `container_name` label contains pod names, not namespace info +4. 
**Dashboard optimized for 2-node**: TNF-specific layout assumes master-0 and master-1 + +### Future Improvements + +- Dynamic node name detection +- Alerts for power anomalies +- Historical cost tracking +- Redfish integration for BMC-based power readings + +--- + +## 8. File Structure + +``` +deploy/openshift-clusters/ +├── kepler.yml # Main playbook +├── roles/kepler/ +│ ├── defaults/main.yml # Default variables (image, ports) +│ ├── tasks/ +│ │ ├── main.yml # Kepler deployment +│ │ ├── monitoring.yml # ServiceMonitor setup +│ │ └── grafana.yml # Grafana deployment +│ └── templates/ +│ ├── kepler-daemonset.yaml.j2 # DaemonSet template +│ └── tnf-power-dashboard-cm.yaml.j2 # Dashboard JSON +└── Makefile # deploy-kepler / remove-kepler targets +``` + +--- + +## 9. Demo Script + +### Preparation (before presentation) + +1. Ensure TNF cluster is running +2. Deploy Kepler: `make deploy-kepler` +3. Set up port-forward in a terminal +4. Open Grafana in browser, navigate to dashboard +5. Have backup screenshots ready + +### Demo Flow + +1. **Show the cluster** + ```bash + oc get nodes + oc get pods -n kepler + ``` + +2. **Explain the architecture** (use diagram above) + +3. **Show Grafana dashboard** + - Total power consumption + - Per-node breakdown + - HA components overhead + - Cost estimation + +4. **Show deployment simplicity** + ```bash + make deploy-kepler # one command + ``` + +5. **Q&A** + +--- + +## 10. 
References + +- Kepler Project: https://github.com/sustainable-computing-io/kepler +- Kepler Documentation: https://sustainable-computing.io/ +- Red Hat Kepler Blog: https://www.redhat.com/en/blog/introducing-developer-preview-of-kepler-power-monitoring-for-red-hat-openshift +- OpenShift Power Monitoring: https://docs.redhat.com/en/documentation/openshift_container_platform/4.14/html-single/power_monitoring/index diff --git a/docs/kepler/README.md b/docs/kepler/README.md new file mode 100644 index 00000000..41cf12ee --- /dev/null +++ b/docs/kepler/README.md @@ -0,0 +1,361 @@ +# Kepler Power Monitoring for TNF Clusters + +This guide explains how to deploy Kepler power monitoring on TNF (Two Nodes with Fencing) clusters. + +## Overview + +[Kepler](https://sustainable-computing.io/) (Kubernetes-based Efficient Power Level Exporter) is a CNCF Sandbox project that uses eBPF to measure power consumption at container, pod, and node levels. + +### Why TNF + Kepler? + +| TNF Characteristic | Kepler Relevance | +|--------------------|------------------| +| Edge deployments | Power is often limited/expensive at the edge | +| Pacemaker/etcd overhead | Measure the power cost of HA components | +| Fencing events | Power cycles affect consumption patterns | + +## Prerequisites + +Before deploying, verify on your TNF cluster: + +```bash +# Check OpenShift version (Kepler needs 4.12+) +oc version + +# Check if monitoring is enabled +oc get pods -n openshift-monitoring + +# Check node kernel (eBPF needs 4.18+) +oc debug node/master-0 -- chroot /host uname -r +``` + +## Quick Start + +### From the deploy/ directory + +```bash +# Deploy Kepler with Grafana +make deploy-kepler + +# Remove Kepler +make remove-kepler +``` + +### Using Ansible directly + +```bash +cd deploy/openshift-clusters + +# Deploy with defaults +ansible-playbook kepler.yml -i inventory.ini + +# Deploy without Grafana +ansible-playbook kepler.yml -i inventory.ini -e grafana_enabled=false + +# Remove Kepler 
+ansible-playbook kepler.yml -i inventory.ini -e kepler_state=absent +``` + +### Using the scripts directly + +```bash +# Deploy (with helpful output) +./deploy/openshift-clusters/scripts/deploy-kepler.sh + +# Deploy without Grafana +./deploy/openshift-clusters/scripts/deploy-kepler.sh false + +# Remove +./deploy/openshift-clusters/scripts/remove-kepler.sh +``` + +## Configuration Options + +| Variable | Default | Description | +|----------|---------|-------------| +| `kepler_state` | `present` | Set to `absent` to remove | +| `kepler_namespace` | `kepler` | Namespace for Kepler components | +| `kepler_image` | `quay.io/sustainable_computing_io/kepler:v0.11.3` | Kepler image | +| `kepler_port` | `9188` | Metrics port | +| `grafana_enabled` | `true` | Deploy Grafana with dashboards | +| `grafana_namespace` | `grafana` | Namespace for Grafana | +| `grafana_image` | `docker.io/grafana/grafana:10.4.1` | Grafana image | +| `kepler_scrape_interval` | `30s` | Prometheus scrape interval | + +## Accessing Dashboards + +### Via Grafana (port-forward) + +```bash +# Source cluster credentials +source deploy/openshift-clusters/proxy.env + +# Port-forward Grafana +oc port-forward -n grafana svc/grafana 3000:3000 + +# Open in browser: http://localhost:3000 +# Dashboard: TNF Power Monitoring +``` + +### Via SSH tunnel (for remote hypervisors) + +```bash +ssh -L 3000:localhost:3002 ec2-user@ \ + "pkill -f 'oc port-forward.*grafana' 2>/dev/null; sleep 1; \ + export KUBECONFIG=~/openshift-metal3/dev-scripts/ocp/ostest/auth/kubeconfig && \ + oc port-forward -n grafana svc/grafana 3002:3000" +``` + +### Via OpenShift Console + +1. Navigate to Observe -> Metrics +2. 
Enter query: `kepler_node_cpu_joules_total` + +## Dashboard Panels + +The TNF Power Monitoring dashboard includes 12 panels: + +### Row 1: Key Metrics +| Panel | Description | +|-------|-------------| +| **Total Cluster Power** | Combined power of both nodes | +| **Power by Node** | Individual node consumption | +| **HA Components Power** | etcd, apiserver, MCO overhead | +| **Est. Monthly Cost** | Configurable $/kWh cost estimate | +| **Nodes Online** | Health indicator (2=healthy, 1=degraded) | + +### Row 2: Time Series +| Panel | Description | +|-------|-------------| +| **Power Over Time** | Full-width time series of cluster and node power | + +### Row 3: Workload Analysis +| Panel | Description | +|-------|-------------| +| **Top 10 Workloads by Power** | Non-control-plane workloads | +| **TNF Control Plane Breakdown** | HA component power details | + +### Row 4: TNF-Specific Metrics +| Panel | Description | +|-------|-------------| +| **Node Power Balance** | Imbalance detection (green=balanced, red=imbalanced) | +| **etcd Power** | Dedicated etcd monitoring | +| **Power Trend (1h)** | Increasing/Stable/Decreasing indicator | + +### Row 5: Correlation +| Panel | Description | +|-------|-------------| +| **CPU vs Power Correlation** | Dual-axis chart showing CPU usage vs power | + +### Dashboard Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| Cost per kWh ($) | 0.12 | Electricity cost for monthly estimate | + +### Dashboard Annotations + +The dashboard includes automatic annotations: +- **Red line**: Node down event (fencing, crash) +- **Node state change**: Markers when nodes join/leave + +## Key Metrics + +| Metric | Description | +|--------|-------------| +| `kepler_node_cpu_joules_total` | CPU energy (joules) - use rate() for watts | +| `kepler_node_cpu_watts` | Instantaneous CPU power per node | +| `kepler_container_cpu_joules_total` | Container-level energy | +| `kepler_container_cpu_watts` | Instantaneous 
container power | + +### Useful PromQL Queries + +```promql +# Total cluster power (watts) +sum(rate(kepler_node_cpu_joules_total[5m])) * 60 + +# Power by node +sum by (instance) (rate(kepler_node_cpu_joules_total[5m])) * 60 + +# etcd power +sum(rate(kepler_container_cpu_joules_total{container_name=~".*etcd.*"}[5m])) * 60 + +# Node power balance (% imbalance) +((max(sum by (instance) (rate(kepler_node_cpu_joules_total[5m]))) - + min(sum by (instance) (rate(kepler_node_cpu_joules_total[5m])))) / + (avg(sum by (instance) (rate(kepler_node_cpu_joules_total[5m]))) + 0.0001)) * 100 + +# TNF control plane overhead +sum(rate(kepler_container_cpu_joules_total{ + container_name=~".*apiserver.*|.*etcd.*|.*machine-config.*" +}[5m])) * 60 +``` + +## Claude Code Skill + +A `/tnf-power` skill is available for quick power reports: + +```bash +cd repos/two-node-toolbox +claude + +# In Claude: +/tnf-power +``` + +This generates a formatted power consumption report directly from the CLI. + +## Power Measurement Modes + +Kepler operates in two different modes depending on your infrastructure: + +### Bare Metal (Production TNF) + +``` +Physical Server +├── Intel/AMD CPU with RAPL +├── /sys/class/powercap/intel-rapl/ <- Real hardware registers +└── OpenShift node + └── Kepler <- Reads REAL power in watts +``` + +On bare metal TNF clusters (the intended production deployment): +- Kepler reads **Intel RAPL** (Running Average Power Limit) registers +- Power measurements are **real electrical consumption** from CPU hardware +- DRAM power is also available on supported platforms +- This is the most accurate power monitoring available + +### Virtualized (Dev/Test Environment) + +``` +Hypervisor (bare metal) +├── Real RAPL available here +└── libvirt/QEMU VMs + ├── master-0 (no RAPL access) + │ └── Kepler <- Uses fake-cpu-meter (simulated) + └── master-1 (no RAPL access) + └── Kepler <- Uses fake-cpu-meter (simulated) +``` + +In virtualized environments (dev-scripts, kcli, cloud VMs): +- VMs 
cannot access host RAPL registers (hardware isolation)
+- Kepler uses **fake-cpu-meter** mode for simulated power values
+- Values are proportional to CPU activity, not actual watts
+- Useful for **relative comparisons**, not absolute measurements
+
+### Automatic Detection
+
+The Ansible role automatically detects which mode is active:
+- Checks for RAPL availability in `/sys/class/powercap/intel-rapl/`
+- Displays a clear message indicating the power data mode
+- No configuration required; detection works automatically in both environments
+
+## Known Limitations
+
+1. **Virtual environments**: In VMs (like dev-scripts deployments), Kepler uses fake-cpu-meter mode rather than actual hardware sensors (RAPL). Power readings are proportional to CPU activity but not actual watts. This is a fundamental limitation of virtualization.
+
+2. **Pacemaker/Corosync**: These run as systemd services on the host, not as containers. Their power consumption is included in node-level metrics but not visible as separate containers.
+
+3. **Fencing events**: During a fence operation (node power-off), metrics will be unavailable for that node until it rejoins the cluster.
+
+4. **RAPL coverage**: RAPL measures CPU package and DRAM power. Other components (storage, network, fans) are not included in RAPL readings.
+
+## Troubleshooting
+
+### Kepler pods not starting
+
+```bash
+# Check DaemonSet status
+oc get ds -n kepler
+
+# Check pod logs
+oc logs -n kepler -l app.kubernetes.io/name=kepler-exporter
+```
+
+### No metrics in Prometheus
+
+**Important context**: Kepler uses OpenShift's user workload monitoring, not the platform monitoring stack. The ServiceMonitor must be in the `kepler` namespace (not `openshift-monitoring`), and metrics are queried from `prometheus-user-workload-0` (not `prometheus-k8s-0`).
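+
+If no pods exist in `openshift-user-workload-monitoring` at all, user workload monitoring itself may be disabled. It is enabled via the standard ConfigMap documented in the OpenShift monitoring guide (a minimal sketch; merge with any existing `cluster-monitoring-config` rather than overwriting it):
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cluster-monitoring-config
+  namespace: openshift-monitoring
+data:
+  config.yaml: |
+    enableUserWorkload: true
+```
+
+After `oc apply -f`, the `prometheus-user-workload-*` pods should start in `openshift-user-workload-monitoring` within a minute or two; then re-check the Kepler target below.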
+ +```bash +# Verify ServiceMonitor exists in kepler namespace +oc get servicemonitor -n kepler + +# Check user workload monitoring is running +oc get pods -n openshift-user-workload-monitoring + +# Check if Kepler target is up +oc -n openshift-user-workload-monitoring exec -c prometheus prometheus-user-workload-0 -- \ + curl -s 'http://localhost:9090/api/v1/query?query=up{job="kepler-exporter"}' | jq + +# Query Kepler metrics +oc -n openshift-user-workload-monitoring exec -c prometheus prometheus-user-workload-0 -- \ + curl -s 'http://localhost:9090/api/v1/query?query=kepler_build_info' | jq '.data.result' +``` + +**Note**: The job name is `kepler-exporter` (from the Service name), not `kepler`. + +### Grafana cannot connect to Prometheus + +```bash +# Check Grafana pod logs +oc logs -n grafana -l app=grafana + +# Verify ServiceAccount token exists +oc get secret grafana-token -n grafana + +# Check ClusterRoleBinding +oc get clusterrolebinding grafana-cluster-monitoring-view +``` + +## Architecture + +``` +TNF Cluster +├── Kepler Namespace +│ ├── Kepler DaemonSet (runs on both nodes) +│ ├── Kepler Service (port 9188) +│ ├── ServiceMonitor (for Prometheus) +│ └── ConfigMaps (kepler-cfm, kepler-config) +│ +├── OpenShift User Workload Monitoring +│ ├── Prometheus (scrapes Kepler metrics) +│ └── Thanos Querier +│ +└── Grafana Namespace + ├── Grafana Deployment + ├── Grafana Service & Route + ├── ConfigMap: grafana-datasources (Thanos connection) + └── ConfigMap: tnf-power-dashboard (dashboard JSON) +``` + +## Files + +``` +repos/two-node-toolbox/ +├── .claude/commands/ +│ └── tnf-power.md # Claude skill for power reports +├── docs/kepler/ +│ ├── README.md # This documentation +│ ├── KEPLER-ARCHITECTURE.md # Detailed file-by-file guide +│ └── KEPLER-PRESENTATION.md # Presentation guide +└── deploy/openshift-clusters/ + ├── kepler.yml # Main playbook + ├── scripts/ + │ ├── deploy-kepler.sh # Deployment wrapper + │ └── remove-kepler.sh # Removal wrapper + └── 
roles/kepler/ + ├── defaults/main.yml # Configuration variables + ├── tasks/ + │ ├── main.yml # Kepler deployment + │ ├── monitoring.yml # ServiceMonitor setup + │ └── grafana.yml # Grafana deployment + └── templates/ + ├── kepler-daemonset.yaml.j2 # Kepler DaemonSet + └── tnf-power-dashboard-cm.yaml.j2 # Grafana dashboard +``` + +## Resources + +- [Kepler GitHub](https://github.com/sustainable-computing-io/kepler) +- [Kepler Documentation](https://sustainable-computing.io/) +- [Prometheus Naming Conventions](https://prometheus.io/docs/practices/naming/) From 561c9d31bb79789c0dd8d4fcb154081be46498c9 Mon Sep 17 00:00:00 2001 From: lucaconsalvi Date: Thu, 5 Mar 2026 14:25:13 +0100 Subject: [PATCH 2/2] Fix tnf-power claude skill. --- .claude/commands/tnf-power.md | 69 +++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/.claude/commands/tnf-power.md b/.claude/commands/tnf-power.md index 6b991960..b3fc0807 100644 --- a/.claude/commands/tnf-power.md +++ b/.claude/commands/tnf-power.md @@ -71,34 +71,62 @@ fi ### Step 3: Query Power Metrics -Run these queries against the user workload Prometheus: +Run these queries against the user workload Prometheus. 
+ +**Important label notes for Kepler v0.11.x:** +- Container metrics (`kepler_container_cpu_watts`) have `namespace: kepler` (the exporter's own namespace), NOT the workload namespace +- The `container_name` label holds the pod/process name discovered by Kepler +- The `instance` label identifies which node reported the metric (e.g., `192.168.111.20:9188`) +- To find control plane components, use `container_name` regex matching against known pod prefixes ```bash # Total cluster power (watts) - sum of node CPU power oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ - curl -s 'http://localhost:9090/api/v1/query?query=sum(kepler_node_cpu_watts)' | \ + curl -s 'http://localhost:9090/api/v1/query?query=sum(kepler_node_cpu_watts)' 2>/dev/null | \ jq -r '.data.result[0].value[1] // "0"' # Power by node (using node CPU watts) oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ - curl -s 'http://localhost:9090/api/v1/query?query=sum%20by%20(instance)(kepler_node_cpu_watts)' | \ + curl -s 'http://localhost:9090/api/v1/query?query=sum%20by%20(instance)(kepler_node_cpu_watts)' 2>/dev/null | \ jq -r '.data.result[] | "\(.metric.instance): \(.value[1])W"' -# Top 10 containers by power +# Top 10 containers by power (container_name = pod/process name, instance = node) +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=topk(10,sum%20by%20(container_name,%20instance)(kepler_container_cpu_watts))' 2>/dev/null | \ + jq -r '.data.result[] | "\(.metric.container_name) [\(.metric.instance)]: \(.value[1])W"' + +# Control plane component power (matched by known pod name prefixes) +# etcd oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ - curl -s 'http://localhost:9090/api/v1/query?query=topk(10,kepler_container_cpu_watts)' | \ - jq -r '.data.result[] | 
"\(.metric.container_name): \(.value[1])W"' + curl -s 'http://localhost:9090/api/v1/query?query=sum(kepler_container_cpu_watts%7Bcontainer_name%3D~%22etcd.*%22%7D)' 2>/dev/null | \ + jq -r '"etcd: \(.data.result[0].value[1] // "0")W"' + +# kube-apiserver +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=sum(kepler_container_cpu_watts%7Bcontainer_name%3D~%22kube-apiserver.*%22%7D)' 2>/dev/null | \ + jq -r '"kube-apiserver: \(.data.result[0].value[1] // "0")W"' + +# kube-controller-manager +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=sum(kepler_container_cpu_watts%7Bcontainer_name%3D~%22kube-controller-manager.*%22%7D)' 2>/dev/null | \ + jq -r '"kube-controller-manager: \(.data.result[0].value[1] // "0")W"' + +# kube-scheduler +oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ + curl -s 'http://localhost:9090/api/v1/query?query=sum(kepler_container_cpu_watts%7Bcontainer_name%3D~%22kube-scheduler.*%22%7D)' 2>/dev/null | \ + jq -r '"kube-scheduler: \(.data.result[0].value[1] // "0")W"' # Power over time using joules (rate gives watts) oc exec -n openshift-user-workload-monitoring prometheus-user-workload-0 -c prometheus -- \ - curl -s 'http://localhost:9090/api/v1/query?query=sum(rate(kepler_node_cpu_joules_total[5m]))' | \ + curl -s 'http://localhost:9090/api/v1/query?query=sum(rate(kepler_node_cpu_joules_total%5B5m%5D))' 2>/dev/null | \ jq -r '.data.result[0].value[1] // "0"' ``` **Note on metrics**: -- `kepler_node_cpu_watts` - Instantaneous CPU power per node -- `kepler_container_cpu_watts` - Instantaneous CPU power per container -- `kepler_node_cpu_joules_total` - Cumulative energy (use rate() for watts) +- `kepler_node_cpu_watts` - Instantaneous CPU power per node (labels: `instance`, `zone`) +- `kepler_container_cpu_watts` - 
Instantaneous CPU power per container (labels: `container_name`, `instance`) +- `kepler_node_cpu_joules_total` - Cumulative energy (use `rate()` for watts) +- All container metrics report under `namespace: kepler` — use `container_name` regex to identify workloads In **estimation mode** (VMs without RAPL), values will be very small (microwatts to milliwatts) because they're based on CPU activity models, not real power measurements. On **bare metal with RAPL**, expect realistic values (tens to hundreds of watts). @@ -137,16 +165,17 @@ Present the results in this format: ### TNF Control Plane Overhead | Component | Power (W) | |-----------|-----------| -| openshift-etcd | XX.X | -| openshift-kube-apiserver | XX.X | -| openshift-machine-config-operator | XX.X | - -### Top Namespaces by Power -| Namespace | Power (W) | -|-----------|-----------| -| namespace-1 | XX.X | -| namespace-2 | XX.X | -| ... | ... | +| etcd | XX.X | +| kube-apiserver | XX.X | +| kube-controller-manager | XX.X | +| kube-scheduler | XX.X | + +### Top Containers by Power +| Container (Pod) | Node | Power (W) | +|-----------------|------|-----------| +| container-1 | 192.168.111.20 | XX.X | +| container-2 | 192.168.111.21 | XX.X | +| ... | ... | ... | --- *Note: [If ESTIMATED mode] Power values are ML-based estimates.