diff --git a/playbooks/cluster_healthcheck.yml b/playbooks/cluster_healthcheck.yml
new file mode 100644
index 0000000..dafcabb
--- /dev/null
+++ b/playbooks/cluster_healthcheck.yml
@@ -0,0 +1,13 @@
+---
+# Entry-point playbook: runs the cluster_healthcheck role on the control node
+# (hosts: localhost with a local connection, no fact gathering).
+- name: Run cluster healthchecks
+  hosts: localhost
+  gather_facts: false
+  connection: local
+  tasks:
+    # import_role is static: the role is resolved when the playbook is parsed.
+    - name: Include cluster_healthcheck role
+      ansible.builtin.import_role:
+        name: infra.openshift_virtualization_migration.cluster_healthcheck
+...
diff --git a/roles/cluster_healthcheck/README.md b/roles/cluster_healthcheck/README.md
new file mode 100644
index 0000000..d12ae0d
--- /dev/null
+++ b/roles/cluster_healthcheck/README.md
@@ -0,0 +1,69 @@
+# cluster_healthcheck
+
+```
+Role belongs to infra/openshift_virtualization_migration
+Namespace - infra
+Collection - openshift_virtualization_migration
+```
+
+Description: Cluster health validation for OpenShift Virtualization migration environments.
+
+## Requirements
+
+- OpenShift cluster with `kubeconfig` configured
+- `kubernetes.core` collection installed
+- OpenShift Virtualization (CNV) operator installed
+- Migration Toolkit for Virtualization (MTV) operator installed
+
+## Role Variables
+
+### Defaults
+
+| Variable | Type | Default | Description |
+|----------|------|---------|-------------|
+| `cluster_healthcheck_checks` | list | See defaults/main.yml | List of health checks to run |
+| `cluster_healthcheck_post_migration_vms` | list | `[]` | VMs to check post-migration |
+| `cluster_healthcheck_generate_report` | bool | `true` | Generate HTML report |
+| `cluster_healthcheck_report_path` | str | `/tmp/cluster_healthcheck_report.html` | Report output path |
+| `cluster_healthcheck_mtv_namespace` | str | `openshift-mtv` | MTV operator namespace |
+| `cluster_healthcheck_kubevirt_namespace` | str | `openshift-cnv` | KubeVirt operator namespace |
+| `cluster_healthcheck_ssh_timeout` | int | `10` | SSH check timeout in seconds |
+| `cluster_healthcheck_debug` | bool | `false` | Enable verbose debug output |
+
+### Post-Migration VM Format
+
+```yaml
+cluster_healthcheck_post_migration_vms:
+  - name: my-vm
+    namespace: my-namespace
+    check_ssh: true  # optional, default false
+```
+
+## Health Checks
+
+| Check | Description |
+|-------|-------------|
+| `ocp_node_health` | Node Ready status, resource pressure, kubevirt.io/schedulable label |
+| `kubevirt_health` | HyperConverged CR, virt-* pods, CDI operator |
+| `mtv_health` | ForkliftController, MTV pods, Providers, Plans |
+| `storage_health` | StorageClasses, CSI drivers, PV capacity, pending PVCs |
+| `network_health` | Multus, NADs, OVN/SDN health, migration network |
+
+## Example Playbook
+
+```yaml
+- name: Run cluster healthchecks
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  roles:
+    - role: infra.openshift_virtualization_migration.cluster_healthcheck
+      vars:
+        cluster_healthcheck_post_migration_vms:
+          - name: rhel9-vm
+            namespace: migration-target
+```
+
+## License
+
+GPL-3.0-only
diff --git a/roles/cluster_healthcheck/defaults/main.yml b/roles/cluster_healthcheck/defaults/main.yml
new file mode 100644
index 0000000..f8b58e9
--- /dev/null
+++ b/roles/cluster_healthcheck/defaults/main.yml
@@ -0,0 +1,31 @@
+---
+# defaults file for cluster_healthcheck
+
+# Names of the health checks to run; tasks/main.yml includes the matching
+# task file for each name that appears in this list.
+cluster_healthcheck_checks:
+  - ocp_node_health
+  - kubevirt_health
+  - mtv_health
+  - storage_health
+  - network_health
+
+# VMs to verify after migration (items: name, namespace, optional check_ssh).
+# The post-migration tasks are only included when this list is not empty.
+cluster_healthcheck_post_migration_vms: []
+
+# Whether the report tasks run, and where the rendered HTML report is
+# written.
+cluster_healthcheck_generate_report: true
+cluster_healthcheck_report_path: '/tmp/cluster_healthcheck_report.html'
+
+# Namespaces queried for the MTV and KubeVirt/CDI resources.
+cluster_healthcheck_mtv_namespace: 'openshift-mtv'
+cluster_healthcheck_kubevirt_namespace: 'openshift-cnv'
+
+# Timeout, in seconds, of the optional per-VM SSH port probe.
+cluster_healthcheck_ssh_timeout: 10
+
+# When true, the extra per-resource debug tasks are executed.
+cluster_healthcheck_debug: false
+...
diff --git a/roles/cluster_healthcheck/meta/main.yml b/roles/cluster_healthcheck/meta/main.yml
new file mode 100644
index 0000000..7f4bf14
--- /dev/null
+++ b/roles/cluster_healthcheck/meta/main.yml
@@ -0,0 +1,15 @@
+---
+# Galaxy metadata for the cluster_healthcheck role.
+galaxy_info:
+  author: ""
+  description: >-
+    Cluster health validation for OpenShift Virtualization migration environments.
+  company: Red Hat
+  license: GPL-3.0-only
+  # Quoted so the version is unambiguously a string.
+  min_ansible_version: "2.15.0"
+  galaxy_tags: []
+
+# No role dependencies are declared.
+dependencies: []
+...
diff --git a/roles/cluster_healthcheck/tasks/kubevirt_health.yml b/roles/cluster_healthcheck/tasks/kubevirt_health.yml
new file mode 100644
index 0000000..d8d4df0
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/kubevirt_health.yml
@@ -0,0 +1,188 @@
+---
+# KubeVirt health: evaluates the HyperConverged CR conditions and counts the
+# Running virt-* and CDI pods in cluster_healthcheck_kubevirt_namespace.
+- name: kubevirt_health | Get HyperConverged CR status
+  kubernetes.core.k8s_info:
+    api_version: hco.kubevirt.io/v1beta1
+    kind: HyperConverged
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+  register: __cluster_healthcheck_hco
+
+# Both facts evaluate to false when no HyperConverged CR was returned or the
+# condition is absent, so a missing CR is reported as "NOT Available".
+- name: kubevirt_health | Evaluate HyperConverged conditions
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_hco_available: >-
+      {{ __cluster_healthcheck_hco.resources | length > 0 and
+         __cluster_healthcheck_hco.resources[0].status.conditions | default([])
+         | selectattr('type', 'equalto', 'Available')
+         | map(attribute='status')
+         | first | default('False') == 'True' }}
+    __cluster_healthcheck_hco_degraded: >-
+      {{ __cluster_healthcheck_hco.resources | length > 0 and
+         __cluster_healthcheck_hco.resources[0].status.conditions | default([])
+         | selectattr('type', 'equalto', 'Degraded')
+         | map(attribute='status')
+         | first | default('False') == 'True' }}
+
+- name: kubevirt_health | Report HyperConverged status
+  ansible.builtin.debug:
+    msg: >-
+      HyperConverged CR -
+      Available: {{ __cluster_healthcheck_hco_available }},
+      Degraded: {{ __cluster_healthcheck_hco_degraded }}
+
+- name: kubevirt_health | Check virt-operator pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "kubevirt.io=virt-operator"
+  register: __cluster_healthcheck_virt_operator_pods
+
+- name: kubevirt_health | Check virt-controller pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "kubevirt.io=virt-controller"
+  register: __cluster_healthcheck_virt_controller_pods
+
+- name: kubevirt_health | Check virt-handler pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "kubevirt.io=virt-handler"
+  register: __cluster_healthcheck_virt_handler_pods
+
+- name: kubevirt_health | Check virt-api pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "kubevirt.io=virt-api"
+  register: __cluster_healthcheck_virt_api_pods
+
+# Running/total pod counts per component; read by the result task below.
+- name: kubevirt_health | Evaluate KubeVirt pod health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_kubevirt_pods:
+      virt_operator:
+        running: "{{ __cluster_healthcheck_virt_operator_pods.resources
+          | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_virt_operator_pods.resources | length }}"
+      virt_controller:
+        running: "{{ __cluster_healthcheck_virt_controller_pods.resources
+          | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_virt_controller_pods.resources | length }}"
+      virt_handler:
+        running: "{{ __cluster_healthcheck_virt_handler_pods.resources
+          | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_virt_handler_pods.resources | length }}"
+      virt_api:
+        running: "{{ __cluster_healthcheck_virt_api_pods.resources
+          | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_virt_api_pods.resources | length }}"
+
+- name: kubevirt_health | Check CDI operator pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "app.kubernetes.io/component=cdi-operator"
+  register: __cluster_healthcheck_cdi_operator_pods
+
+- name: kubevirt_health | Check CDI deployment pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "app.kubernetes.io/component=cdi-deployment"
+  register: __cluster_healthcheck_cdi_deployment_pods
+
+- name: kubevirt_health | Evaluate CDI health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_cdi_pods:
+      cdi_operator:
+        running: "{{ __cluster_healthcheck_cdi_operator_pods.resources
+          | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_cdi_operator_pods.resources | length }}"
+      cdi_deployment:
+        running: "{{ __cluster_healthcheck_cdi_deployment_pods.resources
+          | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_cdi_deployment_pods.resources | length }}"
+
+# The per-check details are built once as a task variable so the overall
+# status can be derived from them: any failed detail fails the whole check.
+# This keeps the summary from reporting "pass" while a component listed in
+# the details (for example virt-handler or CDI) has no Running pods.
+- name: kubevirt_health | Set kubevirt health result
+  vars:
+    __cluster_healthcheck_kubevirt_details: >-
+      {{ [
+        { 'check': 'HyperConverged Available',
+          'status': ('pass' if __cluster_healthcheck_hco_available else 'fail'),
+          'message': ('HyperConverged CR is Available'
+                      if __cluster_healthcheck_hco_available
+                      else 'HyperConverged CR is NOT Available') },
+        { 'check': 'HyperConverged Not Degraded',
+          'status': ('fail' if __cluster_healthcheck_hco_degraded else 'pass'),
+          'message': ('HyperConverged CR is Degraded'
+                      if __cluster_healthcheck_hco_degraded
+                      else 'HyperConverged CR is not Degraded') },
+        { 'check': 'virt-operator',
+          'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_operator.running | int > 0
+                     else 'fail'),
+          'message': (__cluster_healthcheck_kubevirt_pods.virt_operator.running | string +
+                      '/' + __cluster_healthcheck_kubevirt_pods.virt_operator.total | string +
+                      ' pods Running') },
+        { 'check': 'virt-controller',
+          'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_controller.running | int > 0
+                     else 'fail'),
+          'message': (__cluster_healthcheck_kubevirt_pods.virt_controller.running | string +
+                      '/' + __cluster_healthcheck_kubevirt_pods.virt_controller.total | string +
+                      ' pods Running') },
+        { 'check': 'virt-handler',
+          'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_handler.running | int > 0
+                     else 'fail'),
+          'message': (__cluster_healthcheck_kubevirt_pods.virt_handler.running | string +
+                      '/' + __cluster_healthcheck_kubevirt_pods.virt_handler.total | string +
+                      ' pods Running') },
+        { 'check': 'virt-api',
+          'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_api.running | int > 0
+                     else 'fail'),
+          'message': (__cluster_healthcheck_kubevirt_pods.virt_api.running | string +
+                      '/' + __cluster_healthcheck_kubevirt_pods.virt_api.total | string +
+                      ' pods Running') },
+        { 'check': 'CDI Operator',
+          'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_operator.running | int > 0
+                     else 'fail'),
+          'message': (__cluster_healthcheck_cdi_pods.cdi_operator.running | string +
+                      '/' + __cluster_healthcheck_cdi_pods.cdi_operator.total | string +
+                      ' pods Running') },
+        { 'check': 'CDI Deployment',
+          'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_deployment.running | int > 0
+                     else 'fail'),
+          'message': (__cluster_healthcheck_cdi_pods.cdi_deployment.running | string +
+                      '/' + __cluster_healthcheck_cdi_pods.cdi_deployment.total | string +
+                      ' pods Running') }
+      ] }}
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+        'kubevirt_health': {
+          'status': ('fail' if (__cluster_healthcheck_kubevirt_details
+                                | selectattr('status', 'equalto', 'fail')
+                                | list | length > 0)
+                     else 'pass'),
+          'details': __cluster_healthcheck_kubevirt_details
+        }
+      }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/main.yml b/roles/cluster_healthcheck/tasks/main.yml
new file mode 100644
index 0000000..3bb1475
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/main.yml
@@ -0,0 +1,62 @@
+---
+# tasks file for cluster_healthcheck
+
+# Every check starts out as "skipped"; each included task file then replaces
+# its own key, so report.yml can read all six entries whatever was selected.
+- name: Initialize healthcheck results
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results:
+      ocp_node_health:
+        status: skipped
+        details: []
+      kubevirt_health:
+        status: skipped
+        details: []
+      mtv_health:
+        status: skipped
+        details: []
+      storage_health:
+        status: skipped
+        details: []
+      network_health:
+        status: skipped
+        details: []
+      post_migration_vm:
+        status: skipped
+        details: []
+
+# One include per check, run only when the check is listed in
+# cluster_healthcheck_checks.
+- name: Include ocp_node_health tasks
+  ansible.builtin.include_tasks: ocp_node_health.yml
+  when: "'ocp_node_health' in cluster_healthcheck_checks"
+
+- name: Include kubevirt_health tasks
+  ansible.builtin.include_tasks: kubevirt_health.yml
+  when: "'kubevirt_health' in cluster_healthcheck_checks"
+
+- name: Include mtv_health tasks
+  ansible.builtin.include_tasks: mtv_health.yml
+  when: "'mtv_health' in cluster_healthcheck_checks"
+
+- name: Include storage_health tasks
+  ansible.builtin.include_tasks: storage_health.yml
+  when: "'storage_health' in cluster_healthcheck_checks"
+
+- name: Include network_health tasks
+  ansible.builtin.include_tasks: network_health.yml
+  when: "'network_health' in cluster_healthcheck_checks"
+
+# Post-migration checks are driven by the VM list, not by
+# cluster_healthcheck_checks.
+- name: Include post_migration_vm tasks
+  ansible.builtin.include_tasks: post_migration_vm.yml
+  when: cluster_healthcheck_post_migration_vms | length > 0
+
+# "| bool" makes a string value such as "false" (for example one passed with
+# "-e cluster_healthcheck_generate_report=false") disable the report instead
+# of being evaluated as a non-empty, truthy string.
+- name: Include report tasks
+  ansible.builtin.include_tasks: report.yml
+  when: cluster_healthcheck_generate_report | bool
+...
diff --git a/roles/cluster_healthcheck/tasks/mtv_health.yml b/roles/cluster_healthcheck/tasks/mtv_health.yml
new file mode 100644
index 0000000..4aa5235
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/mtv_health.yml
@@ -0,0 +1,137 @@
+---
+# MTV health: ForkliftController condition, operator pods, Provider readiness
+# and failed migration Plans in cluster_healthcheck_mtv_namespace.
+- name: mtv_health | Check ForkliftController CR
+  kubernetes.core.k8s_info:
+    api_version: forklift.konveyor.io/v1beta1
+    kind: ForkliftController
+    namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+  register: __cluster_healthcheck_forklift_controller
+
+# Healthy means the first returned ForkliftController has a condition of
+# type "Successful" whose status is "True"; a missing CR evaluates to false.
+- name: mtv_health | Evaluate ForkliftController health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_forklift_healthy: >-
+      {{ __cluster_healthcheck_forklift_controller.resources | length > 0 and
+         __cluster_healthcheck_forklift_controller.resources[0].status.conditions | default([])
+         | selectattr('type', 'equalto', 'Successful')
+         | map(attribute='status')
+         | first | default('False') == 'True' }}
+
+- name: mtv_health | Report ForkliftController status
+  ansible.builtin.debug:
+    msg: >-
+      ForkliftController -
+      Found: {{ __cluster_healthcheck_forklift_controller.resources | length > 0 }},
+      Healthy: {{ __cluster_healthcheck_forklift_healthy }}
+
+- name: mtv_health | Check MTV operator pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+    label_selectors:
+      - "app=forklift"
+  register: __cluster_healthcheck_mtv_pods
+
+- name: mtv_health | Evaluate MTV operator pod status
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_mtv_pods_running: >-
+      {{ __cluster_healthcheck_mtv_pods.resources
+         | selectattr('status.phase', 'equalto', 'Running')
+         | list | length }}
+    __cluster_healthcheck_mtv_pods_total: >-
+      {{ __cluster_healthcheck_mtv_pods.resources | length }}
+
+- name: mtv_health | Check Provider CRs
+  kubernetes.core.k8s_info:
+    api_version: forklift.konveyor.io/v1beta1
+    kind: Provider
+    namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+  register: __cluster_healthcheck_providers
+
+# Collects the names of Providers whose "Ready" condition is not "True";
+# a Provider without that condition counts as not ready.
+- name: mtv_health | Evaluate Provider readiness
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_providers_not_ready: >-
+      {% set not_ready = [] -%}
+      {% for provider in __cluster_healthcheck_providers.resources -%}
+      {% set conditions = provider.status.conditions | default([]) -%}
+      {% set ready = conditions | selectattr('type', 'equalto', 'Ready')
+           | map(attribute='status') | first | default('False') -%}
+      {% if ready != 'True' -%}
+      {% if not_ready.append(provider.metadata.name) -%}{% endif -%}
+      {% endif -%}
+      {% endfor -%}
+      {{ not_ready }}
+
+- name: mtv_health | Check for failed migration Plans
+  kubernetes.core.k8s_info:
+    api_version: forklift.konveyor.io/v1beta1
+    kind: Plan
+    namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+  register: __cluster_healthcheck_plans
+
+# Each Plan's conditions are filtered by type and status. Testing the list
+# with "contains" against a two-key dict only matches a condition that has
+# exactly those two keys, so any extra field would hide a failed Plan.
+- name: mtv_health | Evaluate failed Plans
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_failed_plans: >-
+      {% set failed_plans = [] -%}
+      {% for plan in __cluster_healthcheck_plans.resources -%}
+      {% set is_failed = plan.status.conditions | default([])
+           | selectattr('type', 'equalto', 'Failed')
+           | map(attribute='status') | first | default('False') -%}
+      {% if is_failed == 'True' -%}
+      {% if failed_plans.append(plan.metadata.name) -%}{% endif -%}
+      {% endif -%}
+      {% endfor -%}
+      {{ failed_plans }}
+
+- name: mtv_health | Report failed Plans
+  ansible.builtin.debug:
+    msg: "Plan {{ item }} is in Failed state"
+  loop: "{{ __cluster_healthcheck_failed_plans }}"
+
+- name: mtv_health | Set MTV health result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+        'mtv_health': {
+          'status': ('fail' if (__cluster_healthcheck_forklift_controller.resources | length == 0 or
+                                not __cluster_healthcheck_forklift_healthy or
+                                __cluster_healthcheck_mtv_pods_running | int == 0)
+                     else ('warning' if (__cluster_healthcheck_providers_not_ready | length > 0 or
+                                         __cluster_healthcheck_failed_plans | length > 0)
+                           else 'pass')),
+          'details': [
+            { 'check': 'ForkliftController',
+              'status': ('pass' if (__cluster_healthcheck_forklift_controller.resources | length > 0 and
+                                    __cluster_healthcheck_forklift_healthy) else 'fail'),
+              'message': ('ForkliftController is healthy'
+                          if __cluster_healthcheck_forklift_healthy
+                          else 'ForkliftController is NOT healthy or missing') },
+            { 'check': 'MTV Operator Pods',
+              'status': ('pass' if __cluster_healthcheck_mtv_pods_running | int > 0 else 'fail'),
+              'message': (__cluster_healthcheck_mtv_pods_running | string + '/' +
+                          __cluster_healthcheck_mtv_pods_total | string + ' pods Running') },
+            { 'check': 'Providers Ready',
+              'status': ('warning' if __cluster_healthcheck_providers_not_ready | length > 0 else 'pass'),
+              'message': ((__cluster_healthcheck_providers_not_ready | length | string) +
+                          ' provider(s) not Ready: ' +
+                          __cluster_healthcheck_providers_not_ready | join(', '))
+                         if __cluster_healthcheck_providers_not_ready | length > 0
+                         else 'All providers Ready' },
+            { 'check': 'Failed Plans',
+              'status': ('warning' if __cluster_healthcheck_failed_plans | length > 0 else 'pass'),
+              'message': ((__cluster_healthcheck_failed_plans | length | string) +
+                          ' plan(s) in Failed state')
+                         if __cluster_healthcheck_failed_plans | length > 0
+                         else 'No failed plans' }
+          ]
+        }
+      }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/network_health.yml b/roles/cluster_healthcheck/tasks/network_health.yml
new file mode 100644
index 0000000..ca2aead
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/network_health.yml
@@ -0,0 +1,141 @@
+---
+# Network health: Multus pods, NetworkAttachmentDefinitions, the cluster SDN
+# (OVN-Kubernetes, with OpenShiftSDN as fallback) and the live-migration
+# network configured in the HyperConverged CR.
+- name: network_health | Check Multus pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: openshift-multus
+    label_selectors:
+      - "app=multus"
+  register: __cluster_healthcheck_multus_pods
+
+- name: network_health | Evaluate Multus pod health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_multus_running: >-
+      {{ __cluster_healthcheck_multus_pods.resources
+         | selectattr('status.phase', 'equalto', 'Running')
+         | list | length }}
+    __cluster_healthcheck_multus_total: >-
+      {{ __cluster_healthcheck_multus_pods.resources | length }}
+
+# No namespace is passed here, so the query is not restricted to one.
+- name: network_health | List NetworkAttachmentDefinitions
+  kubernetes.core.k8s_info:
+    api_version: k8s.cni.cncf.io/v1
+    kind: NetworkAttachmentDefinition
+  register: __cluster_healthcheck_nad
+
+- name: network_health | Report NetworkAttachmentDefinitions
+  ansible.builtin.debug:
+    msg: >-
+      NetworkAttachmentDefinition: {{ item.metadata.namespace }}/{{ item.metadata.name }}
+  loop: "{{ __cluster_healthcheck_nad.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  when: cluster_healthcheck_debug
+
+- name: network_health | Check OVN-Kubernetes pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: openshift-ovn-kubernetes
+    label_selectors:
+      - "app=ovnkube-node"
+  register: __cluster_healthcheck_ovn_pods
+
+- name: network_health | Check OpenShiftSDN pods as fallback
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: openshift-sdn
+    label_selectors:
+      - "app=sdn"
+  register: __cluster_healthcheck_sdn_pods
+  when: __cluster_healthcheck_ovn_pods.resources | length == 0
+
+# The fallback query above may have been skipped, hence the default([])
+# guards wherever its registered result is read.
+- name: network_health | Evaluate SDN health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_sdn_type: >-
+      {{ 'ovn-kubernetes' if __cluster_healthcheck_ovn_pods.resources | length > 0
+         else ('openshift-sdn' if (__cluster_healthcheck_sdn_pods.resources | default([]) | length > 0)
+               else 'unknown') }}
+    __cluster_healthcheck_sdn_running: >-
+      {{ (__cluster_healthcheck_ovn_pods.resources
+          | selectattr('status.phase', 'equalto', 'Running')
+          | list | length) if __cluster_healthcheck_ovn_pods.resources | length > 0
+         else (__cluster_healthcheck_sdn_pods.resources | default([])
+               | selectattr('status.phase', 'equalto', 'Running')
+               | list | length) }}
+
+- name: network_health | Get HyperConverged CR for migration network config
+  kubernetes.core.k8s_info:
+    api_version: hco.kubevirt.io/v1beta1
+    kind: HyperConverged
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+  register: __cluster_healthcheck_hco_network
+
+- name: network_health | Extract configured migration network
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_migration_network: >-
+      {{ __cluster_healthcheck_hco_network.resources[0].spec.liveMigrationConfig.network
+         | default('') }}
+  when: __cluster_healthcheck_hco_network.resources | length > 0
+
+# Look the NAD up by the configured name; listing every NAD in the namespace
+# would make the check pass whenever any unrelated NAD exists there.
+- name: network_health | Check migration network NAD
+  kubernetes.core.k8s_info:
+    api_version: k8s.cni.cncf.io/v1
+    kind: NetworkAttachmentDefinition
+    name: "{{ __cluster_healthcheck_migration_network }}"
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+  register: __cluster_healthcheck_migration_nad
+  when: __cluster_healthcheck_migration_network | default('') | length > 0
+
+# Details are built once as a task variable and the overall status is derived
+# from them (fail, then warning, then pass), so the summary cannot disagree
+# with the individual checks. The SDN check also fails when no pod is Running.
+- name: network_health | Set network health result
+  vars:
+    __cluster_healthcheck_network_details: >-
+      {{ [
+        { 'check': 'Multus Pods',
+          'status': ('pass' if __cluster_healthcheck_multus_running | int > 0 else 'warning'),
+          'message': (__cluster_healthcheck_multus_running | string + '/' +
+                      __cluster_healthcheck_multus_total | string + ' Multus pods Running') },
+        { 'check': 'SDN Type',
+          'status': ('pass' if (__cluster_healthcheck_sdn_type != 'unknown' and
+                                __cluster_healthcheck_sdn_running | int > 0) else 'fail'),
+          'message': ('SDN: ' + __cluster_healthcheck_sdn_type + ', ' +
+                      __cluster_healthcheck_sdn_running | string + ' pods Running') },
+        { 'check': 'NetworkAttachmentDefinitions',
+          'status': 'pass',
+          'message': (__cluster_healthcheck_nad.resources | length | string +
+                      ' NAD(s) found across cluster') },
+        { 'check': 'Migration Network',
+          'status': ('pass' if (__cluster_healthcheck_migration_network | default('') | length == 0 or
+                                (__cluster_healthcheck_migration_nad.resources | default([]) | length > 0))
+                     else 'warning'),
+          'message': ('No dedicated migration network configured in HyperConverged CR'
+                      if __cluster_healthcheck_migration_network | default('') | length == 0
+                      else ((__cluster_healthcheck_migration_nad.resources | default([]) | length | string) +
+                            ' NAD(s) in ' + cluster_healthcheck_kubevirt_namespace +
+                            ' for migration network ' + __cluster_healthcheck_migration_network)) }
+      ] }}
+    __cluster_healthcheck_network_statuses: >-
+      {{ __cluster_healthcheck_network_details | map(attribute='status') | list }}
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+        'network_health': {
+          'status': ('fail' if 'fail' in __cluster_healthcheck_network_statuses
+                     else ('warning' if 'warning' in __cluster_healthcheck_network_statuses
+                           else 'pass')),
+          'details': __cluster_healthcheck_network_details
+        }
+      }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/ocp_node_health.yml b/roles/cluster_healthcheck/tasks/ocp_node_health.yml
new file mode 100644
index 0000000..0bf99b7
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/ocp_node_health.yml
@@ -0,0 +1,137 @@
+---
+# Node health: Ready condition, resource-pressure conditions, a capacity
+# overview and the kubevirt.io/schedulable label on worker nodes.
+- name: ocp_node_health | Get all cluster nodes
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Node
+  register: __cluster_healthcheck_nodes
+
+# The two list facts below are grown item by item with set_fact; start from
+# empty lists so values left over from an earlier run are not reported again.
+- name: ocp_node_health | Reset node accumulators
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_pressure_nodes: []
+    __cluster_healthcheck_capacity_info: []
+
+# Each node is judged on its own condition list; a node that has no Ready
+# condition at all is reported as not Ready.
+- name: ocp_node_health | Evaluate node Ready status
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_nodes_not_ready: >-
+      {% set not_ready = [] -%}
+      {% for node in __cluster_healthcheck_nodes.resources -%}
+      {% set ready = node.status.conditions | default([])
+           | selectattr('type', 'equalto', 'Ready')
+           | map(attribute='status') | first | default('Unknown') -%}
+      {% if ready != 'True' -%}
+      {% if not_ready.append(node.metadata.name) -%}{% endif -%}
+      {% endif -%}
+      {% endfor -%}
+      {{ not_ready }}
+
+- name: ocp_node_health | Report nodes not Ready
+  ansible.builtin.debug:
+    msg: "Node {{ item }} is NOT Ready"
+  loop: "{{ __cluster_healthcheck_nodes_not_ready }}"
+
+- name: ocp_node_health | Check for resource pressure conditions
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_pressure_nodes: >-
+      {{ __cluster_healthcheck_pressure_nodes | default([]) +
+         [{ 'name': item.metadata.name,
+            'pressures': item.status.conditions | default([])
+            | selectattr('type', 'in', ['MemoryPressure', 'DiskPressure', 'PIDPressure'])
+            | selectattr('status', 'equalto', 'True')
+            | map(attribute='type')
+            | list }] }}
+  loop: "{{ __cluster_healthcheck_nodes.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  when: >-
+    item.status.conditions | default([])
+    | selectattr('type', 'in', ['MemoryPressure', 'DiskPressure', 'PIDPressure'])
+    | selectattr('status', 'equalto', 'True')
+    | list | length > 0
+
+- name: ocp_node_health | Report nodes with resource pressure
+  ansible.builtin.debug:
+    msg: "Node {{ item.name }} has pressure conditions: {{ item.pressures | join(', ') }}"
+  loop: "{{ __cluster_healthcheck_pressure_nodes | default([]) }}"
+  loop_control:
+    label: "{{ item.name }}"
+
+- name: ocp_node_health | Check allocatable vs capacity ratios
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_capacity_info: >-
+      {{ __cluster_healthcheck_capacity_info | default([]) +
+         [{ 'name': item.metadata.name,
+            'cpu_allocatable': item.status.allocatable.cpu | default('0'),
+            'cpu_capacity': item.status.capacity.cpu | default('0'),
+            'memory_allocatable': item.status.allocatable.memory | default('0Ki'),
+            'memory_capacity': item.status.capacity.memory | default('0Ki') }] }}
+  loop: "{{ __cluster_healthcheck_nodes.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+
+- name: ocp_node_health | Display capacity information
+  ansible.builtin.debug:
+    msg: >-
+      Node {{ item.name }} -
+      CPU: {{ item.cpu_allocatable }}/{{ item.cpu_capacity }},
+      Memory: {{ item.memory_allocatable }}/{{ item.memory_capacity }}
+  loop: "{{ __cluster_healthcheck_capacity_info | default([]) }}"
+  loop_control:
+    label: "{{ item.name }}"
+  when: cluster_healthcheck_debug
+
+# Worker nodes are selected with a key-membership test on the labels dict.
+# A dotted attribute path cannot be used: the label key itself contains dots,
+# so the path would be split inside the key and never resolve.
+- name: ocp_node_health | Verify worker nodes have kubevirt.io/schedulable label
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_workers_not_schedulable: >-
+      {{ __cluster_healthcheck_nodes.resources
+         | selectattr('metadata.labels', 'defined')
+         | selectattr('metadata.labels', 'ansible.builtin.contains',
+                      'node-role.kubernetes.io/worker')
+         | rejectattr('metadata.labels', 'ansible.builtin.contains', 'kubevirt.io/schedulable')
+         | map(attribute='metadata.name')
+         | list }}
+
+- name: ocp_node_health | Report workers missing kubevirt.io/schedulable label
+  ansible.builtin.debug:
+    msg: "Worker node {{ item }} is missing the kubevirt.io/schedulable label"
+  loop: "{{ __cluster_healthcheck_workers_not_schedulable }}"
+
+- name: ocp_node_health | Set node health result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+        'ocp_node_health': {
+          'status': ('fail' if (__cluster_healthcheck_nodes_not_ready | length > 0 or
+                                __cluster_healthcheck_pressure_nodes | default([]) | length > 0)
+                     else ('warning' if __cluster_healthcheck_workers_not_schedulable | length > 0
+                           else 'pass')),
+          'details': [
+            { 'check': 'Nodes Ready',
+              'status': ('fail' if __cluster_healthcheck_nodes_not_ready | length > 0 else 'pass'),
+              'message': ((__cluster_healthcheck_nodes_not_ready | length | string) + ' node(s) not Ready')
+                         if __cluster_healthcheck_nodes_not_ready | length > 0
+                         else 'All nodes Ready' },
+            { 'check': 'Resource Pressure',
+              'status': ('fail' if __cluster_healthcheck_pressure_nodes | default([]) | length > 0 else 'pass'),
+              'message': ((__cluster_healthcheck_pressure_nodes | default([]) | length | string) +
+                          ' node(s) with resource pressure')
+                         if __cluster_healthcheck_pressure_nodes | default([]) | length > 0
+                         else 'No resource pressure detected' },
+            { 'check': 'KubeVirt Schedulable',
+              'status': ('warning' if __cluster_healthcheck_workers_not_schedulable | length > 0 else 'pass'),
+              'message': ((__cluster_healthcheck_workers_not_schedulable | length | string) +
+                          ' worker(s) missing kubevirt.io/schedulable label')
+                         if __cluster_healthcheck_workers_not_schedulable | length > 0
+                         else 'All workers have kubevirt.io/schedulable label' }
+          ]
+        }
+      }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/post_migration_vm.yml b/roles/cluster_healthcheck/tasks/post_migration_vm.yml
new file mode 100644
index 0000000..61728d8
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/post_migration_vm.yml
@@ -0,0 +1,97 @@
+---
+# Post-migration VM checks: for every entry of
+# cluster_healthcheck_post_migration_vms, look up its VirtualMachineInstance
+# and record phase, guest OS info and whether any interface reports an IP.
+- name: post_migration_vm | Check VirtualMachineInstance status
+  kubernetes.core.k8s_info:
+    api_version: kubevirt.io/v1
+    kind: VirtualMachineInstance
+    name: "{{ __cluster_healthcheck_vm.name }}"
+    namespace: "{{ __cluster_healthcheck_vm.namespace }}"
+  register: __cluster_healthcheck_vmi
+  loop: "{{ cluster_healthcheck_post_migration_vms }}"
+  loop_control:
+    loop_var: __cluster_healthcheck_vm
+    label: "{{ __cluster_healthcheck_vm.namespace }}/{{ __cluster_healthcheck_vm.name }}"
+
+# The result list is appended to item by item; clear it first so entries left
+# over from an earlier run of these tasks are not reported again.
+- name: post_migration_vm | Reset VM results
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_vm_results: []
+
+- name: post_migration_vm | Evaluate VM status
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_vm_results: >-
+      {{ __cluster_healthcheck_vm_results | default([]) + [{
+        'name': __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.namespace + '/' +
+                __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.name,
+        'running': (__cluster_healthcheck_vmi_item.resources | length > 0 and
+                    __cluster_healthcheck_vmi_item.resources[0].status.phase | default('') == 'Running'),
+        'guest_agent': (__cluster_healthcheck_vmi_item.resources | length > 0 and
+                        __cluster_healthcheck_vmi_item.resources[0].status.guestOSInfo
+                        | default({}) | length > 0),
+        'interfaces': (__cluster_healthcheck_vmi_item.resources | length > 0 and
+                       __cluster_healthcheck_vmi_item.resources[0].status.interfaces | default([])
+                       | selectattr('ipAddress', 'defined')
+                       | list | length > 0)
+      }] }}
+  loop: "{{ __cluster_healthcheck_vmi.results }}"
+  loop_control:
+    loop_var: __cluster_healthcheck_vmi_item
+    label: >-
+      {{ __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.namespace }}/{{
+      __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.name }}
+
+- name: post_migration_vm | Report VM status
+  ansible.builtin.debug:
+    msg: >-
+      VM {{ item.name }} -
+      Running: {{ item.running }},
+      Guest Agent: {{ item.guest_agent }},
+      Network: {{ item.interfaces }}
+  loop: "{{ __cluster_healthcheck_vm_results | default([]) }}"
+  loop_control:
+    label: "{{ item.name }}"
+
+# Probe the first interface that actually reports an ipAddress, and only when
+# one exists; the first listed interface is not guaranteed to carry one.
+# Failures are ignored on purpose: the probe is best-effort.
+- name: post_migration_vm | Optional SSH connectivity check
+  ansible.builtin.wait_for:
+    host: >-
+      {{ __cluster_healthcheck_vmi_item.resources[0].status.interfaces
+         | selectattr('ipAddress', 'defined')
+         | map(attribute='ipAddress') | first }}
+    port: 22
+    timeout: "{{ cluster_healthcheck_ssh_timeout }}"
+    state: started
+  loop: "{{ __cluster_healthcheck_vmi.results }}"
+  loop_control:
+    loop_var: __cluster_healthcheck_vmi_item
+    label: >-
+      {{ __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.namespace }}/{{
+      __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.name }}
+  when:
+    - __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.check_ssh | default(false) | bool
+    - __cluster_healthcheck_vmi_item.resources | length > 0
+    - >-
+      __cluster_healthcheck_vmi_item.resources[0].status.interfaces | default([])
+      | selectattr('ipAddress', 'defined') | list | length > 0
+  ignore_errors: true  # noqa: ignore-errors
+  register: __cluster_healthcheck_ssh_results
+
+- name: post_migration_vm | Set post-migration VM result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+        'post_migration_vm': {
+          'status': ('fail' if (__cluster_healthcheck_vm_results | default([])
+                                | selectattr('running', 'false') | list | length > 0)
+                     else ('warning' if (__cluster_healthcheck_vm_results | default([])
+                                         | selectattr('guest_agent', 'false') | list | length > 0)
+                           else 'pass')),
+          'details': __cluster_healthcheck_vm_results | default([])
+        }
+      }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/report.yml b/roles/cluster_healthcheck/tasks/report.yml
new file mode 100644
index 0000000..3b04aa7
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/report.yml
@@ -0,0 +1,22 @@
+---
+# Print a one-line status overview of all check categories.
+# A category is only present in __cluster_healthcheck_results after its task
+# file has merged a result into it, so a check that did not run is reported as
+# "skipped" instead of failing the play on an undefined attribute.
+- name: report | Display healthcheck summary
+  ansible.builtin.debug:
+    msg: >-
+      Healthcheck Summary -
+      Nodes: {{ __cluster_healthcheck_results.ocp_node_health.status | default('skipped') }},
+      KubeVirt: {{ __cluster_healthcheck_results.kubevirt_health.status | default('skipped') }},
+      MTV: {{ __cluster_healthcheck_results.mtv_health.status | default('skipped') }},
+      Storage: {{ __cluster_healthcheck_results.storage_health.status | default('skipped') }},
+      Network: {{ __cluster_healthcheck_results.network_health.status | default('skipped') }},
+      Post-Migration VMs: {{ __cluster_healthcheck_results.post_migration_vm.status | default('skipped') }}
+
+# Render the HTML report template to the configured report path.
+- name: report | Generate HTML healthcheck report
+  ansible.builtin.template:
+    dest: "{{ cluster_healthcheck_report_path }}"
+    src: cluster_healthcheck_report.html.j2
+    mode: "0644"
+
+# Tell the operator where the rendered report was written.
+- name: report | Report file location
+  ansible.builtin.debug:
+    msg: >-
+      Healthcheck report written to {{ cluster_healthcheck_report_path }}
+...
diff --git a/roles/cluster_healthcheck/tasks/storage_health.yml b/roles/cluster_healthcheck/tasks/storage_health.yml
new file mode 100644
index 0000000..abc1882
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/storage_health.yml
@@ -0,0 +1,109 @@
+---
+# Fetch all StorageClass objects; later tasks read the registered `resources`.
+- name: storage_health | Get StorageClass resources
+  register: __cluster_healthcheck_storage_classes
+  kubernetes.core.k8s_info:
+    kind: StorageClass
+    api_version: storage.k8s.io/v1
+
+# Collect the StorageClasses that are actually marked as default.
+# A class only counts as default when the is-default-class annotation has the
+# string value "true"; a class annotated "false" must not be counted, so the
+# value is checked and not merely the presence of the key. The annotation key
+# itself contains dots and cannot be addressed through a dotted selectattr
+# path, so each annotation map is expanded with dict2items and matched on key
+# and value instead.
+- name: storage_health | Check for default StorageClass
+  vars:
+    __cluster_healthcheck_default_sc_key: storageclass.kubernetes.io/is-default-class
+    # One integer per StorageClass, in the same order as the resources list:
+    # 1 when the annotation is present with value "true", otherwise 0.
+    __cluster_healthcheck_default_sc_flags: >-
+      {{ __cluster_healthcheck_storage_classes.resources
+         | map(attribute='metadata.annotations', default={})
+         | map('ansible.builtin.dict2items')
+         | map('selectattr', 'key', 'equalto', __cluster_healthcheck_default_sc_key)
+         | map('selectattr', 'value', 'equalto', 'true')
+         | map('list') | map('length') | list }}
+  ansible.builtin.set_fact:
+    # Pair every StorageClass with its flag, keep the pairs whose flag is
+    # non-zero, and return the StorageClass objects themselves.
+    __cluster_healthcheck_default_sc: >-
+      {{ __cluster_healthcheck_storage_classes.resources
+         | zip(__cluster_healthcheck_default_sc_flags)
+         | selectattr(1) | map('first') | list }}
+
+# List every StorageClass with its provisioner and whether its
+# is-default-class annotation is set to the string "true".
+- name: storage_health | Report StorageClasses
+  loop: "{{ __cluster_healthcheck_storage_classes.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  ansible.builtin.debug:
+    msg: >-
+      StorageClass: {{ item.metadata.name }}, Provisioner: {{ item.provisioner }},
+      Default: {{ ((item.metadata.annotations | default({})).get(
+      'storageclass.kubernetes.io/is-default-class', 'false') == 'true')
+      | ternary('Yes', 'No') }}
+
+# Fetch the CSIDriver API objects registered with the cluster. Only the
+# CSIDriver resources are listed; the pods backing the drivers are not
+# inspected, hence the task name refers to resources rather than pods.
+- name: storage_health | Get CSIDriver resources
+  kubernetes.core.k8s_info:
+    api_version: storage.k8s.io/v1
+    kind: CSIDriver
+  register: __cluster_healthcheck_csi_drivers
+
+# Print the name of each CSIDriver object that was returned.
+- name: storage_health | Report CSI drivers
+  loop: "{{ __cluster_healthcheck_csi_drivers.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  ansible.builtin.debug:
+    msg: >-
+      CSI Driver: {{ item.metadata.name }}
+
+# Fetch all PersistentVolume objects for the capacity evaluation below.
+- name: storage_health | Get PersistentVolumes
+  register: __cluster_healthcheck_pvs
+  kubernetes.core.k8s_info:
+    kind: PersistentVolume
+    api_version: v1
+
+# Count the PersistentVolumes: the total number returned, and how many of
+# them report the phase "Available".
+- name: storage_health | Evaluate PV capacity
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_pv_total: >-
+      {{ __cluster_healthcheck_pvs.resources | length }}
+    __cluster_healthcheck_pv_available: >-
+      {{ __cluster_healthcheck_pvs.resources
+         | selectattr('status.phase', 'equalto', 'Available')
+         | list
+         | length }}
+
+# Find PersistentVolumeClaims whose phase is Pending.
+# The API server does not accept status.phase as a field selector for
+# PersistentVolumeClaims (only metadata.name and metadata.namespace are
+# supported for this kind), so all claims are listed and filtered here.
+- name: storage_health | Get PersistentVolumeClaims
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: PersistentVolumeClaim
+  register: __cluster_healthcheck_pvcs
+
+# Keeps the same shape as a k8s_info result (a mapping with a `resources`
+# list) so that the tasks reading __cluster_healthcheck_pending_pvcs.resources
+# work unchanged.
+- name: storage_health | Check for PVCs stuck in Pending
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_pending_pvcs:
+      resources: >-
+        {{ __cluster_healthcheck_pvcs.resources
+           | selectattr('status.phase', 'defined')
+           | selectattr('status.phase', 'equalto', 'Pending')
+           | list }}
+
+# Print one line per PersistentVolumeClaim found in the Pending phase.
+- name: storage_health | Report pending PVCs
+  loop: "{{ __cluster_healthcheck_pending_pvcs.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  ansible.builtin.debug:
+    msg: "PVC {{ item.metadata.namespace }}/{{ item.metadata.name }} is stuck in Pending state"
+
+# Merge the storage verdict into __cluster_healthcheck_results under the
+# 'storage_health' key.
+# Overall status:
+#   fail    - no StorageClass exists
+#   warning - no default StorageClass was found, or at least one PVC is Pending
+#   pass    - otherwise
+# 'details' holds one entry (check / status / message) per individual check:
+# StorageClasses Exist, Default StorageClass, CSI Drivers, PV Capacity and
+# Pending PVCs. The CSI Drivers entry can be 'warning' and the PV Capacity
+# entry is always 'pass'; neither of them influences the overall status.
+- name: storage_health | Set storage health result
+ ansible.builtin.set_fact:
+ __cluster_healthcheck_results: >-
+ {{ __cluster_healthcheck_results | combine({
+ 'storage_health': {
+ 'status': ('fail' if __cluster_healthcheck_storage_classes.resources | length == 0
+ else ('warning' if (__cluster_healthcheck_default_sc | length == 0 or
+ __cluster_healthcheck_pending_pvcs.resources | length > 0)
+ else 'pass')),
+ 'details': [
+ { 'check': 'StorageClasses Exist',
+ 'status': ('pass' if __cluster_healthcheck_storage_classes.resources | length > 0 else 'fail'),
+ 'message': (__cluster_healthcheck_storage_classes.resources | length | string +
+ ' StorageClass(es) found') },
+ { 'check': 'Default StorageClass',
+ 'status': ('pass' if __cluster_healthcheck_default_sc | length > 0 else 'warning'),
+ 'message': ('Default StorageClass configured'
+ if __cluster_healthcheck_default_sc | length > 0
+ else 'No default StorageClass set') },
+ { 'check': 'CSI Drivers',
+ 'status': ('pass' if __cluster_healthcheck_csi_drivers.resources | length > 0 else 'warning'),
+ 'message': (__cluster_healthcheck_csi_drivers.resources | length | string +
+ ' CSI driver(s) found') },
+ { 'check': 'PV Capacity',
+ 'status': 'pass',
+ 'message': (__cluster_healthcheck_pv_available | string + '/' +
+ __cluster_healthcheck_pv_total | string + ' PV(s) Available') },
+ { 'check': 'Pending PVCs',
+ 'status': ('warning' if __cluster_healthcheck_pending_pvcs.resources | length > 0 else 'pass'),
+ 'message': ((__cluster_healthcheck_pending_pvcs.resources | length | string) +
+ ' PVC(s) stuck in Pending')
+ if __cluster_healthcheck_pending_pvcs.resources | length > 0
+ else 'No PVCs stuck in Pending' }
+ ]
+ }
+ }) }}
+...
diff --git a/roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2 b/roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2
new file mode 100644
index 0000000..c3ed2ac
--- /dev/null
+++ b/roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2
@@ -0,0 +1,81 @@
+
+
+
+
+ Cluster Healthcheck Report
+
+
+
+ Cluster Healthcheck Report
+{# Timestamp: uses ansible_date_time.iso8601 when it is defined, otherwise the output of `date -u` obtained through the pipe lookup. #}
+ Generated: {{ ansible_date_time.iso8601 | default(lookup('pipe', 'date -u +%Y-%m-%dT%H:%M:%SZ')) }}
+
+ Summary
+
+
+ | Category |
+ Status |
+
+{# Summary: one row per category in __cluster_healthcheck_results; underscores in the key become spaces and the name is title-cased, the status is upper-cased. #}
+{% for category, result in __cluster_healthcheck_results.items() %}
+
+ | {{ category | replace('_', ' ') | title }} |
+ {{ result.status | upper }} |
+
+{% endfor %}
+
+
+{# Details: one section per category whose details list is not empty. Entries that define `check` are rendered as check / status / message; entries that define `name` are rendered as VM rows, with PASS or FAIL taken from the truthiness of `running`. Entries with neither key are not rendered. #}
+{% for category, result in __cluster_healthcheck_results.items() %}
+{% if result.details | length > 0 %}
+ {{ category | replace('_', ' ') | title }}
+
+
+ | Check |
+ Status |
+ Details |
+
+{% for detail in result.details %}
+{% if detail.check is defined %}
+
+ | {{ detail.check }} |
+ {{ detail.status | upper }} |
+ {{ detail.message }} |
+
+{% elif detail.name is defined %}
+
+ | {{ detail.name }} |
+ {{ 'PASS' if detail.running else 'FAIL' }} |
+ Running: {{ detail.running }}, Guest Agent: {{ detail.guest_agent }}, Network: {{ detail.interfaces }} |
+
+{% endif %}
+{% endfor %}
+
+{% endif %}
+{% endfor %}
+
+ Recommendations
+
+{# Recommendations: one line per category with status 'fail' or 'warning'; when no category has either status, a single "all checks passed" line is emitted instead. #}
+{% for category, result in __cluster_healthcheck_results.items() %}
+{% if result.status == 'fail' %}
+ - {{ category | replace('_', ' ') | title }}: Critical issues detected. Investigate and resolve before proceeding with migrations.
+{% elif result.status == 'warning' %}
+ - {{ category | replace('_', ' ') | title }}: Non-critical issues found. Review and address when possible.
+{% endif %}
+{% endfor %}
+{% if __cluster_healthcheck_results.values() | map(attribute='status') | select('in', ['fail', 'warning']) | list | length == 0 %}
+ - All checks passed. The cluster is ready for migration workloads.
+{% endif %}
+
+
+
diff --git a/roles/cluster_healthcheck/tests/inventory b/roles/cluster_healthcheck/tests/inventory
new file mode 100644
index 0000000..2302eda
--- /dev/null
+++ b/roles/cluster_healthcheck/tests/inventory
@@ -0,0 +1 @@
+localhost ansible_connection=local
diff --git a/roles/cluster_healthcheck/tests/test.yml b/roles/cluster_healthcheck/tests/test.yml
new file mode 100644
index 0000000..0db8ed0
--- /dev/null
+++ b/roles/cluster_healthcheck/tests/test.yml
@@ -0,0 +1,8 @@
+---
+# Smoke-test play: applies the cluster_healthcheck role on localhost over a
+# local connection, without gathering facts.
+- name: Test cluster_healthcheck role
+  hosts: localhost
+  gather_facts: false
+  connection: local
+  roles:
+    - cluster_healthcheck
+...
diff --git a/roles/cluster_healthcheck/vars/main.yml b/roles/cluster_healthcheck/vars/main.yml
new file mode 100644
index 0000000..8955cfb
--- /dev/null
+++ b/roles/cluster_healthcheck/vars/main.yml
@@ -0,0 +1,4 @@
+---
+# vars file for cluster_healthcheck
+# Accumulator for healthcheck results; starts empty. The check task files
+# merge their outcome into it with `combine`, keyed by category name (for
+# example storage_health or post_migration_vm). Each entry holds a `status`
+# and a `details` list, which the report template iterates over.
+__cluster_healthcheck_results: {}
+...