From 51d077e5c637fd4e09641ae3b19fbbb91279df56 Mon Sep 17 00:00:00 2001
From: Test User
Date: Fri, 22 May 2026 08:38:51 -0400
Subject: [PATCH] feat(cluster_healthcheck): add cluster health validation role

Adds a cluster_healthcheck role that validates OpenShift cluster health
for virtualization migration readiness across six categories: OCP nodes,
KubeVirt, MTV, storage, network, and post-migration VMs. Generates an
HTML summary report with pass/fail/warning status.

Review feedback addressed:
- Fix CDI pod labels to use app.kubernetes.io/component selectors
- Fix Provider readiness to correctly detect Ready condition status
- Make migration network check conditional on HyperConverged CR config
- Check migration NAD in openshift-cnv namespace, not openshift-mtv
- Drop unrelated scaffolding file changes (CODE_OF_CONDUCT, etc.)
---
 playbooks/cluster_healthcheck.yml             |  10 +
 roles/cluster_healthcheck/README.md           |  70 +++++++
 roles/cluster_healthcheck/defaults/main.yml   |  23 +++
 roles/cluster_healthcheck/meta/main.yml       |  10 +
 .../tasks/kubevirt_health.yml                 | 177 ++++++++++++++++++
 roles/cluster_healthcheck/tasks/main.yml      |  40 ++++
 .../cluster_healthcheck/tasks/mtv_health.yml  | 124 ++++++++++++
 .../tasks/network_health.yml                  | 122 ++++++++++++
 .../tasks/ocp_node_health.yml                 | 122 ++++++++++++
 .../tasks/post_migration_vm.yml               |  81 ++++++++
 roles/cluster_healthcheck/tasks/report.yml    |  22 +++
 .../tasks/storage_health.yml                  | 109 +++++++++++
 .../cluster_healthcheck_report.html.j2        |  81 ++++++++
 roles/cluster_healthcheck/tests/inventory     |   1 +
 roles/cluster_healthcheck/tests/test.yml      |   8 +
 roles/cluster_healthcheck/vars/main.yml       |   4 +
 16 files changed, 1004 insertions(+)
 create mode 100644 playbooks/cluster_healthcheck.yml
 create mode 100644 roles/cluster_healthcheck/README.md
 create mode 100644 roles/cluster_healthcheck/defaults/main.yml
 create mode 100644 roles/cluster_healthcheck/meta/main.yml
 create mode 100644 roles/cluster_healthcheck/tasks/kubevirt_health.yml
 create mode 100644 roles/cluster_healthcheck/tasks/main.yml
 create mode 100644 roles/cluster_healthcheck/tasks/mtv_health.yml
 create mode 100644 roles/cluster_healthcheck/tasks/network_health.yml
 create mode 100644 roles/cluster_healthcheck/tasks/ocp_node_health.yml
 create mode 100644 roles/cluster_healthcheck/tasks/post_migration_vm.yml
 create mode 100644 roles/cluster_healthcheck/tasks/report.yml
 create mode 100644 roles/cluster_healthcheck/tasks/storage_health.yml
 create mode 100644 roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2
 create mode 100644 roles/cluster_healthcheck/tests/inventory
 create mode 100644 roles/cluster_healthcheck/tests/test.yml
 create mode 100644 roles/cluster_healthcheck/vars/main.yml

diff --git a/playbooks/cluster_healthcheck.yml b/playbooks/cluster_healthcheck.yml
new file mode 100644
index 0000000..dafcabb
--- /dev/null
+++ b/playbooks/cluster_healthcheck.yml
@@ -0,0 +1,10 @@
+---
+- name: Run cluster healthchecks
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  tasks:
+    - name: Include cluster_healthcheck role
+      ansible.builtin.import_role:
+        name: infra.openshift_virtualization_migration.cluster_healthcheck
+...
diff --git a/roles/cluster_healthcheck/README.md b/roles/cluster_healthcheck/README.md
new file mode 100644
index 0000000..d12ae0d
--- /dev/null
+++ b/roles/cluster_healthcheck/README.md
@@ -0,0 +1,70 @@
+# cluster_healthcheck
+
+```
+Role belongs to infra/openshift_virtualization_migration
+Namespace - infra
+Collection - openshift_virtualization_migration
+```
+
+Description: Cluster health validation for OpenShift Virtualization migration environments.
+
+## Requirements
+
+- OpenShift cluster with `kubeconfig` configured
+- `kubernetes.core` collection installed
+- OpenShift Virtualization (CNV) operator installed
+- Migration Toolkit for Virtualization (MTV) operator installed
+
+## Role Variables
+
+### Defaults
+
+| Variable | Type | Default | Description |
+|----------|------|---------|-------------|
+| `cluster_healthcheck_checks` | list | See defaults/main.yml | List of health checks to run |
+| `cluster_healthcheck_post_migration_vms` | list | `[]` | VMs to check post-migration |
+| `cluster_healthcheck_generate_report` | bool | `true` | Generate HTML report |
+| `cluster_healthcheck_report_path` | str | `/tmp/cluster_healthcheck_report.html` | Report output path |
+| `cluster_healthcheck_mtv_namespace` | str | `openshift-mtv` | MTV operator namespace |
+| `cluster_healthcheck_kubevirt_namespace` | str | `openshift-cnv` | KubeVirt operator namespace |
+| `cluster_healthcheck_ssh_timeout` | int | `10` | SSH check timeout in seconds |
+| `cluster_healthcheck_debug` | bool | `false` | Enable verbose debug output |
+
+### Post-Migration VM Format
+
+```yaml
+cluster_healthcheck_post_migration_vms:
+  - name: my-vm
+    namespace: my-namespace
+    check_ssh: true  # optional, default false
+```
+
+## Health Checks
+
+| Check | Description |
+|-------|-------------|
+| `ocp_node_health` | Node Ready status, resource pressure, kubevirt.io/schedulable label |
+| `kubevirt_health` | HyperConverged CR, virt-* pods, CDI operator |
+| `mtv_health` | ForkliftController, MTV pods, Providers, Plans |
+| `storage_health` | StorageClasses, CSI drivers, PV capacity, pending PVCs |
+| `network_health` | Multus, NADs, OVN/SDN health, migration network |
+| `post_migration_vm` | VMI Running phase, guest agent info, interface IPs, optional SSH port check; runs only when `cluster_healthcheck_post_migration_vms` is non-empty |
+
+## Example Playbook
+
+```yaml
+- name: Run cluster healthchecks
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  roles:
+    - role: infra.openshift_virtualization_migration.cluster_healthcheck
+      vars:
+        cluster_healthcheck_post_migration_vms:
+          - name: rhel9-vm
+            namespace: migration-target
+```
+
+## License
+
+GPL-3.0-only
diff --git a/roles/cluster_healthcheck/defaults/main.yml b/roles/cluster_healthcheck/defaults/main.yml
new file mode 100644
index 0000000..f8b58e9
--- /dev/null
+++ b/roles/cluster_healthcheck/defaults/main.yml
@@ -0,0 +1,23 @@
+---
+# defaults file for cluster_healthcheck
+cluster_healthcheck_checks:
+  - ocp_node_health
+  - kubevirt_health
+  - mtv_health
+  - storage_health
+  - network_health
+
+cluster_healthcheck_post_migration_vms: []
+
+cluster_healthcheck_generate_report: true
+
+cluster_healthcheck_report_path: "/tmp/cluster_healthcheck_report.html"
+
+cluster_healthcheck_mtv_namespace: "openshift-mtv"
+
+cluster_healthcheck_kubevirt_namespace: "openshift-cnv"
+
+cluster_healthcheck_ssh_timeout: 10
+
+cluster_healthcheck_debug: false
+...
diff --git a/roles/cluster_healthcheck/meta/main.yml b/roles/cluster_healthcheck/meta/main.yml
new file mode 100644
index 0000000..7f4bf14
--- /dev/null
+++ b/roles/cluster_healthcheck/meta/main.yml
@@ -0,0 +1,10 @@
+---
+galaxy_info:
+  author: ""
+  description: Cluster health validation for OpenShift Virtualization migration environments.
+  company: Red Hat
+  license: GPL-3.0-only
+  min_ansible_version: 2.15.0
+  galaxy_tags: []
+dependencies: []
+...
diff --git a/roles/cluster_healthcheck/tasks/kubevirt_health.yml b/roles/cluster_healthcheck/tasks/kubevirt_health.yml
new file mode 100644
index 0000000..d8d4df0
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/kubevirt_health.yml
@@ -0,0 +1,188 @@
+---
+# Validates OpenShift Virtualization: HyperConverged CR conditions plus the
+# virt-* and CDI pods in cluster_healthcheck_kubevirt_namespace. The outcome
+# is merged into __cluster_healthcheck_results under 'kubevirt_health'.
+- name: kubevirt_health | Get HyperConverged CR status
+  kubernetes.core.k8s_info:
+    api_version: hco.kubevirt.io/v1beta1
+    kind: HyperConverged
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+  register: __cluster_healthcheck_hco
+
+# A missing CR or a missing condition evaluates to False for both facts.
+- name: kubevirt_health | Evaluate HyperConverged conditions
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_hco_available: >-
+      {{ __cluster_healthcheck_hco.resources | length > 0 and
+         __cluster_healthcheck_hco.resources[0].status.conditions | default([])
+         | selectattr('type', 'equalto', 'Available')
+         | map(attribute='status')
+         | first | default('False') == 'True' }}
+    __cluster_healthcheck_hco_degraded: >-
+      {{ __cluster_healthcheck_hco.resources | length > 0 and
+         __cluster_healthcheck_hco.resources[0].status.conditions | default([])
+         | selectattr('type', 'equalto', 'Degraded')
+         | map(attribute='status')
+         | first | default('False') == 'True' }}
+
+- name: kubevirt_health | Report HyperConverged status
+  ansible.builtin.debug:
+    msg: >-
+      HyperConverged CR -
+      Available: {{ __cluster_healthcheck_hco_available }},
+      Degraded: {{ __cluster_healthcheck_hco_degraded }}
+
+- name: kubevirt_health | Check virt-operator pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "kubevirt.io=virt-operator"
+  register: __cluster_healthcheck_virt_operator_pods
+
+- name: kubevirt_health | Check virt-controller pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "kubevirt.io=virt-controller"
+  register: __cluster_healthcheck_virt_controller_pods
+
+- name: kubevirt_health | Check virt-handler pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "kubevirt.io=virt-handler"
+  register: __cluster_healthcheck_virt_handler_pods
+
+- name: kubevirt_health | Check virt-api pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "kubevirt.io=virt-api"
+  register: __cluster_healthcheck_virt_api_pods
+
+- name: kubevirt_health | Evaluate KubeVirt pod health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_kubevirt_pods:
+      virt_operator:
+        running: "{{ __cluster_healthcheck_virt_operator_pods.resources
+                   | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_virt_operator_pods.resources | length }}"
+      virt_controller:
+        running: "{{ __cluster_healthcheck_virt_controller_pods.resources
+                   | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_virt_controller_pods.resources | length }}"
+      virt_handler:
+        running: "{{ __cluster_healthcheck_virt_handler_pods.resources
+                   | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_virt_handler_pods.resources | length }}"
+      virt_api:
+        running: "{{ __cluster_healthcheck_virt_api_pods.resources
+                   | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_virt_api_pods.resources | length }}"
+
+- name: kubevirt_health | Check CDI operator pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "app.kubernetes.io/component=cdi-operator"
+  register: __cluster_healthcheck_cdi_operator_pods
+
+- name: kubevirt_health | Check CDI deployment pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+    label_selectors:
+      - "app.kubernetes.io/component=cdi-deployment"
+  register: __cluster_healthcheck_cdi_deployment_pods
+
+- name: kubevirt_health | Evaluate CDI health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_cdi_pods:
+      cdi_operator:
+        running: "{{ __cluster_healthcheck_cdi_operator_pods.resources
+                   | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_cdi_operator_pods.resources | length }}"
+      cdi_deployment:
+        running: "{{ __cluster_healthcheck_cdi_deployment_pods.resources
+                   | selectattr('status.phase', 'equalto', 'Running') | list | length }}"
+        total: "{{ __cluster_healthcheck_cdi_deployment_pods.resources | length }}"
+
+# The overall status fails on exactly the conditions that mark an individual
+# detail below as 'fail' (no Running pod for a component), so the category
+# can no longer report 'pass' while one of its own details reports 'fail'.
+- name: kubevirt_health | Set kubevirt health result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+           'kubevirt_health': {
+             'status': ('fail' if (not __cluster_healthcheck_hco_available or
+                                   __cluster_healthcheck_hco_degraded or
+                                   __cluster_healthcheck_kubevirt_pods.virt_operator.running | int == 0 or
+                                   __cluster_healthcheck_kubevirt_pods.virt_controller.running | int == 0 or
+                                   __cluster_healthcheck_kubevirt_pods.virt_handler.running | int == 0 or
+                                   __cluster_healthcheck_kubevirt_pods.virt_api.running | int == 0 or
+                                   __cluster_healthcheck_cdi_pods.cdi_operator.running | int == 0 or
+                                   __cluster_healthcheck_cdi_pods.cdi_deployment.running | int == 0)
+                        else 'pass'),
+             'details': [
+               { 'check': 'HyperConverged Available',
+                 'status': ('pass' if __cluster_healthcheck_hco_available else 'fail'),
+                 'message': ('HyperConverged CR is Available'
+                             if __cluster_healthcheck_hco_available
+                             else 'HyperConverged CR is NOT Available') },
+               { 'check': 'HyperConverged Not Degraded',
+                 'status': ('fail' if __cluster_healthcheck_hco_degraded else 'pass'),
+                 'message': ('HyperConverged CR is Degraded'
+                             if __cluster_healthcheck_hco_degraded
+                             else 'HyperConverged CR is not Degraded') },
+               { 'check': 'virt-operator',
+                 'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_operator.running | int > 0
+                            else 'fail'),
+                 'message': (__cluster_healthcheck_kubevirt_pods.virt_operator.running | string
+                             + '/' + __cluster_healthcheck_kubevirt_pods.virt_operator.total | string
+                             + ' pods Running') },
+               { 'check': 'virt-controller',
+                 'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_controller.running | int > 0
+                            else 'fail'),
+                 'message': (__cluster_healthcheck_kubevirt_pods.virt_controller.running | string
+                             + '/' + __cluster_healthcheck_kubevirt_pods.virt_controller.total | string
+                             + ' pods Running') },
+               { 'check': 'virt-handler',
+                 'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_handler.running | int > 0
+                            else 'fail'),
+                 'message': (__cluster_healthcheck_kubevirt_pods.virt_handler.running | string
+                             + '/' + __cluster_healthcheck_kubevirt_pods.virt_handler.total | string
+                             + ' pods Running') },
+               { 'check': 'virt-api',
+                 'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_api.running | int > 0
+                            else 'fail'),
+                 'message': (__cluster_healthcheck_kubevirt_pods.virt_api.running | string
+                             + '/' + __cluster_healthcheck_kubevirt_pods.virt_api.total | string
+                             + ' pods Running') },
+               { 'check': 'CDI Operator',
+                 'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_operator.running | int > 0
+                            else 'fail'),
+                 'message': (__cluster_healthcheck_cdi_pods.cdi_operator.running | string
+                             + '/' + __cluster_healthcheck_cdi_pods.cdi_operator.total | string
+                             + ' pods Running') },
+               { 'check': 'CDI Deployment',
+                 'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_deployment.running | int > 0
+                            else 'fail'),
+                 'message': (__cluster_healthcheck_cdi_pods.cdi_deployment.running | string
+                             + '/' + __cluster_healthcheck_cdi_pods.cdi_deployment.total | string
+                             + ' pods Running') }
+             ]
+           }
+         }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/main.yml b/roles/cluster_healthcheck/tasks/main.yml
new file mode 100644
index 0000000..3bb1475
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/main.yml
@@ -0,0 +1,46 @@
+---
+# tasks file for cluster_healthcheck
+#
+# Seeds every category as "skipped", runs the check files selected through
+# cluster_healthcheck_checks (plus the post-migration VM checks when VMs are
+# listed), then optionally renders the report.
+- name: Initialize healthcheck results
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results:
+      ocp_node_health: { status: "skipped", details: [] }
+      kubevirt_health: { status: "skipped", details: [] }
+      mtv_health: { status: "skipped", details: [] }
+      storage_health: { status: "skipped", details: [] }
+      network_health: { status: "skipped", details: [] }
+      post_migration_vm: { status: "skipped", details: [] }
+
+- name: Include ocp_node_health tasks
+  ansible.builtin.include_tasks: ocp_node_health.yml
+  when: "'ocp_node_health' in cluster_healthcheck_checks"
+
+- name: Include kubevirt_health tasks
+  ansible.builtin.include_tasks: kubevirt_health.yml
+  when: "'kubevirt_health' in cluster_healthcheck_checks"
+
+- name: Include mtv_health tasks
+  ansible.builtin.include_tasks: mtv_health.yml
+  when: "'mtv_health' in cluster_healthcheck_checks"
+
+- name: Include storage_health tasks
+  ansible.builtin.include_tasks: storage_health.yml
+  when: "'storage_health' in cluster_healthcheck_checks"
+
+- name: Include network_health tasks
+  ansible.builtin.include_tasks: network_health.yml
+  when: "'network_health' in cluster_healthcheck_checks"
+
+- name: Include post_migration_vm tasks
+  ansible.builtin.include_tasks: post_migration_vm.yml
+  when: "cluster_healthcheck_post_migration_vms | length > 0"
+
+# Coerce with `| bool` so a string such as "false" passed via `-e key=value`
+# extra vars disables the report instead of being treated as truthy.
+- name: Include report tasks
+  ansible.builtin.include_tasks: report.yml
+  when: cluster_healthcheck_generate_report | bool
+...
diff --git a/roles/cluster_healthcheck/tasks/mtv_health.yml b/roles/cluster_healthcheck/tasks/mtv_health.yml
new file mode 100644
index 0000000..4aa5235
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/mtv_health.yml
@@ -0,0 +1,134 @@
+---
+# Validates the Migration Toolkit for Virtualization: ForkliftController CR,
+# operator pods, Provider readiness and failed Plans, all looked up in
+# cluster_healthcheck_mtv_namespace. Result key: 'mtv_health'.
+- name: mtv_health | Check ForkliftController CR
+  kubernetes.core.k8s_info:
+    api_version: forklift.konveyor.io/v1beta1
+    kind: ForkliftController
+    namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+  register: __cluster_healthcheck_forklift_controller
+
+- name: mtv_health | Evaluate ForkliftController health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_forklift_healthy: >-
+      {{ __cluster_healthcheck_forklift_controller.resources | length > 0 and
+         __cluster_healthcheck_forklift_controller.resources[0].status.conditions | default([])
+         | selectattr('type', 'equalto', 'Successful')
+         | map(attribute='status')
+         | first | default('False') == 'True' }}
+
+- name: mtv_health | Report ForkliftController status
+  ansible.builtin.debug:
+    msg: >-
+      ForkliftController -
+      Found: {{ __cluster_healthcheck_forklift_controller.resources | length > 0 }},
+      Healthy: {{ __cluster_healthcheck_forklift_healthy }}
+
+- name: mtv_health | Check MTV operator pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+    label_selectors:
+      - "app=forklift"
+  register: __cluster_healthcheck_mtv_pods
+
+- name: mtv_health | Evaluate MTV operator pod status
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_mtv_pods_running: >-
+      {{ __cluster_healthcheck_mtv_pods.resources
+         | selectattr('status.phase', 'equalto', 'Running')
+         | list | length }}
+    __cluster_healthcheck_mtv_pods_total: >-
+      {{ __cluster_healthcheck_mtv_pods.resources | length }}
+
+- name: mtv_health | Check Provider CRs
+  kubernetes.core.k8s_info:
+    api_version: forklift.konveyor.io/v1beta1
+    kind: Provider
+    namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+  register: __cluster_healthcheck_providers
+
+- name: mtv_health | Evaluate Provider readiness
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_providers_not_ready: >-
+      {% set not_ready = [] -%}
+      {% for provider in __cluster_healthcheck_providers.resources -%}
+      {% set conditions = provider.status.conditions | default([]) -%}
+      {% set ready = conditions | selectattr('type', 'equalto', 'Ready')
+         | map(attribute='status') | first | default('False') -%}
+      {% if ready != 'True' -%}
+      {% if not_ready.append(provider.metadata.name) -%}{% endif -%}
+      {% endif -%}
+      {% endfor -%}
+      {{ not_ready }}
+
+- name: mtv_health | Check for failed migration Plans
+  kubernetes.core.k8s_info:
+    api_version: forklift.konveyor.io/v1beta1
+    kind: Plan
+    namespace: "{{ cluster_healthcheck_mtv_namespace }}"
+  register: __cluster_healthcheck_plans
+
+# Match on the condition's type and status fields only. A `contains` test
+# against a two-key dict needs whole-dict equality, so it can never match a
+# condition that carries additional keys.
+- name: mtv_health | Evaluate failed Plans
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_failed_plans: >-
+      {% set failed = [] -%}
+      {% for plan in __cluster_healthcheck_plans.resources -%}
+      {% set failed_conditions = (plan.status | default({})).conditions | default([])
+         | selectattr('type', 'equalto', 'Failed')
+         | selectattr('status', 'equalto', 'True') | list -%}
+      {% if failed_conditions | length > 0 -%}
+      {% if failed.append(plan.metadata.name) -%}{% endif -%}
+      {% endif -%}
+      {% endfor -%}
+      {{ failed }}
+
+- name: mtv_health | Report failed Plans
+  ansible.builtin.debug:
+    msg: "Plan {{ item }} is in Failed state"
+  loop: "{{ __cluster_healthcheck_failed_plans }}"
+
+- name: mtv_health | Set MTV health result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+           'mtv_health': {
+             'status': ('fail' if (__cluster_healthcheck_forklift_controller.resources | length == 0 or
+                                   not __cluster_healthcheck_forklift_healthy or
+                                   __cluster_healthcheck_mtv_pods_running | int == 0)
+                        else ('warning' if (__cluster_healthcheck_providers_not_ready | length > 0 or
+                                            __cluster_healthcheck_failed_plans | length > 0)
+                        else 'pass')),
+             'details': [
+               { 'check': 'ForkliftController',
+                 'status': ('pass' if (__cluster_healthcheck_forklift_controller.resources | length > 0 and
+                                       __cluster_healthcheck_forklift_healthy) else 'fail'),
+                 'message': ('ForkliftController is healthy'
+                             if __cluster_healthcheck_forklift_healthy
+                             else 'ForkliftController is NOT healthy or missing') },
+               { 'check': 'MTV Operator Pods',
+                 'status': ('pass' if __cluster_healthcheck_mtv_pods_running | int > 0 else 'fail'),
+                 'message': (__cluster_healthcheck_mtv_pods_running | string + '/' +
+                             __cluster_healthcheck_mtv_pods_total | string + ' pods Running') },
+               { 'check': 'Providers Ready',
+                 'status': ('warning' if __cluster_healthcheck_providers_not_ready | length > 0 else 'pass'),
+                 'message': ((__cluster_healthcheck_providers_not_ready | length | string) +
+                             ' provider(s) not Ready: ' +
+                             __cluster_healthcheck_providers_not_ready | join(', '))
+                            if __cluster_healthcheck_providers_not_ready | length > 0
+                            else 'All providers Ready' },
+               { 'check': 'Failed Plans',
+                 'status': ('warning' if __cluster_healthcheck_failed_plans | length > 0 else 'pass'),
+                 'message': ((__cluster_healthcheck_failed_plans | length | string) +
+                             ' plan(s) in Failed state')
+                            if __cluster_healthcheck_failed_plans | length > 0
+                            else 'No failed plans' }
+             ]
+           }
+         }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/network_health.yml b/roles/cluster_healthcheck/tasks/network_health.yml
new file mode 100644
index 0000000..ca2aead
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/network_health.yml
@@ -0,0 +1,140 @@
+---
+# Validates cluster networking: Multus pods, NetworkAttachmentDefinitions,
+# OVN-Kubernetes (or OpenShiftSDN) pods and, when configured in the
+# HyperConverged CR, the live-migration network NAD. Result key: 'network_health'.
+- name: network_health | Check Multus pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: openshift-multus
+    label_selectors:
+      - "app=multus"
+  register: __cluster_healthcheck_multus_pods
+
+- name: network_health | Evaluate Multus pod health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_multus_running: >-
+      {{ __cluster_healthcheck_multus_pods.resources
+         | selectattr('status.phase', 'equalto', 'Running')
+         | list | length }}
+    __cluster_healthcheck_multus_total: >-
+      {{ __cluster_healthcheck_multus_pods.resources | length }}
+
+- name: network_health | List NetworkAttachmentDefinitions
+  kubernetes.core.k8s_info:
+    api_version: k8s.cni.cncf.io/v1
+    kind: NetworkAttachmentDefinition
+  register: __cluster_healthcheck_nad
+
+- name: network_health | Report NetworkAttachmentDefinitions
+  ansible.builtin.debug:
+    msg: >-
+      NetworkAttachmentDefinition: {{ item.metadata.namespace }}/{{ item.metadata.name }}
+  loop: "{{ __cluster_healthcheck_nad.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  when: cluster_healthcheck_debug | bool
+
+- name: network_health | Check OVN-Kubernetes pods
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: openshift-ovn-kubernetes
+    label_selectors:
+      - "app=ovnkube-node"
+  register: __cluster_healthcheck_ovn_pods
+
+- name: network_health | Check OpenShiftSDN pods as fallback
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    namespace: openshift-sdn
+    label_selectors:
+      - "app=sdn"
+  register: __cluster_healthcheck_sdn_pods
+  when: __cluster_healthcheck_ovn_pods.resources | length == 0
+
+- name: network_health | Evaluate SDN health
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_sdn_type: >-
+      {{ 'ovn-kubernetes' if __cluster_healthcheck_ovn_pods.resources | length > 0
+         else ('openshift-sdn' if (__cluster_healthcheck_sdn_pods.resources | default([]) | length > 0)
+         else 'unknown') }}
+    __cluster_healthcheck_sdn_running: >-
+      {{ (__cluster_healthcheck_ovn_pods.resources
+          | selectattr('status.phase', 'equalto', 'Running')
+          | list | length) if __cluster_healthcheck_ovn_pods.resources | length > 0
+         else (__cluster_healthcheck_sdn_pods.resources | default([])
+               | selectattr('status.phase', 'equalto', 'Running')
+               | list | length) }}
+
+- name: network_health | Get HyperConverged CR for migration network config
+  kubernetes.core.k8s_info:
+    api_version: hco.kubevirt.io/v1beta1
+    kind: HyperConverged
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+  register: __cluster_healthcheck_hco_network
+
+# Always (re)set the fact, falling back to '' when no HyperConverged CR is
+# found, so a value left over from an earlier run in the same play is not reused.
+- name: network_health | Extract configured migration network
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_migration_network: >-
+      {{ __cluster_healthcheck_hco_network.resources[0].spec.liveMigrationConfig.network
+         | default('')
+         if __cluster_healthcheck_hco_network.resources | length > 0
+         else '' }}
+
+# Look the NAD up by the configured name; listing every NAD in the namespace
+# would let any unrelated NAD satisfy the check.
+- name: network_health | Check migration network NAD
+  kubernetes.core.k8s_info:
+    api_version: k8s.cni.cncf.io/v1
+    kind: NetworkAttachmentDefinition
+    name: "{{ __cluster_healthcheck_migration_network }}"
+    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
+  register: __cluster_healthcheck_migration_nad
+  when: __cluster_healthcheck_migration_network | default('') | length > 0
+
+# fail: no SDN detected or none of its pods Running.
+# warning: no Running Multus pod, or a migration network is configured but its
+# NAD was not found. These mirror the per-check statuses in 'details'.
+- name: network_health | Set network health result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+           'network_health': {
+             'status': ('fail' if (__cluster_healthcheck_sdn_type == 'unknown' or
+                                   __cluster_healthcheck_sdn_running | int == 0)
+                        else ('warning' if (__cluster_healthcheck_multus_running | int == 0 or
+                                            (__cluster_healthcheck_migration_network | default('') | length > 0 and
+                                             __cluster_healthcheck_migration_nad.resources | default([]) | length == 0))
+                        else 'pass')),
+             'details': [
+               { 'check': 'Multus Pods',
+                 'status': ('pass' if __cluster_healthcheck_multus_running | int > 0 else 'warning'),
+                 'message': (__cluster_healthcheck_multus_running | string + '/' +
+                             __cluster_healthcheck_multus_total | string + ' Multus pods Running') },
+               { 'check': 'SDN Type',
+                 'status': ('pass' if __cluster_healthcheck_sdn_type != 'unknown' else 'fail'),
+                 'message': ('SDN: ' + __cluster_healthcheck_sdn_type) },
+               { 'check': 'SDN Pods',
+                 'status': ('pass' if __cluster_healthcheck_sdn_running | int > 0 else 'fail'),
+                 'message': (__cluster_healthcheck_sdn_running | string + ' SDN pod(s) Running') },
+               { 'check': 'NetworkAttachmentDefinitions',
+                 'status': 'pass',
+                 'message': (__cluster_healthcheck_nad.resources | length | string +
+                             ' NAD(s) found across cluster') },
+               { 'check': 'Migration Network',
+                 'status': ('pass' if (__cluster_healthcheck_migration_network | default('') | length == 0 or
+                                       (__cluster_healthcheck_migration_nad.resources | default([]) | length > 0))
+                            else 'warning'),
+                 'message': ('No dedicated migration network configured in HyperConverged CR'
+                             if __cluster_healthcheck_migration_network | default('') | length == 0
+                             else ((__cluster_healthcheck_migration_nad.resources | default([]) | length | string) +
+                                   ' NAD(s) in ' + cluster_healthcheck_kubevirt_namespace +
+                                   ' for migration network ' + __cluster_healthcheck_migration_network)) }
+             ]
+           }
+         }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/ocp_node_health.yml b/roles/cluster_healthcheck/tasks/ocp_node_health.yml
new file mode 100644
index 0000000..0bf99b7
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/ocp_node_health.yml
@@ -0,0 +1,141 @@
+---
+# Validates cluster nodes: Ready condition, Memory/Disk/PID pressure,
+# allocatable vs capacity figures (debug only) and the kubevirt.io/schedulable
+# label on worker nodes. Result key: 'ocp_node_health'.
+- name: ocp_node_health | Get all cluster nodes
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Node
+  register: __cluster_healthcheck_nodes
+
+# set_fact values persist for the rest of the play, so clear the lists that
+# the looped tasks below append to; otherwise a second run of this role in
+# the same play would report stale, duplicated entries.
+- name: ocp_node_health | Reset per-run accumulators
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_pressure_nodes: []
+    __cluster_healthcheck_capacity_info: []
+
+# Each node is evaluated on its own, so names and conditions cannot get
+# misaligned; a node without a Ready condition counts as not Ready.
+- name: ocp_node_health | Evaluate node Ready status
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_nodes_not_ready: >-
+      {% set not_ready = [] -%}
+      {% for node in __cluster_healthcheck_nodes.resources -%}
+      {% set ready = (node.status | default({})).conditions | default([])
+         | selectattr('type', 'equalto', 'Ready')
+         | map(attribute='status') | first | default('False') -%}
+      {% if ready != 'True' -%}
+      {% if not_ready.append(node.metadata.name) -%}{% endif -%}
+      {% endif -%}
+      {% endfor -%}
+      {{ not_ready }}
+
+- name: ocp_node_health | Report nodes not Ready
+  ansible.builtin.debug:
+    msg: "Node {{ item }} is NOT Ready"
+  loop: "{{ __cluster_healthcheck_nodes_not_ready }}"
+
+- name: ocp_node_health | Check for resource pressure conditions
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_pressure_nodes: >-
+      {{ __cluster_healthcheck_pressure_nodes | default([]) +
+         [{ 'name': item.metadata.name,
+            'pressures': item.status.conditions
+              | selectattr('type', 'in', ['MemoryPressure', 'DiskPressure', 'PIDPressure'])
+              | selectattr('status', 'equalto', 'True')
+              | map(attribute='type')
+              | list }] }}
+  loop: "{{ __cluster_healthcheck_nodes.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  when: >-
+    item.status.conditions
+    | selectattr('type', 'in', ['MemoryPressure', 'DiskPressure', 'PIDPressure'])
+    | selectattr('status', 'equalto', 'True')
+    | list | length > 0
+
+- name: ocp_node_health | Report nodes with resource pressure
+  ansible.builtin.debug:
+    msg: "Node {{ item.name }} has pressure conditions: {{ item.pressures | join(', ') }}"
+  loop: "{{ __cluster_healthcheck_pressure_nodes | default([]) }}"
+  loop_control:
+    label: "{{ item.name }}"
+
+- name: ocp_node_health | Check allocatable vs capacity ratios
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_capacity_info: >-
+      {{ __cluster_healthcheck_capacity_info | default([]) +
+         [{ 'name': item.metadata.name,
+            'cpu_allocatable': item.status.allocatable.cpu | default('0'),
+            'cpu_capacity': item.status.capacity.cpu | default('0'),
+            'memory_allocatable': item.status.allocatable.memory | default('0Ki'),
+            'memory_capacity': item.status.capacity.memory | default('0Ki') }] }}
+  loop: "{{ __cluster_healthcheck_nodes.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+
+- name: ocp_node_health | Display capacity information
+  ansible.builtin.debug:
+    msg: >-
+      Node {{ item.name }} -
+      CPU: {{ item.cpu_allocatable }}/{{ item.cpu_capacity }},
+      Memory: {{ item.memory_allocatable }}/{{ item.memory_capacity }}
+  loop: "{{ __cluster_healthcheck_capacity_info | default([]) }}"
+  loop_control:
+    label: "{{ item.name }}"
+  when: cluster_healthcheck_debug | bool
+
+# Label keys contain dots, and selectattr splits its attribute path on every
+# dot, so 'metadata.labels.node-role.kubernetes.io/worker' can never resolve.
+# Test key membership in the labels mapping instead.
+- name: ocp_node_health | Verify worker nodes have kubevirt.io/schedulable label
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_workers_not_schedulable: >-
+      {% set missing = [] -%}
+      {% for node in __cluster_healthcheck_nodes.resources -%}
+      {% set labels = node.metadata.labels | default({}) -%}
+      {% if 'node-role.kubernetes.io/worker' in labels and
+            'kubevirt.io/schedulable' not in labels -%}
+      {% if missing.append(node.metadata.name) -%}{% endif -%}
+      {% endif -%}
+      {% endfor -%}
+      {{ missing }}
+
+- name: ocp_node_health | Report workers missing kubevirt.io/schedulable label
+  ansible.builtin.debug:
+    msg: "Worker node {{ item }} is missing the kubevirt.io/schedulable label"
+  loop: "{{ __cluster_healthcheck_workers_not_schedulable }}"
+
+- name: ocp_node_health | Set node health result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+           'ocp_node_health': {
+             'status': ('fail' if (__cluster_healthcheck_nodes_not_ready | length > 0 or
+                                   __cluster_healthcheck_pressure_nodes | default([]) | length > 0)
+                        else ('warning' if __cluster_healthcheck_workers_not_schedulable | length > 0
+                        else 'pass')),
+             'details': [
+               { 'check': 'Nodes Ready',
+                 'status': ('fail' if __cluster_healthcheck_nodes_not_ready | length > 0 else 'pass'),
+                 'message': ((__cluster_healthcheck_nodes_not_ready | length | string) + ' node(s) not Ready')
+                            if __cluster_healthcheck_nodes_not_ready | length > 0
+                            else 'All nodes Ready' },
+               { 'check': 'Resource Pressure',
+                 'status': ('fail' if __cluster_healthcheck_pressure_nodes | default([]) | length > 0 else 'pass'),
+                 'message': ((__cluster_healthcheck_pressure_nodes | default([]) | length | string) +
+                             ' node(s) with resource pressure')
+                            if __cluster_healthcheck_pressure_nodes | default([]) | length > 0
+                            else 'No resource pressure detected' },
+               { 'check': 'KubeVirt Schedulable',
+                 'status': ('warning' if __cluster_healthcheck_workers_not_schedulable | length > 0 else 'pass'),
+                 'message': ((__cluster_healthcheck_workers_not_schedulable | length | string) +
+                             ' worker(s) missing kubevirt.io/schedulable label')
+                            if __cluster_healthcheck_workers_not_schedulable | length > 0
+                            else 'All workers have kubevirt.io/schedulable label' }
+             ]
+           }
+         }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/post_migration_vm.yml b/roles/cluster_healthcheck/tasks/post_migration_vm.yml
new file mode 100644
index 0000000..61728d8
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/post_migration_vm.yml
@@ -0,0 +1,104 @@
+---
+# Verifies each VM listed in cluster_healthcheck_post_migration_vms: VMI phase,
+# guest agent info, interface IPs and (optionally) TCP reachability of port 22.
+# The outcome is merged into __cluster_healthcheck_results.post_migration_vm.
+- name: post_migration_vm | Check VirtualMachineInstance status
+  kubernetes.core.k8s_info:
+    api_version: kubevirt.io/v1
+    kind: VirtualMachineInstance
+    name: "{{ __cluster_healthcheck_vm.name }}"
+    namespace: "{{ __cluster_healthcheck_vm.namespace }}"
+  register: __cluster_healthcheck_vmi
+  loop: "{{ cluster_healthcheck_post_migration_vms }}"
+  loop_control:
+    loop_var: __cluster_healthcheck_vm
+    label: "{{ __cluster_healthcheck_vm.namespace }}/{{ __cluster_healthcheck_vm.name }}"
+
+# set_fact values outlive this file, so start from an empty list; otherwise a
+# second run of the role in the same play would append to stale entries.
+- name: post_migration_vm | Reset VM result accumulator
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_vm_results: []
+
+# NOTE(review): entries use name/running/guest_agent/interfaces keys, unlike
+# the check/status/message entries built by the other check files - confirm
+# the report template handles this shape.
+- name: post_migration_vm | Evaluate VM status
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_vm_results: >-
+      {{ __cluster_healthcheck_vm_results | default([]) + [{
+           'name': __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.namespace + '/' +
+                   __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.name,
+           'running': (__cluster_healthcheck_vmi_item.resources | length > 0 and
+                       __cluster_healthcheck_vmi_item.resources[0].status.phase | default('') == 'Running'),
+           'guest_agent': (__cluster_healthcheck_vmi_item.resources | length > 0 and
+                           __cluster_healthcheck_vmi_item.resources[0].status.guestOSInfo
+                           | default({}) | length > 0),
+           'interfaces': (__cluster_healthcheck_vmi_item.resources | length > 0 and
+                          __cluster_healthcheck_vmi_item.resources[0].status.interfaces | default([])
+                          | selectattr('ipAddress', 'defined')
+                          | list | length > 0)
+         }] }}
+  loop: "{{ __cluster_healthcheck_vmi.results }}"
+  loop_control:
+    loop_var: __cluster_healthcheck_vmi_item
+    label: >-
+      {{ __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.namespace }}/{{
+         __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.name }}
+
+- name: post_migration_vm | Report VM status
+  ansible.builtin.debug:
+    msg: >-
+      VM {{ item.name }} -
+      Running: {{ item.running }},
+      Guest Agent: {{ item.guest_agent }},
+      Network: {{ item.interfaces }}
+  loop: "{{ __cluster_healthcheck_vm_results | default([]) }}"
+  loop_control:
+    label: "{{ item.name }}"
+
+# Probe the first interface that actually reports an ipAddress; the task is
+# skipped for a VM when none of its interfaces has one.
+- name: post_migration_vm | Optional SSH connectivity check
+  ansible.builtin.wait_for:
+    host: >-
+      {{ __cluster_healthcheck_vmi_item.resources[0].status.interfaces
+         | selectattr('ipAddress', 'defined')
+         | map(attribute='ipAddress') | first }}
+    port: 22
+    timeout: "{{ cluster_healthcheck_ssh_timeout }}"
+    state: started
+  loop: "{{ __cluster_healthcheck_vmi.results }}"
+  loop_control:
+    loop_var: __cluster_healthcheck_vmi_item
+    label: >-
+      {{ __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.namespace }}/{{
+         __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.name }}
+  when:
+    - __cluster_healthcheck_vmi_item.__cluster_healthcheck_vm.check_ssh | default(false)
+    - __cluster_healthcheck_vmi_item.resources | length > 0
+    - >-
+      __cluster_healthcheck_vmi_item.resources[0].status.interfaces | default([])
+      | selectattr('ipAddress', 'defined') | list | length > 0
+  ignore_errors: true  # noqa: ignore-errors
+  register: __cluster_healthcheck_ssh_results
+
+# fail: any VMI not Running. warning: guest agent info missing, or a requested
+# SSH probe failed (those failures are tolerated above via ignore_errors).
+- name: post_migration_vm | Set post-migration VM result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+           'post_migration_vm': {
+             'status': ('fail' if (__cluster_healthcheck_vm_results | default([])
+                                   | selectattr('running', 'false') | list | length > 0)
+                        else ('warning' if (__cluster_healthcheck_vm_results | default([])
+                                            | selectattr('guest_agent', 'false') | list | length > 0 or
+                                            __cluster_healthcheck_ssh_results.results | default([])
+                                            | selectattr('failed', 'defined')
+                                            | selectattr('failed') | list | length > 0)
+                        else 'pass')),
+             'details': __cluster_healthcheck_vm_results | default([])
+           }
+         }) }}
+...
diff --git a/roles/cluster_healthcheck/tasks/report.yml b/roles/cluster_healthcheck/tasks/report.yml
new file mode 100644
index 0000000..3b04aa7
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/report.yml
@@ -0,0 +1,26 @@
+---
+# The summary names every category explicitly. A category with no entry in
+# __cluster_healthcheck_results is shown as "not run" instead of failing on an
+# undefined attribute, matching the HTML template, which only iterates over the
+# entries that exist. NOTE(review): confirm which categories can be skipped.
+- name: report | Display healthcheck summary
+  ansible.builtin.debug:
+    msg: >-
+      Healthcheck Summary -
+      Nodes: {{ __cluster_healthcheck_results.ocp_node_health.status | default('not run') }},
+      KubeVirt: {{ __cluster_healthcheck_results.kubevirt_health.status | default('not run') }},
+      MTV: {{ __cluster_healthcheck_results.mtv_health.status | default('not run') }},
+      Storage: {{ __cluster_healthcheck_results.storage_health.status | default('not run') }},
+      Network: {{ __cluster_healthcheck_results.network_health.status | default('not run') }},
+      Post-Migration VMs: {{ __cluster_healthcheck_results.post_migration_vm.status | default('not run') }}
+
+- name: report | Generate HTML healthcheck report
+  ansible.builtin.template:
+    src: cluster_healthcheck_report.html.j2
+    dest: "{{ cluster_healthcheck_report_path }}"
+    mode: "0644"
+
+- name: report | Report file location
+  ansible.builtin.debug:
+    msg: "Healthcheck report written to {{ cluster_healthcheck_report_path }}"
+...
diff --git a/roles/cluster_healthcheck/tasks/storage_health.yml b/roles/cluster_healthcheck/tasks/storage_health.yml
new file mode 100644
index 0000000..abc1882
--- /dev/null
+++ b/roles/cluster_healthcheck/tasks/storage_health.yml
@@ -0,0 +1,119 @@
+---
+- name: storage_health | Get StorageClass resources
+  kubernetes.core.k8s_info:
+    api_version: storage.k8s.io/v1
+    kind: StorageClass
+  register: __cluster_healthcheck_storage_classes
+
+# A StorageClass counts as default only when the is-default-class annotation
+# has the value 'true'. Testing for the key alone would also match classes that
+# carry the annotation set to 'false'.
+- name: storage_health | Check for default StorageClass
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_default_sc: >-
+      {%- set ns = namespace(found=[]) -%}
+      {%- for sc in __cluster_healthcheck_storage_classes.resources -%}
+      {%- set annotations = sc.metadata.annotations | default({}, true) -%}
+      {%- if annotations.get('storageclass.kubernetes.io/is-default-class', 'false') == 'true' -%}
+      {%- set ns.found = ns.found + [sc] -%}
+      {%- endif -%}
+      {%- endfor -%}
+      {{ ns.found }}
+
+- name: storage_health | Report StorageClasses
+  ansible.builtin.debug:
+    msg: >-
+      StorageClass: {{ item.metadata.name }},
+      Provisioner: {{ item.provisioner }},
+      Default: {{ 'Yes' if (item.metadata.annotations | default({})
+      ).get('storageclass.kubernetes.io/is-default-class', 'false') == 'true'
+      else 'No' }}
+  loop: "{{ __cluster_healthcheck_storage_classes.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+
+# Despite the task name, this queries CSIDriver objects, not pods.
+- name: storage_health | Check CSI driver pods
+  kubernetes.core.k8s_info:
+    api_version: storage.k8s.io/v1
+    kind: CSIDriver
+  register: __cluster_healthcheck_csi_drivers
+
+- name: storage_health | Report CSI drivers
+  ansible.builtin.debug:
+    msg: "CSI Driver: {{ item.metadata.name }}"
+  loop: "{{ __cluster_healthcheck_csi_drivers.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+
+- name: storage_health | Get PersistentVolumes
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: PersistentVolume
+  register: __cluster_healthcheck_pvs
+
+- name: storage_health | Evaluate PV capacity
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_pv_available: >-
+      {{ __cluster_healthcheck_pvs.resources
+         | selectattr('status.phase', 'equalto', 'Available')
+         | list | length }}
+    __cluster_healthcheck_pv_total: >-
+      {{ __cluster_healthcheck_pvs.resources | length }}
+
+- name: storage_health | Check for PVCs stuck in Pending
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: PersistentVolumeClaim
+    field_selectors:
+      - status.phase=Pending
+  register: __cluster_healthcheck_pending_pvcs
+
+- name: storage_health | Report pending PVCs
+  ansible.builtin.debug:
+    msg: >-
+      PVC {{ item.metadata.namespace }}/{{ item.metadata.name }} is stuck in Pending state
+  loop: "{{ __cluster_healthcheck_pending_pvcs.resources }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+
+# Overall status: fail when no StorageClass exists; warning when there is no
+# default StorageClass or any PVC is Pending; otherwise pass.
+- name: storage_health | Set storage health result
+  ansible.builtin.set_fact:
+    __cluster_healthcheck_results: >-
+      {{ __cluster_healthcheck_results | combine({
+        'storage_health': {
+          'status': ('fail' if __cluster_healthcheck_storage_classes.resources | length == 0
+                     else ('warning' if (__cluster_healthcheck_default_sc | length == 0 or
+                                         __cluster_healthcheck_pending_pvcs.resources | length > 0)
+                           else 'pass')),
+          'details': [
+            { 'check': 'StorageClasses Exist',
+              'status': ('pass' if __cluster_healthcheck_storage_classes.resources | length > 0 else 'fail'),
+              'message': (__cluster_healthcheck_storage_classes.resources | length | string
+                          + ' StorageClass(es) found') },
+            { 'check': 'Default StorageClass',
+              'status': ('pass' if __cluster_healthcheck_default_sc | length > 0 else 'warning'),
+              'message': ('Default StorageClass configured'
+                          if __cluster_healthcheck_default_sc | length > 0
+                          else 'No default StorageClass set') },
+            { 'check': 'CSI Drivers',
+              'status': ('pass' if __cluster_healthcheck_csi_drivers.resources | length > 0 else 'warning'),
+              'message': (__cluster_healthcheck_csi_drivers.resources | length | string
+                          + ' CSI driver(s) found') },
+            { 'check': 'PV Capacity',
+              'status': 'pass',
+              'message': (__cluster_healthcheck_pv_available | string + '/'
+                          + __cluster_healthcheck_pv_total | string + ' PV(s) Available') },
+            { 'check': 'Pending PVCs',
+              'status': ('warning' if __cluster_healthcheck_pending_pvcs.resources | length > 0 else 'pass'),
+              'message': ((__cluster_healthcheck_pending_pvcs.resources | length | string)
+                          + ' PVC(s) stuck in Pending')
+                         if __cluster_healthcheck_pending_pvcs.resources | length > 0
+                         else 'No PVCs stuck in Pending' }
+          ]
+        }
+      }) }}
+...
diff --git a/roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2 b/roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2 new file mode 100644 index 0000000..c3ed2ac --- /dev/null +++ b/roles/cluster_healthcheck/templates/cluster_healthcheck_report.html.j2 @@ -0,0 +1,81 @@ + + + + + Cluster Healthcheck Report + + + +

Cluster Healthcheck Report

+

Generated: {{ ansible_date_time.iso8601 | default(now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ')) }}{# fallback uses the built-in now(): default()'s argument is evaluated even when facts are present, so a pipe lookup here would run `date` on every render #}

+ +

Summary

+ + + + + +{% for category, result in __cluster_healthcheck_results.items() %} + + + + +{% endfor %} +
CategoryStatus
{{ category | replace('_', ' ') | title }}{{ result.status | upper }}
+ +{% for category, result in __cluster_healthcheck_results.items() %} +{% if result.details | length > 0 %} +

{{ category | replace('_', ' ') | title }}

+ + + + + + +{% for detail in result.details %} +{% if detail.check is defined %} + + + + + +{% elif detail.name is defined %} + + + + + +{% endif %} +{% endfor %} +
CheckStatusDetails
{{ detail.check }}{{ detail.status | upper }}{{ detail.message }}
{{ detail.name }}{{ 'PASS' if detail.running else 'FAIL' }}Running: {{ detail.running }}, Guest Agent: {{ detail.guest_agent }}, Network: {{ detail.interfaces }}
+{% endif %} +{% endfor %} + +

Recommendations

+ + +
diff --git a/roles/cluster_healthcheck/tests/inventory b/roles/cluster_healthcheck/tests/inventory
new file mode 100644
index 0000000..2302eda
--- /dev/null
+++ b/roles/cluster_healthcheck/tests/inventory
@@ -0,0 +1 @@
+localhost ansible_connection=local
diff --git a/roles/cluster_healthcheck/tests/test.yml b/roles/cluster_healthcheck/tests/test.yml
new file mode 100644
index 0000000..0db8ed0
--- /dev/null
+++ b/roles/cluster_healthcheck/tests/test.yml
@@ -0,0 +1,8 @@
+---
+- name: Test cluster_healthcheck role
+  hosts: localhost
+  connection: local
+  gather_facts: false
+  roles:
+    - role: cluster_healthcheck
+...
diff --git a/roles/cluster_healthcheck/vars/main.yml b/roles/cluster_healthcheck/vars/main.yml
new file mode 100644
index 0000000..8955cfb
--- /dev/null
+++ b/roles/cluster_healthcheck/vars/main.yml
@@ -0,0 +1,4 @@
+---
+# Initial empty result map; the check task files merge their category entry into it with combine().
+__cluster_healthcheck_results: {}
+...