Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions playbooks/cluster_healthcheck.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
# Entry-point play for the cluster_healthcheck role. It runs on the control
# node only (localhost, local connection) and skips fact gathering.
- name: Run cluster healthchecks
  hosts: localhost
  connection: local
  gather_facts: false
  tasks:
    # import_role is the static variant, so the task is named "Import"
    # rather than "Include" to match what the module actually does.
    - name: Import cluster_healthcheck role
      ansible.builtin.import_role:
        name: infra.openshift_virtualization_migration.cluster_healthcheck
...
69 changes: 69 additions & 0 deletions roles/cluster_healthcheck/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# cluster_healthcheck

```
Role belongs to infra/openshift_virtualization_migration
Namespace - infra
Collection - openshift_virtualization_migration
```

Description: Cluster health validation for OpenShift Virtualization migration environments.

## Requirements

- OpenShift cluster with `kubeconfig` configured
- `kubernetes.core` collection installed
- OpenShift Virtualization (CNV) operator installed
- Migration Toolkit for Virtualization (MTV) operator installed

## Role Variables

### Defaults

| Variable | Type | Default | Description |
|----------|------|---------|-------------|
| `cluster_healthcheck_checks` | list | See defaults/main.yml | List of health checks to run |
| `cluster_healthcheck_post_migration_vms` | list | `[]` | VMs to check post-migration |
| `cluster_healthcheck_generate_report` | bool | `true` | Generate HTML report |
| `cluster_healthcheck_report_path` | str | `/tmp/cluster_healthcheck_report.html` | Report output path |
| `cluster_healthcheck_mtv_namespace` | str | `openshift-mtv` | MTV operator namespace |
| `cluster_healthcheck_kubevirt_namespace` | str | `openshift-cnv` | KubeVirt operator namespace |
| `cluster_healthcheck_ssh_timeout` | int | `10` | SSH check timeout in seconds |
| `cluster_healthcheck_debug` | bool | `false` | Enable verbose debug output |

### Post-Migration VM Format

```yaml
cluster_healthcheck_post_migration_vms:
  - name: my-vm
    namespace: my-namespace
    check_ssh: true  # optional, default false
```

## Health Checks

| Check | Description |
|-------|-------------|
| `ocp_node_health` | Node Ready status, resource pressure, kubevirt.io/schedulable label |
| `kubevirt_health` | HyperConverged CR, virt-* pods, CDI operator |
| `mtv_health` | ForkliftController, MTV pods, Providers, Plans |
| `storage_health` | StorageClasses, CSI drivers, PV capacity, pending PVCs |
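| `network_health` | Multus, NADs, OVN/SDN health, migration network |

Post-migration VM checks (`post_migration_vm`) are not selected through `cluster_healthcheck_checks`; they run automatically whenever `cluster_healthcheck_post_migration_vms` is non-empty.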

## Example Playbook

```yaml
- name: Run cluster healthchecks
  hosts: localhost
  connection: local
  gather_facts: false
  roles:
    - role: infra.openshift_virtualization_migration.cluster_healthcheck
      vars:
        cluster_healthcheck_post_migration_vms:
          - name: rhel9-vm
            namespace: migration-target
```

## License

GPL-3.0-only
23 changes: 23 additions & 0 deletions roles/cluster_healthcheck/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
---
# defaults file for cluster_healthcheck
# Names of the health checks to run. The role's main task file includes a
# check's task file only when its name is present in this list.
cluster_healthcheck_checks:
  - 'ocp_node_health'
  - 'kubevirt_health'
  - 'mtv_health'
  - 'storage_health'
  - 'network_health'

# VMs to check post-migration; the post-migration tasks are included only
# when this list is non-empty.
cluster_healthcheck_post_migration_vms: []

# Whether to include the report tasks at the end of the run.
cluster_healthcheck_generate_report: true

# Report output path.
cluster_healthcheck_report_path: '/tmp/cluster_healthcheck_report.html'

# MTV operator namespace.
cluster_healthcheck_mtv_namespace: 'openshift-mtv'

# KubeVirt operator namespace.
cluster_healthcheck_kubevirt_namespace: 'openshift-cnv'

# SSH check timeout in seconds.
cluster_healthcheck_ssh_timeout: 10

# Enable verbose debug output.
cluster_healthcheck_debug: false
...
10 changes: 10 additions & 0 deletions roles/cluster_healthcheck/meta/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
# Galaxy metadata for the cluster_healthcheck role.
galaxy_info:
  author: ''
  description: >-
    Cluster health validation for OpenShift Virtualization migration
    environments.
  company: 'Red Hat'
  license: 'GPL-3.0-only'
  # Quoted so the value is unambiguously a string, not a number.
  min_ansible_version: '2.15.0'
  galaxy_tags: []

# This role declares no role dependencies.
dependencies: []
...
177 changes: 177 additions & 0 deletions roles/cluster_healthcheck/tasks/kubevirt_health.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
---
# Query the HyperConverged custom resources in the KubeVirt namespace.
- name: kubevirt_health | Get HyperConverged CR status
  kubernetes.core.k8s_info:
    kind: HyperConverged
    api_version: hco.kubevirt.io/v1beta1
    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
  register: __cluster_healthcheck_hco

# Each fact is false when no CR was returned. Otherwise it is true only when
# the first returned CR has the named condition with status 'True'; a missing
# condition list or missing condition counts as 'False'.
- name: kubevirt_health | Evaluate HyperConverged conditions
  ansible.builtin.set_fact:
    __cluster_healthcheck_hco_available: >-
      {{ __cluster_healthcheck_hco.resources | length > 0
      and (__cluster_healthcheck_hco.resources[0].status.conditions
      | default([])
      | selectattr('type', 'eq', 'Available')
      | map(attribute='status')
      | first
      | default('False')) == 'True' }}
    __cluster_healthcheck_hco_degraded: >-
      {{ __cluster_healthcheck_hco.resources | length > 0
      and (__cluster_healthcheck_hco.resources[0].status.conditions
      | default([])
      | selectattr('type', 'eq', 'Degraded')
      | map(attribute='status')
      | first
      | default('False')) == 'True' }}

# Print both evaluated condition flags on one line.
- name: kubevirt_health | Report HyperConverged status
  ansible.builtin.debug:
    msg: >-
      HyperConverged CR - Available: {{ __cluster_healthcheck_hco_available }},
      Degraded: {{ __cluster_healthcheck_hco_degraded }}

# The four queries below list the pods of each KubeVirt component in the
# KubeVirt namespace, selected by their kubevirt.io=<component> label.
- name: kubevirt_health | Check virt-operator pods
  kubernetes.core.k8s_info:
    kind: Pod
    api_version: v1
    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
    label_selectors:
      - kubevirt.io=virt-operator
  register: __cluster_healthcheck_virt_operator_pods

- name: kubevirt_health | Check virt-controller pods
  kubernetes.core.k8s_info:
    kind: Pod
    api_version: v1
    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
    label_selectors:
      - kubevirt.io=virt-controller
  register: __cluster_healthcheck_virt_controller_pods

- name: kubevirt_health | Check virt-handler pods
  kubernetes.core.k8s_info:
    kind: Pod
    api_version: v1
    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
    label_selectors:
      - kubevirt.io=virt-handler
  register: __cluster_healthcheck_virt_handler_pods

- name: kubevirt_health | Check virt-api pods
  kubernetes.core.k8s_info:
    kind: Pod
    api_version: v1
    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
    label_selectors:
      - kubevirt.io=virt-api
  register: __cluster_healthcheck_virt_api_pods

# Summarise each component: "running" counts the pods whose status.phase is
# Running, "total" counts every pod the query returned.
- name: kubevirt_health | Evaluate KubeVirt pod health
  ansible.builtin.set_fact:
    __cluster_healthcheck_kubevirt_pods:
      virt_operator:
        running: >-
          {{ __cluster_healthcheck_virt_operator_pods.resources
          | selectattr('status.phase', 'eq', 'Running') | list | length }}
        total: >-
          {{ __cluster_healthcheck_virt_operator_pods.resources | length }}
      virt_controller:
        running: >-
          {{ __cluster_healthcheck_virt_controller_pods.resources
          | selectattr('status.phase', 'eq', 'Running') | list | length }}
        total: >-
          {{ __cluster_healthcheck_virt_controller_pods.resources | length }}
      virt_handler:
        running: >-
          {{ __cluster_healthcheck_virt_handler_pods.resources
          | selectattr('status.phase', 'eq', 'Running') | list | length }}
        total: >-
          {{ __cluster_healthcheck_virt_handler_pods.resources | length }}
      virt_api:
        running: >-
          {{ __cluster_healthcheck_virt_api_pods.resources
          | selectattr('status.phase', 'eq', 'Running') | list | length }}
        total: >-
          {{ __cluster_healthcheck_virt_api_pods.resources | length }}

# List the CDI pods in the KubeVirt namespace, selected by their
# app.kubernetes.io/component label.
- name: kubevirt_health | Check CDI operator pods
  kubernetes.core.k8s_info:
    kind: Pod
    api_version: v1
    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
    label_selectors:
      - app.kubernetes.io/component=cdi-operator
  register: __cluster_healthcheck_cdi_operator_pods

- name: kubevirt_health | Check CDI deployment pods
  kubernetes.core.k8s_info:
    kind: Pod
    api_version: v1
    namespace: "{{ cluster_healthcheck_kubevirt_namespace }}"
    label_selectors:
      - app.kubernetes.io/component=cdi-deployment
  register: __cluster_healthcheck_cdi_deployment_pods

# Summarise each CDI component: "running" counts the pods whose status.phase
# is Running, "total" counts every pod the query returned.
- name: kubevirt_health | Evaluate CDI health
  ansible.builtin.set_fact:
    __cluster_healthcheck_cdi_pods:
      cdi_operator:
        running: >-
          {{ __cluster_healthcheck_cdi_operator_pods.resources
          | selectattr('status.phase', 'eq', 'Running') | list | length }}
        total: >-
          {{ __cluster_healthcheck_cdi_operator_pods.resources | length }}
      cdi_deployment:
        running: >-
          {{ __cluster_healthcheck_cdi_deployment_pods.resources
          | selectattr('status.phase', 'eq', 'Running') | list | length }}
        total: >-
          {{ __cluster_healthcheck_cdi_deployment_pods.resources | length }}

# Store the KubeVirt section of the healthcheck results.
#
# The per-check findings are built once in a task-scoped variable and the
# overall status is derived from them: it is 'fail' when any listed check is
# 'fail'. Previously the overall status only looked at the HyperConverged
# conditions and at whether virt-operator/virt-controller had any pods at
# all, so the summary could read 'pass' while individual checks (for example
# virt-handler or CDI with zero Running pods) read 'fail'.
- name: kubevirt_health | Set kubevirt health result
  vars:
    __cluster_healthcheck_kubevirt_details: >-
      {{ [
      { 'check': 'HyperConverged Available',
      'status': ('pass' if __cluster_healthcheck_hco_available | bool else 'fail'),
      'message': ('HyperConverged CR is Available'
      if __cluster_healthcheck_hco_available | bool
      else 'HyperConverged CR is NOT Available') },
      { 'check': 'HyperConverged Not Degraded',
      'status': ('fail' if __cluster_healthcheck_hco_degraded | bool else 'pass'),
      'message': ('HyperConverged CR is Degraded'
      if __cluster_healthcheck_hco_degraded | bool
      else 'HyperConverged CR is not Degraded') },
      { 'check': 'virt-operator',
      'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_operator.running | int > 0
      else 'fail'),
      'message': (__cluster_healthcheck_kubevirt_pods.virt_operator.running | string +
      '/' + __cluster_healthcheck_kubevirt_pods.virt_operator.total | string +
      ' pods Running') },
      { 'check': 'virt-controller',
      'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_controller.running | int > 0
      else 'fail'),
      'message': (__cluster_healthcheck_kubevirt_pods.virt_controller.running | string +
      '/' + __cluster_healthcheck_kubevirt_pods.virt_controller.total | string +
      ' pods Running') },
      { 'check': 'virt-handler',
      'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_handler.running | int > 0
      else 'fail'),
      'message': (__cluster_healthcheck_kubevirt_pods.virt_handler.running | string +
      '/' + __cluster_healthcheck_kubevirt_pods.virt_handler.total | string +
      ' pods Running') },
      { 'check': 'virt-api',
      'status': ('pass' if __cluster_healthcheck_kubevirt_pods.virt_api.running | int > 0
      else 'fail'),
      'message': (__cluster_healthcheck_kubevirt_pods.virt_api.running | string +
      '/' + __cluster_healthcheck_kubevirt_pods.virt_api.total | string +
      ' pods Running') },
      { 'check': 'CDI Operator',
      'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_operator.running | int > 0
      else 'fail'),
      'message': (__cluster_healthcheck_cdi_pods.cdi_operator.running | string +
      '/' + __cluster_healthcheck_cdi_pods.cdi_operator.total | string +
      ' pods Running') },
      { 'check': 'CDI Deployment',
      'status': ('pass' if __cluster_healthcheck_cdi_pods.cdi_deployment.running | int > 0
      else 'fail'),
      'message': (__cluster_healthcheck_cdi_pods.cdi_deployment.running | string +
      '/' + __cluster_healthcheck_cdi_pods.cdi_deployment.total | string +
      ' pods Running') }
      ] }}
  ansible.builtin.set_fact:
    __cluster_healthcheck_results: >-
      {{ __cluster_healthcheck_results | combine({
      'kubevirt_health': {
      'status': ('fail' if (__cluster_healthcheck_kubevirt_details
      | selectattr('status', 'equalto', 'fail') | list | length > 0)
      else 'pass'),
      'details': __cluster_healthcheck_kubevirt_details
      }
      }) }}
...
40 changes: 40 additions & 0 deletions roles/cluster_healthcheck/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
---
# tasks file for cluster_healthcheck
# Seed the results fact with one entry per check, each marked "skipped" with
# no details, so the fact has every key before any check task file runs.
- name: Initialize healthcheck results
  ansible.builtin.set_fact:
    __cluster_healthcheck_results:
      ocp_node_health:
        status: skipped
        details: []
      kubevirt_health:
        status: skipped
        details: []
      mtv_health:
        status: skipped
        details: []
      storage_health:
        status: skipped
        details: []
      network_health:
        status: skipped
        details: []
      post_migration_vm:
        status: skipped
        details: []

# Each check's task file is included only when its name is listed in
# cluster_healthcheck_checks.
- name: Include ocp_node_health tasks
  ansible.builtin.include_tasks: ocp_node_health.yml
  when: "'ocp_node_health' in cluster_healthcheck_checks"

- name: Include kubevirt_health tasks
  ansible.builtin.include_tasks: kubevirt_health.yml
  when: "'kubevirt_health' in cluster_healthcheck_checks"

- name: Include mtv_health tasks
  ansible.builtin.include_tasks: mtv_health.yml
  when: "'mtv_health' in cluster_healthcheck_checks"

- name: Include storage_health tasks
  ansible.builtin.include_tasks: storage_health.yml
  when: "'storage_health' in cluster_healthcheck_checks"

- name: Include network_health tasks
  ansible.builtin.include_tasks: network_health.yml
  when: "'network_health' in cluster_healthcheck_checks"

# Post-migration checks run only when at least one VM is listed. The
# default([], true) guard treats a null/empty value as an empty list instead
# of failing on "| length".
- name: Include post_migration_vm tasks
  ansible.builtin.include_tasks: post_migration_vm.yml
  when: "cluster_healthcheck_post_migration_vms | default([], true) | length > 0"

# "| bool" makes string values such as "false" (for example from -e on the
# command line) evaluate as false; a bare non-empty string would be truthy.
- name: Include report tasks
  ansible.builtin.include_tasks: report.yml
  when: cluster_healthcheck_generate_report | bool
...
Loading
Loading