From 328d6186fafd3df75dee10ca6c44545a9483a242 Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Tue, 10 Mar 2026 11:35:40 +0100 Subject: [PATCH 01/10] NO-JIRA: fix: replace timeout command with AWS CLI built-in waiter The 'timeout' command is not available on macOS by default, causing 'make deploy' to fail during instance creation. Remove the dependency by relying on the AWS CLI built-in waiter which already polls every 15s for up to 40 attempts (~10 minutes) by default. pre-commit.check-secrets: ENABLED --- deploy/aws-hypervisor/scripts/create.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/deploy/aws-hypervisor/scripts/create.sh b/deploy/aws-hypervisor/scripts/create.sh index 97bdd5d..854a7b2 100755 --- a/deploy/aws-hypervisor/scripts/create.sh +++ b/deploy/aws-hypervisor/scripts/create.sh @@ -386,7 +386,14 @@ echo "${HOST_PUBLIC_IP}" > "${SCRIPT_DIR}/../${SHARED_DIR}/public_address" echo "${HOST_PRIVATE_IP}" > "${SCRIPT_DIR}/../${SHARED_DIR}/private_address" echo "Waiting up to 10 mins for RHEL host to be up." -timeout 10m aws ec2 wait instance-status-ok --instance-id "${INSTANCE_ID}" --no-cli-pager +# The 'aws ec2 wait' command polls every 15s for up to 40 attempts (~10 min) by default. +# This avoids using the 'timeout' command which is not available on macOS. +if ! aws ec2 wait instance-status-ok \ + --instance-id "${INSTANCE_ID}" \ + --no-cli-pager; then + echo "ERROR: Instance ${INSTANCE_ID} failed to reach OK status within the timeout period" + exit 1 +fi # Add the host key to known_hosts to avoid prompts while maintaining security echo "Adding host key for $HOST_PUBLIC_IP to known_hosts..." From a5dd3768d91b277792962a35e4a02a50b0628dd2 Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Tue, 10 Mar 2026 11:35:48 +0100 Subject: [PATCH 02/10] NO-JIRA: fix: use portable lowercase conversion in redeploy script Replace bash 4+ syntax (${var,,}) with portable 'tr' alternative. 
macOS ships with bash 3.2 which does not support this syntax, causing 'make redeploy-cluster' to fail with 'bad substitution'. pre-commit.check-secrets: ENABLED --- deploy/openshift-clusters/scripts/redeploy-cluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/openshift-clusters/scripts/redeploy-cluster.sh b/deploy/openshift-clusters/scripts/redeploy-cluster.sh index 8fbb2c0..1238b7b 100755 --- a/deploy/openshift-clusters/scripts/redeploy-cluster.sh +++ b/deploy/openshift-clusters/scripts/redeploy-cluster.sh @@ -217,7 +217,7 @@ echo "==================================" # Convert method to lowercase for ansible (state file stores uppercase) ansible-playbook redeploy.yml -i inventory.ini \ --extra-vars "topology=${topology}" \ - --extra-vars "method=${current_installation_method,,}" \ + --extra-vars "method=$(echo "${current_installation_method}" | tr '[:upper:]' '[:lower:]')" \ --extra-vars "vm_cleanup_needed=${vm_cleanup_needed}" \ --extra-vars "clean_needed=${clean_needed:-false}" \ --extra-vars "cleanup_reason=${cleanup_reason}" \ From b458662dbdb583e9c69ee7e4d11a11a82a1d7160 Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Tue, 10 Mar 2026 11:35:56 +0100 Subject: [PATCH 03/10] NO-JIRA: fix: enable CRB repository on AWS RHUI instances On AWS RHUI-managed instances, 'subscription-manager repos --enable codeready-builder-*' fails silently because repos are managed by RHUI configuration, not subscription-manager. This causes libvirt-devel to be unavailable, breaking the dev-scripts requirements installation. Use '/usr/bin/crb enable' which handles both RHUI and non-RHUI environments correctly. 
pre-commit.check-secrets: ENABLED --- deploy/aws-hypervisor/scripts/configure.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/deploy/aws-hypervisor/scripts/configure.sh b/deploy/aws-hypervisor/scripts/configure.sh index 8a24cbe..9548863 100644 --- a/deploy/aws-hypervisor/scripts/configure.sh +++ b/deploy/aws-hypervisor/scripts/configure.sh @@ -48,3 +48,15 @@ sudo subscription-manager repos \ --enable "rhel-9-for-$(uname -m)-appstream-rpms" \ --enable "rhel-9-for-$(uname -m)-baseos-rpms" \ --enable "rhocp-$(get_ocp_version)-for-rhel-9-$(uname -m)-rpms" + +# Enable CodeReady Builder (CRB) repo for -devel packages (e.g. libvirt-devel). +# On RHUI instances (like AWS), subscription-manager repos --enable doesn't work +# for CRB because repos are managed by RHUI configuration. The 'crb' command +# handles both RHUI and non-RHUI environments correctly. +if command -v crb &>/dev/null; then + echo "Enabling CRB repository..." + sudo /usr/bin/crb enable +else + echo "WARNING: 'crb' command not found, attempting subscription-manager fallback" + sudo subscription-manager repos --enable "codeready-builder-for-rhel-9-$(uname -m)-rpms" || true +fi From 30a1f816347d40450e16b28a3f81ee2486ea373a Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Tue, 10 Mar 2026 11:36:04 +0100 Subject: [PATCH 04/10] NO-JIRA: fix: make OCP project handler resilient to missing kubeconfig The 'Set OCP project' handler fires even when the cluster deployment fails, producing a confusing secondary error about missing 'oc' or kubeconfig that masks the actual failure. Add failed_when: false so the handler does not error when kubeconfig does not exist. 
pre-commit.check-secrets: ENABLED --- .../roles/dev-scripts/install-dev/handlers/main.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml index c2a73bb..cf04dd4 100644 --- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml +++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml @@ -1,3 +1,6 @@ --- - name: Set OCP project - command: oc --kubeconfig="{{kubeconfig_path}}" project openshift-machine-api + command: oc --kubeconfig="{{ kubeconfig_path }}" project openshift-machine-api + when: kubeconfig_path is defined + failed_when: false + changed_when: false From 3eeb854ef303d10b2e5721c54465f6ec9b407e07 Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Tue, 10 Mar 2026 11:36:12 +0100 Subject: [PATCH 05/10] NO-JIRA: fix: add pre-flight validation for CI registry credentials When the config uses a CI registry image (registry.ci.openshift.org) but the pull secret lacks CI credentials, the deployment runs for ~20 minutes before failing with an unclear 'unauthorized' error. Add an early check that fails immediately with a clear message explaining how to obtain CI registry credentials. 
pre-commit.check-secrets: ENABLED --- .../dev-scripts/install-dev/tasks/config.yml | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml index f281efe..c47a09a 100644 --- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml +++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml @@ -1,4 +1,28 @@ --- +- name: Read config file to check for CI registry usage + set_fact: + config_content: "{{ lookup('file', config_file[method]) }}" + delegate_to: localhost + become: false + +- name: Read pull secret to check for CI registry auth + set_fact: + pull_secret_content: "{{ lookup('file', 'pull-secret.json') | from_json }}" + delegate_to: localhost + become: false + +- name: Warn if using CI registry images without CI auth in pull secret + fail: + msg: >- + Your config uses a CI registry image (registry.ci.openshift.org) but your + pull secret does not include credentials for registry.ci.openshift.org. + Either add CI registry credentials to your pull secret or switch to a + public release image (e.g. quay.io/openshift-release-dev/ocp-release). + See the setup guide for instructions on obtaining CI registry credentials. + when: + - "'registry.ci.openshift.org' in config_content" + - "'registry.ci.openshift.org' not in (pull_secret_content.auths | default({}) | list)" + - name: Copy pull secrets copy: dest: "{{dev_scripts_path}}/pull_secret.json" From fe13899bc80f7c786c53e7825fe7461371cb667b Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Tue, 10 Mar 2026 11:43:23 +0100 Subject: [PATCH 06/10] Revert "NO-JIRA: fix: replace timeout command with AWS CLI built-in waiter" This reverts commit 328d6186fafd3df75dee10ca6c44545a9483a242. 
--- deploy/aws-hypervisor/scripts/create.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/deploy/aws-hypervisor/scripts/create.sh b/deploy/aws-hypervisor/scripts/create.sh index 854a7b2..97bdd5d 100755 --- a/deploy/aws-hypervisor/scripts/create.sh +++ b/deploy/aws-hypervisor/scripts/create.sh @@ -386,14 +386,7 @@ echo "${HOST_PUBLIC_IP}" > "${SCRIPT_DIR}/../${SHARED_DIR}/public_address" echo "${HOST_PRIVATE_IP}" > "${SCRIPT_DIR}/../${SHARED_DIR}/private_address" echo "Waiting up to 10 mins for RHEL host to be up." -# The 'aws ec2 wait' command polls every 15s for up to 40 attempts (~10 min) by default. -# This avoids using the 'timeout' command which is not available on macOS. -if ! aws ec2 wait instance-status-ok \ - --instance-id "${INSTANCE_ID}" \ - --no-cli-pager; then - echo "ERROR: Instance ${INSTANCE_ID} failed to reach OK status within the timeout period" - exit 1 -fi +timeout 10m aws ec2 wait instance-status-ok --instance-id "${INSTANCE_ID}" --no-cli-pager # Add the host key to known_hosts to avoid prompts while maintaining security echo "Adding host key for $HOST_PUBLIC_IP to known_hosts..." From 063ea375dec8eaea3f37cdb7b37fc9d9d5860db3 Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Mon, 16 Mar 2026 14:56:46 +0100 Subject: [PATCH 07/10] NO-JIRA: fix: add portability comment for tr lowercase conversion Adds a comment explaining why tr is used instead of ${var,,} for bash 3.2 (macOS) compatibility in the redeploy script. 
pre-commit.check-secrets: ENABLED --- deploy/openshift-clusters/scripts/redeploy-cluster.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deploy/openshift-clusters/scripts/redeploy-cluster.sh b/deploy/openshift-clusters/scripts/redeploy-cluster.sh index 1238b7b..02a8db1 100755 --- a/deploy/openshift-clusters/scripts/redeploy-cluster.sh +++ b/deploy/openshift-clusters/scripts/redeploy-cluster.sh @@ -215,6 +215,7 @@ echo "==================================" # Call ansible in non-interactive mode with all parameters pre-determined # Convert method to lowercase for ansible (state file stores uppercase) +# Uses tr instead of ${var,,} for bash 3.2 (macOS) compatibility. ansible-playbook redeploy.yml -i inventory.ini \ --extra-vars "topology=${topology}" \ --extra-vars "method=$(echo "${current_installation_method}" | tr '[:upper:]' '[:lower:]')" \ @@ -234,4 +235,4 @@ echo "1. Source the proxy environment from anywhere:" echo " source ${DEPLOY_DIR}/openshift-clusters/proxy.env" echo " (or from openshift-clusters directory: source proxy.env)" echo "2. Verify cluster access: oc get nodes" -echo "3. Access the cluster console if needed" \ No newline at end of file +echo "3. Access the cluster console if needed" From 5dc337a9f5c51ece996cf7618529e049950013bc Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Mon, 16 Mar 2026 14:57:00 +0100 Subject: [PATCH 08/10] NO-JIRA: fix: fail explicitly on CRB repo enablement failure Refactors CRB enablement into a function and removes silent || true fallback. Uses consistent 'crb' command instead of mixing with '/usr/bin/crb'. 
pre-commit.check-secrets: ENABLED --- deploy/aws-hypervisor/scripts/configure.sh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/deploy/aws-hypervisor/scripts/configure.sh b/deploy/aws-hypervisor/scripts/configure.sh index 9548863..9c21602 100644 --- a/deploy/aws-hypervisor/scripts/configure.sh +++ b/deploy/aws-hypervisor/scripts/configure.sh @@ -53,10 +53,16 @@ sudo subscription-manager repos \ # On RHUI instances (like AWS), subscription-manager repos --enable doesn't work # for CRB because repos are managed by RHUI configuration. The 'crb' command # handles both RHUI and non-RHUI environments correctly. -if command -v crb &>/dev/null; then - echo "Enabling CRB repository..." - sudo /usr/bin/crb enable -else - echo "WARNING: 'crb' command not found, attempting subscription-manager fallback" - sudo subscription-manager repos --enable "codeready-builder-for-rhel-9-$(uname -m)-rpms" || true +enable_crb_repo() { + if command -v crb &>/dev/null; then + sudo crb enable + else + sudo subscription-manager repos --enable "codeready-builder-for-rhel-9-$(uname -m)-rpms" + fi +} + +echo "Enabling CRB repository..." +if ! enable_crb_repo; then + echo "ERROR: Failed to enable CRB repository. libvirt-devel will be unavailable." >&2 + exit 1 fi From 05440bbb93352cb6d702dd8187d3d3bfbb4bdebe Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Mon, 16 Mar 2026 14:57:19 +0100 Subject: [PATCH 09/10] NO-JIRA: fix: check kubeconfig existence before setting OCP project Replaces failed_when: false with a stat check and debug warning. Uses listen directive so all three handlers fire on the same notification.
pre-commit.check-secrets: ENABLED --- .../dev-scripts/install-dev/handlers/main.yml | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml index cf04dd4..4f3761e 100644 --- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml +++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/handlers/main.yml @@ -1,6 +1,25 @@ --- -- name: Set OCP project - command: oc --kubeconfig="{{ kubeconfig_path }}" project openshift-machine-api +- name: Check kubeconfig exists for OCP project + stat: + path: "{{ kubeconfig_path }}" + register: kubeconfig_stat when: kubeconfig_path is defined - failed_when: false + listen: Set OCP project + +- name: Run oc project + command: oc --kubeconfig="{{ kubeconfig_path }}" project openshift-machine-api + when: + - kubeconfig_path is defined + - kubeconfig_stat.stat.exists | default(false) changed_when: false + listen: Set OCP project + +- name: Warn about missing kubeconfig + debug: + msg: >- + Could not set OCP project: kubeconfig not found at '{{ kubeconfig_path }}'. + The cluster may not have been deployed yet. + when: + - kubeconfig_path is defined + - not (kubeconfig_stat.stat.exists | default(false)) + listen: Set OCP project From 758b47337c90e9915c830d9c0fedb314d3b14f29 Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Mon, 16 Mar 2026 14:57:37 +0100 Subject: [PATCH 10/10] NO-JIRA: fix: wrap pull-secret parsing in block/rescue Catches malformed JSON in pull-secret.json and provides a clear error message instead of a raw Jinja2 traceback. 
pre-commit.check-secrets: ENABLED --- .../dev-scripts/install-dev/tasks/config.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml b/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml index c47a09a..c3ffad9 100644 --- a/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml +++ b/deploy/openshift-clusters/roles/dev-scripts/install-dev/tasks/config.yml @@ -5,11 +5,19 @@ delegate_to: localhost become: false -- name: Read pull secret to check for CI registry auth - set_fact: - pull_secret_content: "{{ lookup('file', 'pull-secret.json') | from_json }}" - delegate_to: localhost - become: false +- name: Parse pull secret JSON + block: + - name: Read pull secret to check for CI registry auth + set_fact: + pull_secret_content: "{{ lookup('file', 'pull-secret.json') | from_json }}" + delegate_to: localhost + become: false + rescue: + - name: Fail with pull secret parse error + fail: + msg: >- + Failed to read or parse pull-secret.json. Ensure the file exists and contains valid JSON. + You can validate it with: python3 -m json.tool pull-secret.json - name: Warn if using CI registry images without CI auth in pull secret fail: