diff --git a/agent/06_agent_create_cluster.sh b/agent/06_agent_create_cluster.sh
index 01385a8ca..c28c67d6d 100755
--- a/agent/06_agent_create_cluster.sh
+++ b/agent/06_agent_create_cluster.sh
@@ -436,6 +436,55 @@ function run_agent_test_cases() {
 
         echo "Finished fixing DNS through agent-tui"
     fi
+
+    if [[ $AGENT_TEST_CASES =~ "copy_network" ]]; then
+        if [[ ${NUM_MASTERS} -lt 2 ]]; then
+            echo "ERROR: copy_network test case requires at least 2 master nodes (NUM_MASTERS=${NUM_MASTERS}). Not supported with SNO topology."
+            exit 1
+        fi
+        if [[ "${IP_STACK}" != "v4" ]]; then
+            echo "ERROR: copy_network test case only supports IPv4 (IP_STACK=${IP_STACK})."
+            exit 1
+        fi
+        echo "Running test scenario: inject static network keyfile on master_0 and master_1"
+
+        # Inject a NetworkManager keyfile on master_0 and master_1 via SSH into the live
+        # environment, simulating what a user would create via the agent-tui. The keyfile
+        # must be in place before agent-set-host-copy-network-arg.service runs.
+        # The script waits for SSH to become available on each node.
+        # master_1 is intentionally included as a non-rendezvous node, which is the
+        # scenario the bug affected.
+        # master-0 is the rendezvous/bootstrap node whose IP is recorded in the etcd
+        # cluster during installation - preserve its DHCP IP as the static IP so etcd
+        # membership is not broken after reboot.
+        # master-1 uses a distinct static IP outside the DHCP range to prove the
+        # static config was copied and persists after installation.
+        subnet_prefix=${EXTERNAL_SUBNET_V4#*/}
+        master0_hostname=$(printf "${MASTER_HOSTNAME_FORMAT}" 0)
+        master0_dhcp_ip=$(sudo virsh net-dumpxml "${BAREMETAL_NETWORK_NAME}" | xmllint --xpath \
+            "string(//dns[*]/host/hostname[. = '${master0_hostname}']/../@ip)" -)
+        # Find an unused static IP for master-1 by scanning virsh for unassigned offsets
+        master1_static_ip=""
+        for offset in $(seq 90 254); do
+            candidate=$(nth_ip "${EXTERNAL_SUBNET_V4}" "${offset}")
+            if ! sudo virsh net-dumpxml "${BAREMETAL_NETWORK_NAME}" | xmllint --xpath "//dns[*]/host[@ip = '${candidate}']" - &>/dev/null; then
+                master1_static_ip=${candidate}
+                break
+            fi
+        done
+        if [ -z "${master1_static_ip}" ]; then
+            echo "ERROR: could not find an unused IP in ${EXTERNAL_SUBNET_V4} for master-1 static config"
+            exit 1
+        fi
+        echo "Using static IP ${master1_static_ip} for master-1"
+        declare -A COPY_NETWORK_STATIC_IPS=([0]="${master0_dhcp_ip}/${subnet_prefix}" [1]="${master1_static_ip}/${subnet_prefix}")
+        for node_index in 0 1; do
+            echo "Injecting keyfile on master_${node_index}"
+            ./agent/e2e/agent-tui/test-copy-network.sh "${node_index}" "${COPY_NETWORK_STATIC_IPS[$node_index]}"
+        done
+
+        echo "Finished injecting network keyfiles"
+    fi
 }
 
 # Setup the environment to allow iPXE booting, by reusing libvirt native features
diff --git a/agent/08_agent_post_install_validation.sh b/agent/08_agent_post_install_validation.sh
index a40e5245b..fbf0626fc 100755
--- a/agent/08_agent_post_install_validation.sh
+++ b/agent/08_agent_post_install_validation.sh
@@ -43,3 +43,8 @@ if (( $NUM_MASTERS != $installed_control_plane_nodes )); then
 fi
 
 oc get clusterversion
+
+if [[ "${AGENT_TEST_CASES:-}" =~ "copy_network" ]]; then
+  echo "Validating static network config persistence after installation"
+  ./agent/e2e/agent-tui/validate-copy-network.sh
+fi
diff --git a/agent/e2e/agent-tui/test-copy-network.sh b/agent/e2e/agent-tui/test-copy-network.sh
new file mode 100755
index 000000000..f37fcf5e3
--- /dev/null
+++ b/agent/e2e/agent-tui/test-copy-network.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+set -euxo pipefail
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../../" && pwd )"
+source "${SCRIPTDIR}/common.sh"
+
+NODE_INDEX=${1:-0}
+STATIC_IP=${2:-"192.168.111.90/24"}
+CONNECTION_NAME="copy-network-static"
+
+SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -q"
+
+# Derive the DNS hostname from MASTER_HOSTNAME_FORMAT (e.g. "master-0")
+node_hostname=$(printf "${MASTER_HOSTNAME_FORMAT}" "${NODE_INDEX}")
+
+# Get the node's current DHCP IP (assigned by virsh) to SSH into the live environment
+node_ip=$(sudo virsh net-dumpxml "${BAREMETAL_NETWORK_NAME}" | xmllint --xpath \
+    "string(//dns[*]/host/hostname[. = '${node_hostname}']/../@ip)" -)
+
+if [ -z "$node_ip" ]; then
+    echo "ERROR: Could not resolve IP for ${node_hostname} on network ${BAREMETAL_NETWORK_NAME}"
+    exit 1
+fi
+
+# Wait (bounded) for the live ISO to come up and accept SSH, so the keyfile is
+# in place before agent-set-host-copy-network-arg.service evaluates it.
+echo "Waiting for live environment SSH on ${node_hostname} (${node_ip})..."
+ssh_attempts=0
+until ssh $SSH_OPTS "core@${node_ip}" true 2>/dev/null; do
+    ssh_attempts=$((ssh_attempts + 1))
+    if [ "${ssh_attempts}" -ge 60 ]; then
+        echo "ERROR: Timed out waiting for SSH on ${node_hostname} (${node_ip})"
+        exit 1
+    fi
+    sleep 10
+done
+
+echo "SSH available on ${node_hostname}, injecting static network keyfile"
+
+# Determine the MAC address of the interface that has the current DHCP IP,
+# and the default gateway and DNS from the live environment.
+iface_mac=$(ssh $SSH_OPTS "core@${node_ip}" \
+    "ip -j addr show | jq -r '.[] | select(.addr_info[]? | .local == \"${node_ip}\") | .address' | head -n1")
+gateway=$(ssh $SSH_OPTS "core@${node_ip}" \
+    "ip route show default | awk '/default/ {print \$3; exit}'")
+dns=$(ssh $SSH_OPTS "core@${node_ip}" \
+    "awk '/^nameserver/ {print \$2; exit}' /etc/resolv.conf")
+
+echo "Interface MAC: ${iface_mac}, Gateway: ${gateway}, DNS: ${dns}, Static IP: ${STATIC_IP}"
+
+if [ -z "$iface_mac" ] || [ -z "$gateway" ] || [ -z "$dns" ]; then
+    echo "ERROR: Could not determine interface MAC, gateway or DNS on ${node_hostname}"
+    exit 1
+fi
+
+# Write a static NetworkManager keyfile using a static IP distinct from the DHCP
+# address, so the installed OS can be verified to be using the static config.
+# Bound to the primary interface by MAC address and uses autoconnect-priority=1
+# to take precedence over auto-generated DHCP connections (priority -100).
+ssh $SSH_OPTS "core@${node_ip}" \
+    "sudo bash -c 'umask 177; cat > /etc/NetworkManager/system-connections/${CONNECTION_NAME}.nmconnection'" << EOF
+[connection]
+id=${CONNECTION_NAME}
+type=ethernet
+autoconnect=true
+autoconnect-priority=1
+
+[ethernet]
+mac-address=${iface_mac}
+
+[ipv4]
+address1=${STATIC_IP},${gateway}
+dns=${dns};
+method=manual
+
+[ipv6]
+method=disabled
+
+[proxy]
+EOF
+
+echo "Injected static keyfile '${CONNECTION_NAME}.nmconnection' on ${node_hostname}"
+echo "  Static IP: ${STATIC_IP}, Gateway: ${gateway}, DNS: ${dns}"
diff --git a/agent/e2e/agent-tui/validate-copy-network.sh b/agent/e2e/agent-tui/validate-copy-network.sh
new file mode 100755
index 000000000..16fb1f92c
--- /dev/null
+++ b/agent/e2e/agent-tui/validate-copy-network.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+set -euxo pipefail
+
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../../" && pwd )"
+source "${SCRIPTDIR}/common.sh"
+source "${SCRIPTDIR}/agent/common.sh"
+source "${SCRIPTDIR}/network.sh"
+
+CONNECTION_NAME="copy-network-static"
+
+SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -q"
+
+# master-0 keeps its DHCP IP as the static IP (to preserve etcd membership),
+# so we reach it via its virsh-assigned IP.
+# master-1 uses a distinct static IP outside the DHCP range, so we SSH to that
+# IP directly to prove the installed OS is using the static config.
+master0_hostname=$(printf "${MASTER_HOSTNAME_FORMAT}" 0)
+master0_ip=$(sudo virsh net-dumpxml "${BAREMETAL_NETWORK_NAME}" | xmllint --xpath \
+    "string(//dns[*]/host/hostname[. = '${master0_hostname}']/../@ip)" -)
+# Recompute master-1's static IP with the same scan used at injection time
+# (06_agent_create_cluster.sh): first offset in 90-254 with no virsh DNS entry.
+master1_ip=""
+for offset in $(seq 90 254); do
+    candidate=$(nth_ip "${EXTERNAL_SUBNET_V4}" "${offset}")
+    if ! sudo virsh net-dumpxml "${BAREMETAL_NETWORK_NAME}" | xmllint --xpath \
+        "//dns[*]/host[@ip = '${candidate}']" - &>/dev/null; then
+        master1_ip=${candidate}
+        break
+    fi
+done
+if [ -z "${master1_ip}" ]; then
+    echo "ERROR: could not find the static IP for master-1 in ${EXTERNAL_SUBNET_V4}"
+    exit 1
+fi
+
+declare -A NODE_IPS=([0]="${master0_ip}" [1]="${master1_ip}")
+
+failed=0
+for node_index in 0 1; do
+    node_hostname=$(printf "${MASTER_HOSTNAME_FORMAT}" "${node_index}")
+    node_ip=${NODE_IPS[$node_index]}
+
+    echo "Checking ${node_hostname} at ${node_ip} for connection '${CONNECTION_NAME}'..."
+
+    if ! ssh $SSH_OPTS "core@${node_ip}" true 2>/dev/null; then
+        echo "FAIL: Cannot SSH to ${node_hostname} at ${node_ip}"
+        failed=1
+        continue
+    fi
+
+    # Verify the NetworkManager keyfile exists in the installed OS
+    if ! ssh $SSH_OPTS "core@${node_ip}" \
+        "sudo test -f '/etc/NetworkManager/system-connections/${CONNECTION_NAME}.nmconnection'"; then
+        echo "FAIL: Connection keyfile '${CONNECTION_NAME}' not found on ${node_hostname}"
+        failed=1
+        continue
+    fi
+
+    # Verify nmcli reports the connection with static method - this is the proof
+    # that --copy-network copied the user-created keyfile to the installed OS
+    if ! ssh $SSH_OPTS "core@${node_ip}" \
+        "sudo nmcli -f ipv4.method connection show '${CONNECTION_NAME}' | grep -q 'manual'"; then
+        echo "FAIL: Connection '${CONNECTION_NAME}' does not have static IPv4 method on ${node_hostname}"
+        failed=1
+        continue
+    fi
+
+    echo "PASS: ${node_hostname} has connection '${CONNECTION_NAME}' with method=manual"
+done
+
+if [ $failed -ne 0 ]; then
+    echo "FAIL: Network config persistence validation failed on one or more nodes"
+    exit 1
+fi
+
+echo "PASS: Static network config persisted after installation on all nodes"
diff --git a/config_example.sh b/config_example.sh
index 0ef83949b..35df25601 100755
--- a/config_example.sh
+++ b/config_example.sh
@@ -924,7 +924,19 @@ set -x
 # then the wait-for commands should timeout and fail.
 # This test case is only supported when IP_STACK=v4.
 #
+# 2. 'copy_network' test case:
+# Validates that static network connections created manually via the agent-tui during boot
+# persist into the installed OS (OCPBUGS-63472). Use a DHCP scenario so that no static
+# networking is pre-configured in the manifests. During boot, the test injects a static
+# connection keyfile via SSH (simulating the agent-tui flow) on master_0 and master_1. After
+# installation, the post-install validation checks that the connection keyfile and nmcli
+# entry are present on both nodes, confirming that --copy-network was set per-host by
+# agent-set-host-copy-network-arg.service and coreos-installer copied the keyfile into the OS.
+# Requires: AGENT_E2E_TEST_SCENARIO=COMPACT_IPV4_DHCP (or any HA_IPV4_DHCP), IP_STACK=v4.
+# Not supported with SNO topology (requires at least 2 master nodes) or IPv6.
+#
 # export AGENT_TEST_CASES='bad_dns'
+# export AGENT_TEST_CASES='copy_network'
 
 # Uncomment the following line to deploy the cluster using the appliance model
 # The appliance model boots the host using the unconfigured ignition. It then mounts