openshift · kasturinarra · May 26, 2026 · coderabbitai · May 28, 2026 · kasturinarra
diff --git a/test/extended/edge_topologies/tnf_recovery.go b/test/extended/edge_topologies/tnf_recovery.go
@@ -421,6 +421,37 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 		// Requires resource-agents >= 4.10.0-71.el9_6.13 (RHEL 9) or >= 4.16.0-33.el10 (RHEL 10).
 		survivedNode := peerNode
 
+		// Set up two-hop SSH (local → hypervisor → node) for post-panic verification.
+		// After a kernel panic the Kubernetes API can stay unstable for several minutes,
+		// making oc debug unreliable. SSH via the hypervisor bypasses the API entirely.
+		if !exutil.HasHypervisorConfig() {
+			g.Skip("Hypervisor SSH config required for kernel panic verification")
+		}
+		sshCfg := exutil.GetHypervisorConfig()
+		o.Expect(sshCfg).NotTo(o.BeNil(), "Failed to parse hypervisor config")
+		o.Expect(sshCfg.HypervisorIP).NotTo(o.BeEmpty(), "Hypervisor IP is empty")
+		o.Expect(sshCfg.SSHUser).NotTo(o.BeEmpty(), "Hypervisor SSH user is empty")
+		o.Expect(sshCfg.PrivateKeyPath).NotTo(o.BeEmpty(), "Hypervisor private key path is empty")
+		_, err := os.Stat(sshCfg.PrivateKeyPath)
+		o.Expect(err).NotTo(o.HaveOccurred(), "Hypervisor private key not readable at %s", sshCfg.PrivateKeyPath)
+		hypervisorConfig := core.SSHConfig{
+			IP:             sshCfg.HypervisorIP,
+			User:           sshCfg.SSHUser,
+			PrivateKeyPath: sshCfg.PrivateKeyPath,
+		}
+		localKH, err := core.PrepareLocalKnownHostsFile(&hypervisorConfig)
+		o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare local known hosts")
+
+		survivedNodeIP := utils.GetNodeInternalIP(&survivedNode)
+		o.Expect(survivedNodeIP).NotTo(o.BeEmpty(), "survived node has no internal IP")
+		targetNodeIP := utils.GetNodeInternalIP(&targetNode)
+		o.Expect(targetNodeIP).NotTo(o.BeEmpty(), "target node has no internal IP")
+
+		survivedRemoteKH, err := core.PrepareRemoteKnownHostsFile(survivedNodeIP, &hypervisorConfig, localKH)
+		o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for survived node")
+		targetRemoteKH, err := core.PrepareRemoteKnownHostsFile(targetNodeIP, &hypervisorConfig, localKH)
+		o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for target node")
+
 		g.By("Logging resource-agents RPM version")
 		raVersion, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
 			"bash", "-c", "rpm -q resource-agents")
@@ -462,8 +493,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 		g.By("Reading bump-amount from journal log on survived node")
 		var journalBump int
 		o.Eventually(func() error {
-			journalOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
-				"bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp))
+			journalOutput, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
+				fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp),
+				&hypervisorConfig, localKH, survivedRemoteKH)
 			if err != nil {
 				return fmt.Errorf("failed to read journal: %v", err)
 			}
@@ -482,8 +514,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 		g.By("Verifying force-new-cluster-bump-amount in config.yaml matches journal bump-amount")
 		var configBump int
 		o.Eventually(func() error {
-			bumpAmountStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
-				"bash", "-c", "grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'")
+			bumpAmountStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
+				"sudo grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'",
+				&hypervisorConfig, localKH, survivedRemoteKH)
 			if err != nil {
 				return fmt.Errorf("failed to read bump amount: %v", err)
 			}
@@ -497,8 +530,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 			fmt.Sprintf("config.yaml bump-amount %d should match journal bump-amount %d", configBump, journalBump))
 
 		g.By("Independently verifying bump amount is approximately floor(maxRaftIndex * 0.2)")
-		raftIndexStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
-			"bash", "-c", "jq -r '.maxRaftIndex' /var/lib/etcd/revision.json")
+		raftIndexStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
+			"sudo jq -r '.maxRaftIndex' /var/lib/etcd/revision.json",
+			&hypervisorConfig, localKH, survivedRemoteKH)
 		o.Expect(err).To(o.BeNil())
 		maxRaftIndex, err := strconv.Atoi(strings.TrimSpace(raftIndexStr))
 		o.Expect(err).To(o.BeNil())
@@ -521,22 +555,40 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 			memberPromotedVotingTimeout, utils.FiveSecondPollInterval)
 
 		g.By(fmt.Sprintf("Verifying etcd container is running on %s", targetNode.Name))
-		got, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
-			strings.Split(ensurePodmanEtcdContainerIsRunning, " ")...)
-		o.Expect(err).To(o.BeNil())
-		o.Expect(got).To(o.Equal("'true'"), fmt.Sprintf("expected etcd container running on %s", targetNode.Name))
+		o.Eventually(func() error {
+			got, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
+				"sudo "+ensurePodmanEtcdContainerIsRunning,
+				&hypervisorConfig, localKH, targetRemoteKH)
+			if err != nil {
+				return fmt.Errorf("failed to inspect etcd container: %v", err)
+			}
+			if strings.TrimSpace(got) != "true" {
+				return fmt.Errorf("etcd container not running on %s: got %s", targetNode.Name, got)
+			}
+			return nil
+		}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
+			fmt.Sprintf("expected etcd container running on %s", targetNode.Name))
 
 		g.By(fmt.Sprintf("Verifying etcd-previous container exists on %s", targetNode.Name))
-		prevOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
-			"bash", "-c", "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous")
-		o.Expect(err).To(o.BeNil(), fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name))
-		o.Expect(strings.TrimSpace(prevOutput)).To(o.Equal("etcd-previous"),
-			fmt.Sprintf("expected etcd-previous container on %s", targetNode.Name))
+		o.Eventually(func() error {
+			prevOutput, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
+				"sudo podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous",
+				&hypervisorConfig, localKH, targetRemoteKH)
+			if err != nil {
+				return fmt.Errorf("etcd-previous container not found on %s: %v", targetNode.Name, err)
+			}
+			if strings.TrimSpace(prevOutput) != "etcd-previous" {
+				return fmt.Errorf("expected etcd-previous container on %s, got %q", targetNode.Name, prevOutput)
+			}
+			return nil
+		}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
+			fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name))
 
 		g.By(fmt.Sprintf("Verifying pod.yaml was recreated on %s via pacemaker log", targetNode.Name))
 		o.Eventually(func() error {
-			_, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
-				"bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp))
+			_, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
+				fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp),
+				&hypervisorConfig, localKH, targetRemoteKH)
 			return err
 		}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
 			"Expected pacemaker log to contain pod.yaml recreation entry after reboot")