Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 69 additions & 17 deletions test/extended/edge_topologies/tnf_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,37 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
// Requires resource-agents >= 4.10.0-71.el9_6.13 (RHEL 9) or >= 4.16.0-33.el10 (RHEL 10).
survivedNode := peerNode

// Set up two-hop SSH (local → hypervisor → node) for post-panic verification.
// After kernel panic the Kubernetes API is unstable for minutes, making oc debug
// unreliable. SSH via the hypervisor bypasses the API entirely.
if !exutil.HasHypervisorConfig() {
g.Skip("Hypervisor SSH config required for kernel panic verification")
}
sshCfg := exutil.GetHypervisorConfig()
o.Expect(sshCfg).NotTo(o.BeNil(), "Failed to parse hypervisor config")
o.Expect(sshCfg.HypervisorIP).NotTo(o.BeEmpty(), "Hypervisor IP is empty")
o.Expect(sshCfg.SSHUser).NotTo(o.BeEmpty(), "Hypervisor SSH user is empty")
o.Expect(sshCfg.PrivateKeyPath).NotTo(o.BeEmpty(), "Hypervisor private key path is empty")
_, err := os.Stat(sshCfg.PrivateKeyPath)
o.Expect(err).NotTo(o.HaveOccurred(), "Hypervisor private key not readable at %s", sshCfg.PrivateKeyPath)
hypervisorConfig := core.SSHConfig{
IP: sshCfg.HypervisorIP,
User: sshCfg.SSHUser,
PrivateKeyPath: sshCfg.PrivateKeyPath,
}
localKH, err := core.PrepareLocalKnownHostsFile(&hypervisorConfig)
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare local known hosts")
Comment thread
coderabbitai[bot] marked this conversation as resolved.

survivedNodeIP := utils.GetNodeInternalIP(&survivedNode)
o.Expect(survivedNodeIP).NotTo(o.BeEmpty(), "survived node has no internal IP")
targetNodeIP := utils.GetNodeInternalIP(&targetNode)
o.Expect(targetNodeIP).NotTo(o.BeEmpty(), "target node has no internal IP")

survivedRemoteKH, err := core.PrepareRemoteKnownHostsFile(survivedNodeIP, &hypervisorConfig, localKH)
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for survived node")
targetRemoteKH, err := core.PrepareRemoteKnownHostsFile(targetNodeIP, &hypervisorConfig, localKH)
o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for target node")

g.By("Logging resource-agents RPM version")
raVersion, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
"bash", "-c", "rpm -q resource-agents")
Expand Down Expand Up @@ -462,8 +493,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
g.By("Reading bump-amount from journal log on survived node")
var journalBump int
o.Eventually(func() error {
journalOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
"bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp))
journalOutput, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp),
&hypervisorConfig, localKH, survivedRemoteKH)
if err != nil {
Comment on lines +496 to 499
Copy link
Copy Markdown

@coderabbitai coderabbitai Bot May 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "== SSH helper timeout-related code =="
rg -n -C4 'func ExecuteSSHCommand|func ExecuteRemoteSSHCommand|ConnectTimeout|BatchMode|ServerAliveInterval|ServerAliveCountMax|CommandContext|WithTimeout|context\.' test/extended/edge_topologies/utils/core/ssh.go

echo
echo "== SSH-based recovery call sites =="
sed -n '493,594p' test/extended/edge_topologies/tnf_recovery.go | awk '{print NR+492 ":" $0}'

Repository: openshift/origin

Length of output: 6907


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "== ssh.go: ExecuteSSHCommand (key options) =="
sed -n '120,240p' test/extended/edge_topologies/utils/core/ssh.go

echo
echo "== ssh.go: ExecuteRemoteSSHCommand (nestedSSHCommand) =="
sed -n '180,260p' test/extended/edge_topologies/utils/core/ssh.go

echo
echo "== ssh.go: timeout/cancel related usage =="
rg -n 'ConnectTimeout|BatchMode|ServerAliveInterval|ServerAliveCountMax|CommandContext|WithTimeout|context\.' test/extended/edge_topologies/utils/core/ssh.go

Repository: openshift/origin

Length of output: 8508


Bound each two-hop SSH attempt.

test/extended/edge_topologies/utils/core/ssh.go’s ExecuteRemoteSSHCommand just builds a nested ssh and calls ExecuteSSHCommand, which runs exec.Command("ssh", ...) with no ConnectTimeout/BatchMode/keepalive options and no CommandContext/deadline. A hung (half-booted) node can therefore block the Eventually callback past the assertion’s retry window instead of returning and retrying.

Make each SSH invocation time-bounded (e.g., switch to exec.CommandContext with a per-attempt timeout, and/or add ssh options like -o BatchMode=yes and -o ConnectTimeout=<...> in the helper).

Also applies to the core.ExecuteRemoteSSHCommand call sites at: 517-519, 533-535, 558-570, 573-585, 589-591.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@test/extended/edge_topologies/tnf_recovery.go` around lines 496 - 499, The
SSH helpers (ExecuteRemoteSSHCommand and ExecuteSSHCommand) currently spawn
exec.Command("ssh", ...) with no per-attempt deadline, which can hang the test;
update these helpers to time-bound every two-hop invocation by using
exec.CommandContext with a context created via context.WithTimeout for each call
and/or append safe SSH options such as "-o BatchMode=yes" and "-o
ConnectTimeout=<seconds>" (and optionally "-o ServerAliveInterval=<sec>") to the
constructed ssh arguments so nested SSH calls also time out; ensure every call
site that builds a nested ssh command (including ExecuteRemoteSSHCommand) uses
the new context-aware invocation so a hung node returns an error quickly and
allows the Eventually retry loop to proceed.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The SSH calls are already inside Eventually loops with 5-20 minute timeouts and 5-second poll intervals. If an SSH call hangs, the Eventually will time out and fail the
test. Adding CommandContext or ConnectTimeout to the shared SSH helpers would change behavior for all callers across the test suite (cold-boot tests, node replacement tests, etc.) — that's
a much larger blast radius than the kernel panic fix we're making. The finding is valid in principle but it's not causing the current failures and shouldn't be mixed into this PR.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Resolve the CodeRabbit review comment on this PR
gh api \
  --method PUT \
  -H "Accept: application/vnd.github+json" \
  "repos/openshift/origin/pulls/31216/reviews" 2>/dev/null || true

# Find and resolve the specific review comment (ID from the URL: r3316838694)
gh api \
  --method PATCH \
  -H "Accept: application/vnd.github+json" \
  "repos/openshift/origin/pulls/comments/3316838694/replies" 2>/dev/null || true

# List review threads to find the right one to resolve
gh api \
  --method GET \
  -H "Accept: application/vnd.github.v3+json" \
  "repos/openshift/origin/pulls/31216/comments" \
  --jq '.[] | select(.user.login == "coderabbitai[bot]") | {id: .id, body: .body[0:80]}'

Length of output: 1595


🏁 Script executed:

#!/bin/bash
# Use GraphQL to find the review thread and resolve it
# First, get the pull request node ID and review threads
gh api graphql -f query='
{
  repository(owner: "openshift", name: "origin") {
    pullRequest(number: 31216) {
      id
      reviewThreads(first: 50) {
        nodes {
          id
          isResolved
          comments(first: 1) {
            nodes {
              databaseId
              author { login }
            }
          }
        }
      }
    }
  }
}' --jq '.data.repository.pullRequest.reviewThreads.nodes[] | select(.comments.nodes[0].databaseId == 3316838694) | {threadId: .id, isResolved: .isResolved}'

Length of output: 747


🏁 Script executed:

#!/bin/bash
# Resolve the review thread
gh api graphql -f query='
mutation {
  resolveReviewThread(input: {threadId: "PRRT_kwDOAVZyrM6FWyxq"}) {
    thread {
      id
      isResolved
    }
  }
}'

Length of output: 705



@kasturinarra, understood — the reasoning is solid and I agree this belongs in a separate PR. I wasn't able to auto-resolve the thread due to integration permissions; you can resolve it manually.

Would you like me to open a follow-up GitHub issue to track adding per-attempt ConnectTimeout/CommandContext to the shared SSH helpers (ExecuteSSHCommand / ExecuteRemoteSSHCommand) so it doesn't get lost?


🧠 Learnings used
Learnt from: Chandan9112
Repo: openshift/origin PR: 31182
File: test/extended/node/node_e2e/node.go:0-0
Timestamp: 2026-05-19T12:28:55.867Z
Learning: When reviewing Go code that uses OpenShift configv1 (`github.com/openshift/api/config/v1`), note that `ImageSpec.RegistrySources` is declared as a non-pointer value (`RegistrySources RegistrySources`). It is always safely zero-initialized by Go, so it can never be `nil`. Therefore, do not add nil-guards for `imageSpec.RegistrySources` before accessing its fields (e.g., `AllowedRegistries`, `ContainerRuntimeSearchRegistries`).

Learnt from: Chandan9112
Repo: openshift/origin PR: 31182
File: test/extended/node/node_e2e/node.go:0-0
Timestamp: 2026-05-19T12:28:55.867Z
Learning: When using the OpenShift `configv1` API (`github.com/openshift/api/config/v1`), treat `ImageSpec.RegistrySources` as a non-nil Go struct value (`RegistrySources`, not `*RegistrySources`). Because it can never be nil (it’s always zero-initialized), don’t add nil-guards before accessing its fields (e.g., `AllowedRegistries`, `ContainerRuntimeSearchRegistries`). You may still need to handle zero-value contents, but a nil check on `RegistrySources` itself is unnecessary.

return fmt.Errorf("failed to read journal: %v", err)
}
Expand All @@ -482,8 +514,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
g.By("Verifying force-new-cluster-bump-amount in config.yaml matches journal bump-amount")
var configBump int
o.Eventually(func() error {
bumpAmountStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
"bash", "-c", "grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'")
bumpAmountStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
"sudo grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'",
&hypervisorConfig, localKH, survivedRemoteKH)
Comment thread
kasturinarra marked this conversation as resolved.
if err != nil {
return fmt.Errorf("failed to read bump amount: %v", err)
}
Expand All @@ -497,8 +530,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
fmt.Sprintf("config.yaml bump-amount %d should match journal bump-amount %d", configBump, journalBump))

g.By("Independently verifying bump amount is approximately floor(maxRaftIndex * 0.2)")
raftIndexStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd",
"bash", "-c", "jq -r '.maxRaftIndex' /var/lib/etcd/revision.json")
raftIndexStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP,
"sudo jq -r '.maxRaftIndex' /var/lib/etcd/revision.json",
&hypervisorConfig, localKH, survivedRemoteKH)
o.Expect(err).To(o.BeNil())
maxRaftIndex, err := strconv.Atoi(strings.TrimSpace(raftIndexStr))
o.Expect(err).To(o.BeNil())
Expand All @@ -521,22 +555,40 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
memberPromotedVotingTimeout, utils.FiveSecondPollInterval)

g.By(fmt.Sprintf("Verifying etcd container is running on %s", targetNode.Name))
got, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
strings.Split(ensurePodmanEtcdContainerIsRunning, " ")...)
o.Expect(err).To(o.BeNil())
o.Expect(got).To(o.Equal("'true'"), fmt.Sprintf("expected etcd container running on %s", targetNode.Name))
o.Eventually(func() error {
got, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
"sudo "+ensurePodmanEtcdContainerIsRunning,
&hypervisorConfig, localKH, targetRemoteKH)
if err != nil {
return fmt.Errorf("failed to inspect etcd container: %v", err)
}
if strings.TrimSpace(got) != "true" {
return fmt.Errorf("etcd container not running on %s: got %s", targetNode.Name, got)
}
return nil
}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
fmt.Sprintf("expected etcd container running on %s", targetNode.Name))

g.By(fmt.Sprintf("Verifying etcd-previous container exists on %s", targetNode.Name))
prevOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
"bash", "-c", "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous")
o.Expect(err).To(o.BeNil(), fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name))
o.Expect(strings.TrimSpace(prevOutput)).To(o.Equal("etcd-previous"),
fmt.Sprintf("expected etcd-previous container on %s", targetNode.Name))
o.Eventually(func() error {
prevOutput, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
"sudo podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous",
&hypervisorConfig, localKH, targetRemoteKH)
if err != nil {
return fmt.Errorf("etcd-previous container not found on %s: %v", targetNode.Name, err)
}
if strings.TrimSpace(prevOutput) != "etcd-previous" {
return fmt.Errorf("expected etcd-previous container on %s, got %q", targetNode.Name, prevOutput)
}
return nil
}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name))
Comment thread
kasturinarra marked this conversation as resolved.

g.By(fmt.Sprintf("Verifying pod.yaml was recreated on %s via pacemaker log", targetNode.Name))
o.Eventually(func() error {
_, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
"bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp))
_, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP,
fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp),
&hypervisorConfig, localKH, targetRemoteKH)
return err
}, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(),
"Expected pacemaker log to contain pod.yaml recreation entry after reboot")
Expand Down