-
Notifications
You must be signed in to change notification settings - Fork 4.8k
Update ocdebug to ssh #31216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Update ocdebug to ssh #31216
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -421,6 +421,37 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual | |
| // Requires resource-agents >= 4.10.0-71.el9_6.13 (RHEL 9) or >= 4.16.0-33.el10 (RHEL 10). | ||
| survivedNode := peerNode | ||
|
|
||
| // Set up two-hop SSH (local → hypervisor → node) for post-panic verification. | ||
| // After kernel panic the Kubernetes API is unstable for minutes, making oc debug | ||
| // unreliable. SSH via the hypervisor bypasses the API entirely. | ||
| if !exutil.HasHypervisorConfig() { | ||
| g.Skip("Hypervisor SSH config required for kernel panic verification") | ||
| } | ||
| sshCfg := exutil.GetHypervisorConfig() | ||
| o.Expect(sshCfg).NotTo(o.BeNil(), "Failed to parse hypervisor config") | ||
| o.Expect(sshCfg.HypervisorIP).NotTo(o.BeEmpty(), "Hypervisor IP is empty") | ||
| o.Expect(sshCfg.SSHUser).NotTo(o.BeEmpty(), "Hypervisor SSH user is empty") | ||
| o.Expect(sshCfg.PrivateKeyPath).NotTo(o.BeEmpty(), "Hypervisor private key path is empty") | ||
| _, err := os.Stat(sshCfg.PrivateKeyPath) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Hypervisor private key not readable at %s", sshCfg.PrivateKeyPath) | ||
| hypervisorConfig := core.SSHConfig{ | ||
| IP: sshCfg.HypervisorIP, | ||
| User: sshCfg.SSHUser, | ||
| PrivateKeyPath: sshCfg.PrivateKeyPath, | ||
| } | ||
| localKH, err := core.PrepareLocalKnownHostsFile(&hypervisorConfig) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare local known hosts") | ||
|
|
||
| survivedNodeIP := utils.GetNodeInternalIP(&survivedNode) | ||
| o.Expect(survivedNodeIP).NotTo(o.BeEmpty(), "survived node has no internal IP") | ||
| targetNodeIP := utils.GetNodeInternalIP(&targetNode) | ||
| o.Expect(targetNodeIP).NotTo(o.BeEmpty(), "target node has no internal IP") | ||
|
|
||
| survivedRemoteKH, err := core.PrepareRemoteKnownHostsFile(survivedNodeIP, &hypervisorConfig, localKH) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for survived node") | ||
| targetRemoteKH, err := core.PrepareRemoteKnownHostsFile(targetNodeIP, &hypervisorConfig, localKH) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Failed to prepare remote known hosts for target node") | ||
|
|
||
| g.By("Logging resource-agents RPM version") | ||
| raVersion, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd", | ||
| "bash", "-c", "rpm -q resource-agents") | ||
|
|
@@ -462,8 +493,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual | |
| g.By("Reading bump-amount from journal log on survived node") | ||
| var journalBump int | ||
| o.Eventually(func() error { | ||
| journalOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd", | ||
| "bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp)) | ||
| journalOutput, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP, | ||
| fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' | grep 'bump-amount' | tail -1", crashTimestamp), | ||
| &hypervisorConfig, localKH, survivedRemoteKH) | ||
| if err != nil { | ||
|
Comment on lines
+496
to
499
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
#!/bin/bash
set -euo pipefail
echo "== SSH helper timeout-related code =="
rg -n -C4 'func ExecuteSSHCommand|func ExecuteRemoteSSHCommand|ConnectTimeout|BatchMode|ServerAliveInterval|ServerAliveCountMax|CommandContext|WithTimeout|context\.' test/extended/edge_topologies/utils/core/ssh.go
echo
echo "== SSH-based recovery call sites =="
sed -n '493,594p' test/extended/edge_topologies/tnf_recovery.go | awk '{print NR+492 ":" $0}'
Repository: openshift/origin
Length of output: 6907
🏁 Script executed:
#!/bin/bash
set -euo pipefail
echo "== ssh.go: ExecuteSSHCommand (key options) =="
sed -n '120,240p' test/extended/edge_topologies/utils/core/ssh.go
echo
echo "== ssh.go: ExecuteRemoteSSHCommand (nestedSSHCommand) =="
sed -n '180,260p' test/extended/edge_topologies/utils/core/ssh.go
echo
echo "== ssh.go: timeout/cancel related usage =="
rg -n 'ConnectTimeout|BatchMode|ServerAliveInterval|ServerAliveCountMax|CommandContext|WithTimeout|context\.' test/extended/edge_topologies/utils/core/ssh.go
Repository: openshift/origin
Length of output: 8508
Bound each two-hop SSH attempt.
Make each SSH invocation time-bounded (e.g., switch to `exec.CommandContext` with a per-attempt timeout, or pass SSH options such as `ConnectTimeout`). Also applies to the other SSH-based checks added in this change.
🤖 Prompt for AI Agents
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The SSH calls are already inside Eventually loops with 5-20 minute timeouts and 5-second poll intervals. If an SSH call hangs, the Eventually will time out and fail the test.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
#!/bin/bash
# Resolve the CodeRabbit review comment on this PR
gh api \
--method PUT \
-H "Accept: application/vnd.github+json" \
"repos/openshift/origin/pulls/31216/reviews" 2>/dev/null || true
# Find and resolve the specific review comment (ID from the URL: r3316838694)
gh api \
--method PATCH \
-H "Accept: application/vnd.github+json" \
"repos/openshift/origin/pulls/comments/3316838694/replies" 2>/dev/null || true
# List review threads to find the right one to resolve
gh api \
--method GET \
-H "Accept: application/vnd.github.v3+json" \
"repos/openshift/origin/pulls/31216/comments" \
--jq '.[] | select(.user.login == "coderabbitai[bot]") | {id: .id, body: .body[0:80]}'Length of output: 1595 🏁 Script executed: #!/bin/bash
# Use GraphQL to find the review thread and resolve it
# First, get the pull request node ID and review threads
gh api graphql -f query='
{
repository(owner: "openshift", name: "origin") {
pullRequest(number: 31216) {
id
reviewThreads(first: 50) {
nodes {
id
isResolved
comments(first: 1) {
nodes {
databaseId
author { login }
}
}
}
}
}
}
}' --jq '.data.repository.pullRequest.reviewThreads.nodes[] | select(.comments.nodes[0].databaseId == 3316838694) | {threadId: .id, isResolved: .isResolved}'Length of output: 747 🏁 Script executed: #!/bin/bash
# Resolve the review thread
gh api graphql -f query='
mutation {
resolveReviewThread(input: {threadId: "PRRT_kwDOAVZyrM6FWyxq"}) {
thread {
id
isResolved
}
}
}'Length of output: 705
Would you like me to open a follow-up GitHub issue to track adding per-attempt SSH timeouts? 🧠 Learnings used |
||
| return fmt.Errorf("failed to read journal: %v", err) | ||
| } | ||
|
|
@@ -482,8 +514,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual | |
| g.By("Verifying force-new-cluster-bump-amount in config.yaml matches journal bump-amount") | ||
| var configBump int | ||
| o.Eventually(func() error { | ||
| bumpAmountStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd", | ||
| "bash", "-c", "grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'") | ||
| bumpAmountStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP, | ||
| "sudo grep 'force-new-cluster-bump-amount:' /var/lib/etcd/config.yaml | awk '{print $2}'", | ||
| &hypervisorConfig, localKH, survivedRemoteKH) | ||
|
kasturinarra marked this conversation as resolved.
|
||
| if err != nil { | ||
| return fmt.Errorf("failed to read bump amount: %v", err) | ||
| } | ||
|
|
@@ -497,8 +530,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual | |
| fmt.Sprintf("config.yaml bump-amount %d should match journal bump-amount %d", configBump, journalBump)) | ||
|
|
||
| g.By("Independently verifying bump amount is approximately floor(maxRaftIndex * 0.2)") | ||
| raftIndexStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, survivedNode.Name, "openshift-etcd", | ||
| "bash", "-c", "jq -r '.maxRaftIndex' /var/lib/etcd/revision.json") | ||
| raftIndexStr, _, err := core.ExecuteRemoteSSHCommand(survivedNodeIP, | ||
| "sudo jq -r '.maxRaftIndex' /var/lib/etcd/revision.json", | ||
| &hypervisorConfig, localKH, survivedRemoteKH) | ||
| o.Expect(err).To(o.BeNil()) | ||
| maxRaftIndex, err := strconv.Atoi(strings.TrimSpace(raftIndexStr)) | ||
| o.Expect(err).To(o.BeNil()) | ||
|
|
@@ -521,22 +555,40 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual | |
| memberPromotedVotingTimeout, utils.FiveSecondPollInterval) | ||
|
|
||
| g.By(fmt.Sprintf("Verifying etcd container is running on %s", targetNode.Name)) | ||
| got, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd", | ||
| strings.Split(ensurePodmanEtcdContainerIsRunning, " ")...) | ||
| o.Expect(err).To(o.BeNil()) | ||
| o.Expect(got).To(o.Equal("'true'"), fmt.Sprintf("expected etcd container running on %s", targetNode.Name)) | ||
| o.Eventually(func() error { | ||
| got, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP, | ||
| "sudo "+ensurePodmanEtcdContainerIsRunning, | ||
| &hypervisorConfig, localKH, targetRemoteKH) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to inspect etcd container: %v", err) | ||
| } | ||
| if strings.TrimSpace(got) != "true" { | ||
| return fmt.Errorf("etcd container not running on %s: got %s", targetNode.Name, got) | ||
| } | ||
| return nil | ||
| }, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), | ||
| fmt.Sprintf("expected etcd container running on %s", targetNode.Name)) | ||
|
|
||
| g.By(fmt.Sprintf("Verifying etcd-previous container exists on %s", targetNode.Name)) | ||
| prevOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd", | ||
| "bash", "-c", "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous") | ||
| o.Expect(err).To(o.BeNil(), fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name)) | ||
| o.Expect(strings.TrimSpace(prevOutput)).To(o.Equal("etcd-previous"), | ||
| fmt.Sprintf("expected etcd-previous container on %s", targetNode.Name)) | ||
| o.Eventually(func() error { | ||
| prevOutput, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP, | ||
| "sudo podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous", | ||
| &hypervisorConfig, localKH, targetRemoteKH) | ||
| if err != nil { | ||
| return fmt.Errorf("etcd-previous container not found on %s: %v", targetNode.Name, err) | ||
| } | ||
| if strings.TrimSpace(prevOutput) != "etcd-previous" { | ||
| return fmt.Errorf("expected etcd-previous container on %s, got %q", targetNode.Name, prevOutput) | ||
| } | ||
| return nil | ||
| }, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), | ||
| fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name)) | ||
|
kasturinarra marked this conversation as resolved.
|
||
|
|
||
| g.By(fmt.Sprintf("Verifying pod.yaml was recreated on %s via pacemaker log", targetNode.Name)) | ||
| o.Eventually(func() error { | ||
| _, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd", | ||
| "bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp)) | ||
| _, _, err := core.ExecuteRemoteSSHCommand(targetNodeIP, | ||
| fmt.Sprintf("sudo journalctl -u pacemaker --since '%s' --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", crashTimestamp), | ||
| &hypervisorConfig, localKH, targetRemoteKH) | ||
| return err | ||
| }, 5*time.Minute, utils.FiveSecondPollInterval).ShouldNot(o.HaveOccurred(), | ||
| "Expected pacemaker log to contain pod.yaml recreation entry after reboot") | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.