Skip to content

Commit 98eff7d

Browse files
authored
Collect coredumps on all nodes (#69344)
Configure all nodes to save coredumps, and collect any saved coredumps during the gather-core-dump step.
1 parent 4078231 commit 98eff7d

6 files changed

Lines changed: 151 additions & 0 deletions

File tree

ci-operator/config/openshift/release/openshift-release-master__ci-4.20-upgrade-from-stable-4.19.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ tests:
9797
observers:
9898
enable:
9999
- observers-resource-watch
100+
post:
101+
- chain: gather-core-dump
102+
- chain: ipi-azure-post
103+
pre:
104+
- ref: enable-node-coredumps
105+
- chain: ipi-azure-pre-stableinitial
100106
workflow: openshift-upgrade-azure-ovn
101107
timeout: 5h30m0s
102108
- as: e2e-aws-ovn-uwm

ci-operator/config/openshift/router/openshift-router-master.yaml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ tests:
6161
skip_if_only_changed: ^docs/|\.md$|^(?:.*/)?(?:\.gitignore|OWNERS|OWNERS_ALIASES|PROJECT|LICENSE)$
6262
steps:
6363
cluster_profile: gcp-3
64+
post:
65+
- chain: gather-core-dump
66+
- chain: ipi-gcp-post
67+
pre:
68+
- ref: enable-node-coredumps
69+
- chain: ipi-gcp-pre
6470
workflow: openshift-e2e-gcp
6571
- always_run: false
6672
as: e2e-metal-ipi-ovn-ipv6
@@ -72,6 +78,12 @@ tests:
7278
DEVSCRIPTS_CONFIG: |
7379
IP_STACK=v6
7480
NETWORK_TYPE=OVNKubernetes
81+
post:
82+
- chain: gather-core-dump
83+
- chain: baremetalds-ofcir-post
84+
pre:
85+
- ref: enable-node-coredumps
86+
- chain: baremetalds-ofcir-pre
7587
workflow: baremetalds-e2e
7688
- always_run: false
7789
as: e2e-metal-ipi-ovn-dualstack
@@ -83,6 +95,12 @@ tests:
8395
DEVSCRIPTS_CONFIG: |
8496
IP_STACK=v4v6
8597
NETWORK_TYPE=OVNKubernetes
98+
post:
99+
- chain: gather-core-dump
100+
- chain: baremetalds-ofcir-post
101+
pre:
102+
- ref: enable-node-coredumps
103+
- chain: baremetalds-ofcir-pre
86104
workflow: baremetalds-e2e
87105
- as: e2e-aws-serial
88106
shard_count: 2
@@ -101,6 +119,12 @@ tests:
101119
DEVSCRIPTS_CONFIG: |
102120
IP_STACK=v4v6
103121
NETWORK_TYPE=OVNKubernetes
122+
post:
123+
- chain: gather-core-dump
124+
- chain: baremetalds-ofcir-post
125+
pre:
126+
- ref: enable-node-coredumps
127+
- chain: baremetalds-ofcir-pre
104128
test:
105129
- as: baremetalds-e2e-conf-router
106130
commands: |
@@ -149,6 +173,12 @@ tests:
149173
skip_if_only_changed: ^docs/|\.md$|^(?:.*/)?(?:\.gitignore|OWNERS|OWNERS_ALIASES|PROJECT|LICENSE)$
150174
steps:
151175
cluster_profile: azure4
176+
post:
177+
- chain: gather-core-dump
178+
- chain: ipi-azure-post
179+
pre:
180+
- ref: enable-node-coredumps
181+
- chain: ipi-azure-pre
152182
workflow: openshift-upgrade-azure
153183
- always_run: false
154184
as: perfscale-aws-ingress-perf
@@ -163,8 +193,10 @@ tests:
163193
OPENSHIFT_INFRA_NODE_INSTANCE_TYPE: c5.4xlarge
164194
SET_ENV_BY_PLATFORM: custom
165195
post:
196+
- chain: gather-core-dump
166197
- chain: ipi-aws-post
167198
pre:
199+
- ref: enable-node-coredumps
168200
- chain: ipi-aws-pre
169201
- chain: create-infra-move-ingress-monitoring-registry
170202
test:
@@ -184,8 +216,10 @@ tests:
184216
OPENSHIFT_INFRA_NODE_INSTANCE_TYPE: c5.4xlarge
185217
SET_ENV_BY_PLATFORM: custom
186218
post:
219+
- chain: gather-core-dump
187220
- chain: ipi-aws-post
188221
pre:
222+
- ref: enable-node-coredumps
189223
- chain: ipi-aws-pre
190224
- ref: fips-check
191225
- chain: create-infra-move-ingress-monitoring-registry
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
approvers:
2+
- knobunc
3+
- Miciah
4+
- candita
5+
- rfredette
6+
- alebedev87
7+
- gcs278
8+
- Thealisyed
9+
- grzpiotrowski
10+
- rikatz
11+
- bentito
12+
options: {}
13+
reviewers:
14+
- knobunc
15+
- Miciah
16+
- candita
17+
- rfredette
18+
- alebedev87
19+
- gcs278
20+
- Thealisyed
21+
- grzpiotrowski
22+
- rikatz
23+
- bentito
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
set -o errexit
set -o nounset
set -o pipefail

# Create a machine config that installs a systemd unit on nodes. The systemd unit configures the nodes to save any
# coredumps that are generated, which will be collected by the gather-core-dump chain.
#
# Environment:
#   SHARED_DIR - directory the manifests are written to (required; nounset aborts the script if it is missing).
# Outputs:
#   ${SHARED_DIR}/manifest_enable_node_coredumps_machineconfig_<role>.yaml for each role, echoed to stdout as well.

echo "Creating manifests to enable coredump collection on nodes"

for role in master worker; do
  manifest="${SHARED_DIR}/manifest_enable_node_coredumps_machineconfig_${role}.yaml"

  # The heredoc delimiter is intentionally unquoted so that the role name is expanded inside the manifest.
  cat > "${manifest}" <<EOF
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  labels:
    machineconfiguration.openshift.io/role: $role
  name: enable-node-coredumps-${role}
spec:
  config:
    ignition:
      version: 3.2.0
    systemd:
      units:
      - contents: |
          [Unit]
          After=multi-user.target

          [Service]
          Type=oneshot
          ExecStart=sysctl -w fs.suid_dumpable=1

          [Install]
          WantedBy=multi-user.target
        enabled: true
        name: enable-node-coredumps.service
EOF

  echo "manifest_enable_node_coredumps_machineconfig_${role}.yaml"
  echo "---------------------------------------------"
  # Quoted so that a SHARED_DIR containing whitespace or glob characters is not word-split.
  cat "${manifest}"
done
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"path": "enable-node-coredumps/enable-node-coredumps-ref.yaml",
3+
"owners": {
4+
"approvers": [
5+
"knobunc",
6+
"Miciah",
7+
"candita",
8+
"rfredette",
9+
"alebedev87",
10+
"gcs278",
11+
"Thealisyed",
12+
"grzpiotrowski",
13+
"rikatz",
14+
"bentito"
15+
],
16+
"reviewers": [
17+
"knobunc",
18+
"Miciah",
19+
"candita",
20+
"rfredette",
21+
"alebedev87",
22+
"gcs278",
23+
"Thealisyed",
24+
"grzpiotrowski",
25+
"rikatz",
26+
"bentito"
27+
]
28+
}
29+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
ref:
2+
as: enable-node-coredumps
3+
from_image:
4+
namespace: origin
5+
name: centos
6+
tag: '8'
7+
commands: enable-node-coredumps-commands.sh
8+
resources:
9+
requests:
10+
cpu: 10m
11+
memory: 100Mi
12+
documentation: |-
13+
The coredump service configures nodes to save all generated coredumps. This is useful for debugging failures of
14+
components running in privileged pods, such as router pods.
15+
The service is deployed by injecting an installer manifest containing a MachineConfig. This contains one systemd
16+
unit, which sets the sysctl variable fs.suid_dumpable to 1, instructing systemd-coredump to save any coredumps it
17+
encounters into /var/lib/systemd/coredump/ . Those coredump files can then be collected using the gather-core-dump
18+
chain.

0 commit comments

Comments
 (0)