Skip to content

Commit 5abf58b

Browse files
ci-penbot-01, yansun1996, Copilot
authored
[Fix] GPUOP-607 fail the ANR workflow when imagePullBackOff (#1274) (#500)
* [Fix] GPUOP-607 fail the ANR workflow when imagePullBackOff
* Update internal/controllers/remediation/scripts/test.sh
* Update internal/controllers/remediation/scripts/test.sh

---------

(cherry picked from commit 344e480)

Signed-off-by: yansun1996 <Yan.Sun3@amd.com>
Co-authored-by: Yan Sun <Yan.Sun3@amd.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent ac2086b commit 5abf58b

1 file changed

Lines changed: 19 additions & 0 deletions

File tree

  • internal/controllers/remediation/scripts

internal/controllers/remediation/scripts/test.sh

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -205,6 +205,25 @@ while true; do
205205
fi
206206
isComplete=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')
207207
isFailure=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}')
208+
209+
# Check for ImagePullBackOff or ErrImagePull in pod status
210+
podStatus=$(kubectl get pods -n "$NAMESPACE" -l "job-name=$JOB_NAME" -o jsonpath='{.items[*].status.containerStatuses[*].state.waiting.reason}' 2>/dev/null || true)
211+
initContainerStatus=$(kubectl get pods -n "$NAMESPACE" -l "job-name=$JOB_NAME" -o jsonpath='{.items[*].status.initContainerStatuses[*].state.waiting.reason}' 2>/dev/null || true)
212+
213+
if [[ "$podStatus" == *"ImagePullBackOff"* ]] || [[ "$podStatus" == *"ErrImagePull"* ]]; then
214+
echo "Error: Image pull failure detected in container."
215+
kubectl describe pods -n "$NAMESPACE" -l "job-name=$JOB_NAME" || true
216+
kubectl logs -n $NAMESPACE job/$JOB_NAME 2>/dev/null || true
217+
exit 1
218+
fi
219+
220+
if [[ "$initContainerStatus" == *"ImagePullBackOff"* ]] || [[ "$initContainerStatus" == *"ErrImagePull"* ]]; then
221+
echo "Error: Image pull failure detected in init container."
222+
kubectl describe pods -n "$NAMESPACE" -l "job-name=$JOB_NAME" || true
223+
kubectl logs -n $NAMESPACE job/$JOB_NAME 2>/dev/null || true
224+
exit 1
225+
fi
226+
208227
if [ "$isComplete" = "True" ]; then
209228
echo "Test runner job completed successfully."
210229
kubectl logs -n $NAMESPACE job/$JOB_NAME

0 commit comments

Comments
 (0)