From 9d957bf8e231f1895b616078a54b6324cc8e5af2 Mon Sep 17 00:00:00 2001 From: Yan Sun Date: Thu, 2 Apr 2026 10:09:15 -0700 Subject: [PATCH] [Fix] GPUOP-607 fail the ANR workflow when imagePullBackOff (#1274) * [Fix] GPUOP-607 fail the ANR workflow when imagePullBackOff Signed-off-by: yansun1996 * Update internal/controllers/remediation/scripts/test.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update internal/controllers/remediation/scripts/test.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Signed-off-by: yansun1996 Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> (cherry picked from commit 344e480dacbe64c8c3e99129be5c3b09537fc5e5) --- .../controllers/remediation/scripts/test.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/internal/controllers/remediation/scripts/test.sh b/internal/controllers/remediation/scripts/test.sh index 170e29a9..3c850ede 100644 --- a/internal/controllers/remediation/scripts/test.sh +++ b/internal/controllers/remediation/scripts/test.sh @@ -205,6 +205,25 @@ while true; do fi isComplete=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}') isFailure=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}') + + # Check for ImagePullBackOff or ErrImagePull in pod status + podStatus=$(kubectl get pods -n "$NAMESPACE" -l "job-name=$JOB_NAME" -o jsonpath='{.items[*].status.containerStatuses[*].state.waiting.reason}' 2>/dev/null || true) + initContainerStatus=$(kubectl get pods -n "$NAMESPACE" -l "job-name=$JOB_NAME" -o jsonpath='{.items[*].status.initContainerStatuses[*].state.waiting.reason}' 2>/dev/null || true) + + if [[ "$podStatus" == *"ImagePullBackOff"* ]] || [[ "$podStatus" == *"ErrImagePull"* ]]; then + echo "Error: Image pull failure detected in container." + kubectl describe pods -n "$NAMESPACE" -l "job-name=$JOB_NAME" || true + kubectl logs -n $NAMESPACE job/$JOB_NAME 2>/dev/null || true + exit 1 + fi + + if [[ "$initContainerStatus" == *"ImagePullBackOff"* ]] || [[ "$initContainerStatus" == *"ErrImagePull"* ]]; then + echo "Error: Image pull failure detected in init container." + kubectl describe pods -n "$NAMESPACE" -l "job-name=$JOB_NAME" || true + kubectl logs -n $NAMESPACE job/$JOB_NAME 2>/dev/null || true + exit 1 + fi + if [ "$isComplete" = "True" ]; then echo "Test runner job completed successfully." kubectl logs -n $NAMESPACE job/$JOB_NAME