Skip to content

Commit 80d878a

Browse files
authored
Merge pull request #131 from rootcodelabs/deployment-est-gpu-Bimsara
fix error handling issue
2 parents b004860 + 9f4240e commit 80d878a

2 files changed

Lines changed: 98 additions & 51 deletions

File tree

DSL/CronManager/script/dataset_pipeline_s3.sh

Lines changed: 42 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -379,17 +379,58 @@ EOF
379379
else
380380
log "S3 download failed - success status: $success_status"
381381
log "Response: $response_body"
382+
383+
# Update progress status to indicate failure
384+
progress_update_payload=$(cat <<EOF
385+
{
386+
"sessionId": "$sessionId",
387+
"generationStatus": "Fail",
388+
"generationMessage": "Generation Failed",
389+
"progressPercentage": 100,
390+
"processComplete": true
391+
}
392+
EOF
393+
)
394+
395+
progress_update_response=$(curl -s -X POST "$PROGRESS_UPDATE_URL" \
396+
-H "Content-Type: application/json" \
397+
-d "$progress_update_payload")
398+
log "Progress status updated to failed: $progress_update_response"
399+
382400
send_failure_status_update "S3 download and extraction failed" "$CURRENT_DATASET_ID" "$response_body" "extraction_failure"
383401
rm -f /tmp/download_response.json
384402
exit 1
385403
fi
386404

387405
else
388406
log "Python script execution failed with exit code: $exit_code"
407+
408+
# Update progress status to indicate failure
409+
progress_update_payload=$(cat <<EOF
410+
{
411+
"sessionId": "$sessionId",
412+
"generationStatus": "Fail",
413+
"generationMessage": "Generation Failed",
414+
"progressPercentage": 100,
415+
"processComplete": true
416+
}
417+
EOF
418+
)
419+
420+
progress_update_response=$(curl -s -X POST "$PROGRESS_UPDATE_URL" \
421+
-H "Content-Type: application/json" \
422+
-d "$progress_update_payload")
423+
log "Progress status updated to failed: $progress_update_response"
424+
389425
if [ -f "$temp_response" ]; then
390426
log "Error response: $(cat $temp_response)"
391-
rm -f /tmp/download_response.json
427+
response_body=$(cat "$temp_response")
428+
send_failure_status_update "Python script execution failed" "$CURRENT_DATASET_ID" "$response_body" "extraction_failure"
429+
else
430+
send_failure_status_update "Python script execution failed - no response data" "$CURRENT_DATASET_ID" "" "extraction_failure"
392431
fi
432+
433+
rm -f /tmp/download_response.json
393434
exit 1
394435
fi
395436

DSL/CronManager/script/train_script_starter.sh

Lines changed: 56 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,48 @@ GET_FIRST_COME_TRAINING_JOB_SQL="http://resql:8082/global-classifier/get-queued-
77
GET_DATA_MODEL_BY_MODEL_ID_SQL="http://resql:8082/global-classifier/get-data-model-info-by-given-model-id"
88
UPDATE_JOB_STATUS="http://resql:8082/global-classifier/update-training-job-status"
99

10+
# Centralized error handling function
11+
#######################################
# Report a training failure, push "failed" status to the backing services
# where possible, and terminate the script.
#
# Each status update is gated only on the identifier it actually needs.
# The handler is also invoked when progress-session creation fails, i.e.
# while session_id is still empty; in that case the job and model records
# must still be marked as failed instead of skipping every update.
#
# Globals:
#   job_id, model_id, session_id  (read; each may be empty or unset)
#   UPDATE_JOB_STATUS             (read)
#   UPDATE_MODEL_TRAINING_STATUS_FAILED, UPDATE_PROGRESS_SESSION_ENDPOINT
#                                 (written)
#   response_update_job_status, response_update_model_status,
#   response_update_progress_failure
#                                 (written)
# Arguments:
#   $1 - human-readable description of the failure
# Outputs:
#   Status and warning lines on stdout
# Returns:
#   Never returns; always exits the shell with status 1
#######################################
handle_training_failure() {
  local error_message="${1:-}"
  echo "[FAILED] $error_message"

  # Job record: needs only job_id.
  if [ -n "${job_id:-}" ]; then
    echo "[UPDATE] Updating job status to training-failed..."
    response_update_job_status=$(curl -s -X POST "$UPDATE_JOB_STATUS" \
      -H "Content-Type: application/json" \
      -d "{\"jobId\": $job_id, \"jobStatus\": \"training-failed\"}")
  else
    echo "[WARNING] Cannot update job status - missing required variable (job_id)"
  fi

  # Model record: needs only model_id.
  if [ -n "${model_id:-}" ]; then
    echo "[MODEL] Updating model training status to failed..."
    UPDATE_MODEL_TRAINING_STATUS_FAILED="http://resql:8082/global-classifier/update-training_status-failed"
    response_update_model_status=$(curl -s -X POST "$UPDATE_MODEL_TRAINING_STATUS_FAILED" \
      -H "Content-Type: application/json" \
      -d "{\"model_id\": $model_id}")
  else
    echo "[WARNING] Cannot update model training status - missing required variable (model_id)"
  fi

  # Progress session: needs only session_id. processComplete stays false,
  # matching the failure payload this handler replaced.
  if [ -n "${session_id:-}" ]; then
    echo "[PROGRESS] Updating progress session to show training failure..."
    UPDATE_PROGRESS_SESSION_ENDPOINT="http://ruuter-public:8086/global-classifier/datamodels/progress/update"
    response_update_progress_failure=$(curl -s -X POST "$UPDATE_PROGRESS_SESSION_ENDPOINT" \
      -H "Content-Type: application/json" \
      -d "{
        \"sessionId\": $session_id,
        \"trainingStatus\": \"Training Failed\",
        \"trainingMessage\": \"Training Failed\",
        \"progressPercentage\": 100,
        \"processComplete\": false
      }")

    # An empty body is treated as a failed update (curl -s prints nothing
    # when the request itself fails).
    if [ -z "$response_update_progress_failure" ]; then
      echo "[WARNING] Failed to update progress session with failure status"
    else
      echo "[PROGRESS] Progress session updated with failure status successfully"
    fi
  else
    echo "[WARNING] Cannot update progress session - missing required variable (session_id)"
  fi

  exit 1
}
51+
1052
echo "[START] Training script starter"
1153

1254
# Check if training is in progress
@@ -102,8 +144,7 @@ echo "[DEBUG] Create session response: '$response_create_session'"
102144

103145
# Extract session ID from response
104146
if [ -z "$response_create_session" ]; then
105-
echo "[ERROR] Failed to create training progress session - empty response"
106-
exit 1
147+
handle_training_failure "Failed to create training progress session - empty response"
107148
fi
108149

109150
# Check if session creation was successful
@@ -113,14 +154,14 @@ if echo "$response_create_session" | grep -q '"operationSuccessful":true'; then
113154
if [ -z "$session_id" ] || [ "$session_id" = "$response_create_session" ]; then
114155
echo "[ERROR] Failed to extract session ID from response"
115156
echo "[DEBUG] Raw response: '$response_create_session'"
116-
exit 1
157+
handle_training_failure "Failed to extract session ID from response"
117158
fi
118159

119160
echo "[SESSION] Training progress session created successfully with ID: $session_id"
120161
else
121162
echo "[ERROR] Training progress session creation failed"
122163
echo "[DEBUG] Raw response: '$response_create_session'"
123-
exit 1
164+
handle_training_failure "Training progress session creation failed"
124165
fi
125166

126167
# Update initial training progress
@@ -154,16 +195,15 @@ echo "[DEBUG] Dataset ID response: '$response_get_dataset_id'"
154195

155196
# Handle empty response
156197
if [ -z "$response_get_dataset_id" ] || [ "$response_get_dataset_id" = "[]" ]; then
157-
echo "[ERROR] No dataset information found for model ID: $model_id"
158-
exit 1
198+
handle_training_failure "No dataset information found for model ID: $model_id"
159199
fi
160200

161201
dataset_id=$(echo "$response_get_dataset_id" | sed -E 's/.*"connectedDsId":([0-9]+).*/\1/')
162202

163203
if [ -z "$dataset_id" ] || [ "$dataset_id" = "$response_get_dataset_id" ]; then
164204
echo "[ERROR] Connected Dataset ID not found in response"
165205
echo "[DEBUG] Raw response: '$response_get_dataset_id'"
166-
exit 1
206+
handle_training_failure "Connected Dataset ID not found in response"
167207
fi
168208

169209
echo "[DATASET] Dataset ID: $dataset_id"
@@ -177,12 +217,12 @@ else
177217
echo "[ERROR] Failed to extract base models from response"
178218
echo "[ERROR] Raw response: $response_get_dataset_id"
179219
echo "[ERROR] Extracted base_models: $base_models_json"
180-
exit 1
220+
handle_training_failure "Failed to extract base models from response"
181221
fi
182222

183223
# Activate existing virtualenv
184224
echo "[INFO] Activating existing virtualenv at /app/python_virtual_env"
185-
source /app/python_virtual_env/bin/activate || { echo "[ERROR] Failed to activate virtualenv"; exit 1; }
225+
source /app/python_virtual_env/bin/activate || { echo "[ERROR] Failed to activate virtualenv"; handle_training_failure "Failed to activate Python virtual environment"; }
186226
export PYTHONPATH="/app:/app/src:/app/src/training:/app/src/s3_dataset_processor:$PYTHONPATH"
187227
echo "[DEBUG] PYTHONPATH set to: $PYTHONPATH"
188228
# Add these debug commands
@@ -224,41 +264,41 @@ if [ ${#missing_pkgs[@]} -ne 0 ]; then
224264
# Create installation directory
225265
mkdir -p "$UV_INSTALL_DIR" || {
226266
echo "[ERROR] Failed to create UV installation directory"
227-
exit 1
267+
handle_training_failure "Failed to create UV installation directory"
228268
}
229269

230270
# Use unmanaged installation to avoid root directory modifications
231271
curl -LsSf https://astral.sh/uv/install.sh | env UV_UNMANAGED_INSTALL="$UV_INSTALL_DIR" sh || {
232272
echo "[ERROR] Failed to install uv"
233-
exit 1
273+
handle_training_failure "Failed to install UV package manager"
234274
}
235275

236276
# Verify installation
237277
if [ ! -x "$UV_BIN" ]; then
238278
echo "[ERROR] UV installation failed or not executable"
239-
exit 1
279+
handle_training_failure "UV installation failed or not executable"
240280
fi
241281

242282
# Verify functionality
243283
"$UV_BIN" --version || {
244284
echo "[ERROR] UV installation corrupted"
245-
exit 1
285+
handle_training_failure "UV installation corrupted"
246286
}
247287

248288
echo "[UV] Successfully installed uv (unmanaged) to $UV_INSTALL_DIR"
249289
fi
250290

251291
if [ ! -f /app/src/training/requirements-gpu.txt ]; then
252292
echo "/app/src/training/requirements-gpu.txt not found!"
253-
exit 1
293+
handle_training_failure "Training requirements file not found"
254294
fi
255295

256296
echo "[INSTALL] Installing from /app/src/training/requirements-gpu.txt using secure uv..."
257297
"$UV_BIN" pip install --python "$VIRTUAL_ENV/bin/python3" -r /app/src/training/requirements-gpu.txt || {
258298
echo "[WARNING] uv install failed — trying pip as fallback..."
259299
pip install -r /app/src/training/requirements-gpu.txt || {
260300
echo "[ERROR] Both uv and pip install failed inside virtualenv"
261-
exit 1
301+
handle_training_failure "Failed to install required Python packages"
262302
}
263303
}
264304

@@ -321,41 +361,7 @@ if [ $training_exit_code -eq 0 ]; then
321361

322362
echo "[DEBUG] Update job status to trained response: '$response_update_job_status_trained'"
323363
else
324-
echo "[FAILED] Training failed with exit code: $training_exit_code"
325-
326-
echo "[UPDATE] Updating job status to training-failed..."
327-
response_update_job_status=$(curl -s -X POST "$UPDATE_JOB_STATUS" \
328-
-H "Content-Type: application/json" \
329-
-d "{\"jobId\": $job_id, \"jobStatus\": \"training-failed\"}")
330-
331-
echo "[MODEL] Updating model training status to failed..."
332-
UPDATE_MODEL_TRAINING_STATUS_FAILED="http://resql:8082/global-classifier/update-training_status-failed"
333-
response_update_model_status=$(curl -s -X POST "$UPDATE_MODEL_TRAINING_STATUS_FAILED" \
334-
-H "Content-Type: application/json" \
335-
-d "{\"model_id\": $model_id}")
336-
337-
echo "[DEBUG] Update model training status response: '$response_update_model_status'"
338-
339-
echo "[PROGRESS] Updating progress session to show training failure..."
340-
response_update_progress_failure=$(curl -s -X POST "$UPDATE_PROGRESS_SESSION_ENDPOINT" \
341-
-H "Content-Type: application/json" \
342-
-d "{
343-
\"sessionId\": $session_id,
344-
\"trainingStatus\": \"Training Failed\",
345-
\"trainingMessage\": \"Model training has failed\",
346-
\"progressPercentage\": 100,
347-
\"processComplete\": false
348-
}")
349-
350-
echo "[DEBUG] Update progress failure response: '$response_update_progress_failure'"
351-
352-
if [ -z "$response_update_progress_failure" ]; then
353-
echo "[WARNING] Failed to update progress session with failure status"
354-
else
355-
echo "[PROGRESS] Progress session updated with failure status successfully"
356-
fi
357-
358-
exit 1
364+
handle_training_failure "Model training script failed with exit code: $training_exit_code"
359365
fi
360366

361367
echo "[DONE] Training script starter completed"

0 commit comments

Comments
 (0)