@@ -7,6 +7,48 @@ GET_FIRST_COME_TRAINING_JOB_SQL="http://resql:8082/global-classifier/get-queued-
77GET_DATA_MODEL_BY_MODEL_ID_SQL=" http://resql:8082/global-classifier/get-data-model-info-by-given-model-id"
88UPDATE_JOB_STATUS=" http://resql:8082/global-classifier/update-training-job-status"
99
#######################################
# Centralized failure handler for the training starter script.
#
# Logs the failure reason, then makes a best-effort attempt to flag the job,
# the model and the progress session as failed, and finally terminates the
# script. Each status update is attempted independently: a failure that
# happens before the progress session exists (empty session_id) must still
# mark the job and the model as failed instead of leaving them untouched.
#
# Globals (read):
#   job_id, model_id, session_id - identifiers resolved earlier by the script;
#                                  any of them may still be empty/unset
#   UPDATE_JOB_STATUS            - endpoint used to update the job status
# Globals (written):
#   UPDATE_MODEL_TRAINING_STATUS_FAILED, UPDATE_PROGRESS_SESSION_ENDPOINT,
#   response_update_job_status, response_update_model_status,
#   response_update_progress_failure
# Arguments:
#   $1 - human-readable description of what failed
# Outputs:
#   progress/diagnostic lines on stdout (same channel as the rest of the script)
# Returns:
#   never returns; always exits the shell with status 1
#######################################
handle_training_failure() {
  local error_message="${1:-unknown error}"
  echo "[FAILED] $error_message"

  # --- Job status -----------------------------------------------------------
  if [ -n "${job_id:-}" ]; then
    echo "[UPDATE] Updating job status to training-failed..."
    # '|| ...=""' keeps the handler going even if curl fails and the caller
    # runs with errexit enabled; the remaining updates must still be tried.
    response_update_job_status=$(curl -s -X POST "$UPDATE_JOB_STATUS" \
      -H "Content-Type: application/json" \
      -d "{\"jobId\": $job_id, \"jobStatus\": \"training-failed\"}") \
      || response_update_job_status=""
    echo "[DEBUG] Update job status response: '$response_update_job_status'"
  else
    echo "[WARNING] Cannot update job status - job_id is not set"
  fi

  # --- Model training status --------------------------------------------------
  if [ -n "${model_id:-}" ]; then
    echo "[MODEL] Updating model training status to failed..."
    UPDATE_MODEL_TRAINING_STATUS_FAILED="http://resql:8082/global-classifier/update-training_status-failed"
    response_update_model_status=$(curl -s -X POST "$UPDATE_MODEL_TRAINING_STATUS_FAILED" \
      -H "Content-Type: application/json" \
      -d "{\"model_id\": $model_id}") \
      || response_update_model_status=""
    echo "[DEBUG] Update model training status response: '$response_update_model_status'"
  else
    echo "[WARNING] Cannot update model training status - model_id is not set"
  fi

  # --- Progress session -------------------------------------------------------
  # Only possible once a session has been created; early failures skip this.
  if [ -n "${session_id:-}" ]; then
    echo "[PROGRESS] Updating progress session to show training failure..."
    UPDATE_PROGRESS_SESSION_ENDPOINT="http://ruuter-public:8086/global-classifier/datamodels/progress/update"
    response_update_progress_failure=$(curl -s -X POST "$UPDATE_PROGRESS_SESSION_ENDPOINT" \
      -H "Content-Type: application/json" \
      -d "{
        \"sessionId\": $session_id,
        \"trainingStatus\": \"Training Failed\",
        \"trainingMessage\": \"Training Failed\",
        \"progressPercentage\": 100,
        \"processComplete\": false
      }") \
      || response_update_progress_failure=""
    echo "[DEBUG] Update progress failure response: '$response_update_progress_failure'"

    if [ -z "$response_update_progress_failure" ]; then
      echo "[WARNING] Failed to update progress session with failure status"
    else
      echo "[PROGRESS] Progress session updated with failure status successfully"
    fi
  else
    echo "[WARNING] Cannot update progress session - session_id is not set"
  fi

  exit 1
}
51+
1052echo " [START] Training script starter"
1153
1254# Check if training is in progress
@@ -102,8 +144,7 @@ echo "[DEBUG] Create session response: '$response_create_session'"
102144
103145# Extract session ID from response
104146if [ -z " $response_create_session " ]; then
105- echo " [ERROR] Failed to create training progress session - empty response"
106- exit 1
147+ handle_training_failure " Failed to create training progress session - empty response"
107148fi
108149
109150# Check if session creation was successful
@@ -113,14 +154,14 @@ if echo "$response_create_session" | grep -q '"operationSuccessful":true'; then
113154 if [ -z " $session_id " ] || [ " $session_id " = " $response_create_session " ]; then
114155 echo " [ERROR] Failed to extract session ID from response"
115156 echo " [DEBUG] Raw response: '$response_create_session '"
116- exit 1
157+ handle_training_failure " Failed to extract session ID from response "
117158 fi
118159
119160 echo " [SESSION] Training progress session created successfully with ID: $session_id "
120161else
121162 echo " [ERROR] Training progress session creation failed"
122163 echo " [DEBUG] Raw response: '$response_create_session '"
123- exit 1
164+ handle_training_failure " Training progress session creation failed "
124165fi
125166
126167# Update initial training progress
@@ -154,16 +195,15 @@ echo "[DEBUG] Dataset ID response: '$response_get_dataset_id'"
154195
155196# Handle empty response
156197if [ -z " $response_get_dataset_id " ] || [ " $response_get_dataset_id " = " []" ]; then
157- echo " [ERROR] No dataset information found for model ID: $model_id "
158- exit 1
198+ handle_training_failure " No dataset information found for model ID: $model_id "
159199fi
160200
161201dataset_id=$( echo " $response_get_dataset_id " | sed -E ' s/.*"connectedDsId":([0-9]+).*/\1/' )
162202
163203if [ -z " $dataset_id " ] || [ " $dataset_id " = " $response_get_dataset_id " ]; then
164204 echo " [ERROR] Connected Dataset ID not found in response"
165205 echo " [DEBUG] Raw response: '$response_get_dataset_id '"
166- exit 1
206+ handle_training_failure " Connected Dataset ID not found in response "
167207fi
168208
169209echo " [DATASET] Dataset ID: $dataset_id "
@@ -177,12 +217,12 @@ else
177217 echo " [ERROR] Failed to extract base models from response"
178218 echo " [ERROR] Raw response: $response_get_dataset_id "
179219 echo " [ERROR] Extracted base_models: $base_models_json "
180- exit 1
220+ handle_training_failure " Failed to extract base models from response "
181221fi
182222
183223# Activate existing virtualenv
184224echo " [INFO] Activating existing virtualenv at /app/python_virtual_env"
185- source /app/python_virtual_env/bin/activate || { echo " [ERROR] Failed to activate virtualenv" ; exit 1 ; }
225+ source /app/python_virtual_env/bin/activate || { echo " [ERROR] Failed to activate virtualenv" ; handle_training_failure " Failed to activate Python virtual environment " ; }
186226export PYTHONPATH=" /app:/app/src:/app/src/training:/app/src/s3_dataset_processor:$PYTHONPATH "
187227echo " [DEBUG] PYTHONPATH set to: $PYTHONPATH "
188228# Add these debug commands
@@ -224,41 +264,41 @@ if [ ${#missing_pkgs[@]} -ne 0 ]; then
224264 # Create installation directory
225265 mkdir -p " $UV_INSTALL_DIR " || {
226266 echo " [ERROR] Failed to create UV installation directory"
227- exit 1
267+ handle_training_failure " Failed to create UV installation directory "
228268 }
229269
230270 # Use unmanaged installation to avoid root directory modifications
231271 curl -LsSf https://astral.sh/uv/install.sh | env UV_UNMANAGED_INSTALL=" $UV_INSTALL_DIR " sh || {
232272 echo " [ERROR] Failed to install uv"
233- exit 1
273+ handle_training_failure " Failed to install UV package manager "
234274 }
235275
236276 # Verify installation
237277 if [ ! -x " $UV_BIN " ]; then
238278 echo " [ERROR] UV installation failed or not executable"
239- exit 1
279+ handle_training_failure " UV installation failed or not executable "
240280 fi
241281
242282 # Verify functionality
243283 " $UV_BIN " --version || {
244284 echo " [ERROR] UV installation corrupted"
245- exit 1
285+ handle_training_failure " UV installation corrupted "
246286 }
247287
248288 echo " [UV] Successfully installed uv (unmanaged) to $UV_INSTALL_DIR "
249289 fi
250290
251291 if [ ! -f /app/src/training/requirements-gpu.txt ]; then
252292 echo " /app/src/training/requirements-gpu.txt not found!"
253- exit 1
293+ handle_training_failure " Training requirements file not found "
254294 fi
255295
256296 echo " [INSTALL] Installing from /app/src/training/requirements-gpu.txt using secure uv..."
257297 " $UV_BIN " pip install --python " $VIRTUAL_ENV /bin/python3" -r /app/src/training/requirements-gpu.txt || {
258298 echo " [WARNING] uv install failed — trying pip as fallback..."
259299 pip install -r /app/src/training/requirements-gpu.txt || {
260300 echo " [ERROR] Both uv and pip install failed inside virtualenv"
261- exit 1
301+ handle_training_failure " Failed to install required Python packages "
262302 }
263303 }
264304
@@ -321,41 +361,7 @@ if [ $training_exit_code -eq 0 ]; then
321361
322362 echo " [DEBUG] Update job status to trained response: '$response_update_job_status_trained '"
323363else
324- echo " [FAILED] Training failed with exit code: $training_exit_code "
325-
326- echo " [UPDATE] Updating job status to training-failed..."
327- response_update_job_status=$( curl -s -X POST " $UPDATE_JOB_STATUS " \
328- -H " Content-Type: application/json" \
329- -d " {\" jobId\" : $job_id , \" jobStatus\" : \" training-failed\" }" )
330-
331- echo " [MODEL] Updating model training status to failed..."
332- UPDATE_MODEL_TRAINING_STATUS_FAILED=" http://resql:8082/global-classifier/update-training_status-failed"
333- response_update_model_status=$( curl -s -X POST " $UPDATE_MODEL_TRAINING_STATUS_FAILED " \
334- -H " Content-Type: application/json" \
335- -d " {\" model_id\" : $model_id }" )
336-
337- echo " [DEBUG] Update model training status response: '$response_update_model_status '"
338-
339- echo " [PROGRESS] Updating progress session to show training failure..."
340- response_update_progress_failure=$( curl -s -X POST " $UPDATE_PROGRESS_SESSION_ENDPOINT " \
341- -H " Content-Type: application/json" \
342- -d " {
343- \" sessionId\" : $session_id ,
344- \" trainingStatus\" : \" Training Failed\" ,
345- \" trainingMessage\" : \" Model training has failed\" ,
346- \" progressPercentage\" : 100,
347- \" processComplete\" : false
348- }" )
349-
350- echo " [DEBUG] Update progress failure response: '$response_update_progress_failure '"
351-
352- if [ -z " $response_update_progress_failure " ]; then
353- echo " [WARNING] Failed to update progress session with failure status"
354- else
355- echo " [PROGRESS] Progress session updated with failure status successfully"
356- fi
357-
358- exit 1
364+ handle_training_failure " Model training script failed with exit code: $training_exit_code "
359365fi
360366
361367echo " [DONE] Training script starter completed"
0 commit comments