Skip to content

Commit c947b4f

Browse files
authored
[OGUI-1879] Save firstTaskInError if already appeared (#3313)
* checks if a task was already in error and saved in cache before updating the environment from the interval gRPC request * adds new visual component to display the first task in error being it from ODC or ECS * adds source of task to easily distinguish between ECS and ODC
1 parent 679e46f commit c947b4f

10 files changed

Lines changed: 165 additions & 15 deletions

File tree

Control/lib/kafka/adapters/odc/odcDeviceEventAdapter.js

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@
1212
*/
1313

1414
const { OdcDeviceInfoAdapter } = require('../../../adapters/OdcDeviceInfoAdapter.js');
15+
const { SourceEventTypes } = require('../../enums/SourceEventsTypes.enum.js');
1516

1617
/**
17-
* @typedef {Object} deviceStateChanged
18+
* @typedef {OdcDeviceInfo} deviceStateChanged
1819
*
1920
*
20-
* @example
21+
* @example of RUNNING as received in the payload of the `odc.deviceStateChanged` event on integrated_service.odc topic
2122
* {
2223
* "partitionId": "2uvML7dXYm7",
2324
* "ddsSessionId": "64a39ff4-ee70-4a03-b2c4-3ed41c1bd5a2",
@@ -31,6 +32,21 @@ const { OdcDeviceInfoAdapter } = require('../../../adapters/OdcDeviceInfoAdapter
3132
* "expendable": false,
3233
* "rmsjobid": "6606"
3334
* }
35+
*
36+
* @example of ERROR as received in the payload of the `odc.deviceStateChanged` event on integrated_service.odc topic
37+
* {
38+
* "partitionId": "2zqJdVsaHwL",
39+
* "ddsSessionId": "2ab25eb0-2de1-49cc-852c-8b0342096229",
40+
* "ddsSessionStatus": "RUNNING",
41+
* "state": "ERROR",
42+
* "ecsState": "ERROR",
43+
* "taskId": "807896542787881827",
44+
* "path": "main/RecoGroupMi100/RecoCollectionMi100_0/pvertex-track-matching_t1_reco1_0",
45+
* "ignored": true,
46+
* "host": "epn323.internal",
47+
* "expendable": false,
48+
* "rmsjobid": "unknown"
49+
* }
3450
*/
3551

3652
/**
@@ -46,6 +62,7 @@ exports.odcDeviceEventAdapter = (generalIntegratedServiceEvent) => {
4662
const odcDevice = OdcDeviceInfoAdapter.toEntity(payload);
4763

4864
return {
65+
source: SourceEventTypes.ODC,
4966
environmentId,
5067
error,
5168
timestamp,

Control/lib/kafka/adapters/taskEventAdapter.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
const { getTaskShortName } = require('../../adapters/task/getTaskShortName.js');
1515
const { TaskState } = require('../../common/taskState.enum.js');
1616
const { TaskStatus } = require('../../common/taskStatus.enum.js');
17+
const { SourceEventTypes } = require('../enums/SourceEventsTypes.enum.js');
1718

1819
/**
1920
* Adapter for event messages received on run topic
@@ -36,6 +37,7 @@ exports.taskEventAdapter = ({ taskEvent }) => {
3637
} = taskEvent;
3738

3839
return {
40+
source: SourceEventTypes.ECS,
3941
id: taskid,
4042
taskId: taskid,
4143
name: getTaskShortName(name),
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/**
2+
* @license
3+
* Copyright CERN and copyright holders of ALICE O2. This software is
4+
* distributed under the terms of the GNU General Public License v3 (GPL
5+
* Version 3), copied verbatim in the file "COPYING".
6+
*
7+
* See http://alice-o2.web.cern.ch/license for full licensing information.
8+
*
9+
* In applying this license CERN does not waive the privileges and immunities
10+
* granted to it by virtue of its status as an Intergovernmental Organization
11+
* or submit itself to any jurisdiction.
12+
*/
13+
14+
/**
15+
* Enum of the sources a task event can originate from; adapters attach one of these values as the `source` property of the event so that ECS and ODC events can be told apart in the cache and when emitting them
16+
* @readonly
* @enum {string}
*/
17+
exports.SourceEventTypes = Object.freeze({
18+
ECS: 'ECS', // set by `taskEventAdapter` on events adapted from ECS task events
19+
ODC: 'ODC' // set by `odcDeviceEventAdapter` on events adapted from ODC device state changes
20+
});

Control/lib/services/Environment.service.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ class EnvironmentService {
140140
environmentInfo.events = [...cachedEnvironment.events];
141141
environmentInfo.isDeploying = cachedEnvironment.isDeploying;
142142
environmentInfo.deploymentError = cachedEnvironment.deploymentError;
143+
environmentInfo.firstTaskInError = cachedEnvironment.firstTaskInError;
143144
}
144145
return environmentInfo;
145146
}

Control/lib/services/environment/EnvironmentCache.service.js

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,19 +79,27 @@ class EnvironmentCacheService {
7979
* * Heartbeat calls (GetEnvironment/GetEnvironments) - which will NOT contain `isDeploying` and `deploymentError` properties
8080
* * Cache caught events - which should contain `isDeploying` and `deploymentError` properties
8181
* @param {string} id - the id of the environment to be updated
82-
* @param {EnvironmentInfo} environment - the new environment information to be set
82+
* @param {Partial<EnvironmentInfo>} environment - the new environment information to be set
8383
* @returns {void}
8484
*/
8585
addOrUpdateEnvironment(environment, shouldBroadcast = false) {
8686
const { id } = environment;
8787
if (this._environments.has(id)) {
8888
const cachedEnvironment = this._environments.get(id);
8989
const { events = [] } = cachedEnvironment;
90-
const {isDeploying, deploymentError } = cachedEnvironment;
90+
/**
91+
* @param {EnvironmentInfo} cachedEnvironment - the environment information currently stored in cache for the environment with the given id
92+
* @param {boolean} cachedEnvironment.isDeploying - whether the environment is currently being deployed
93+
* @param {string} cachedEnvironment.deploymentError - the error message if the environment deployment failed
94+
* @param {TaskEvent|OdcDeviceInfoEvent} cachedEnvironment.firstTaskInError - the first task event in error for the environment, which can be either an FLP task event or an ODC device state change
95+
*/
96+
const { isDeploying, deploymentError, firstTaskInError } = cachedEnvironment;
9197
const updatedEnvironment = Object.assign({}, cachedEnvironment, environment);
9298
updatedEnvironment.events = [...events];
9399
updatedEnvironment.isDeploying = isDeploying;
94100
updatedEnvironment.deploymentError = deploymentError;
101+
updatedEnvironment.firstTaskInError = firstTaskInError;
102+
95103
this._environments.set(id, updatedEnvironment);
96104
} else {
97105
this._environments.set(id, { ...environment, events: environment.events ?? [] });
@@ -195,10 +203,11 @@ class EnvironmentCacheService {
195203
*/
196204
_handleFirstTaskInError(environmentId, event) {
197205
if (
198-
(event.state === TaskState.ERROR || event.state === TaskState.ERROR_CRITICAL)
206+
(event.state === TaskState.ERROR_CRITICAL)
199207
&& this._environments.has(environmentId)
200208
&& !this._environments.get(environmentId).firstTaskInError
201209
) {
210+
this._logger.warnMessage(`Environment ${environmentId} has a first task in critical error: ${event.id}`);
202211
const environment = JSON.parse(JSON.stringify(this._environments.get(environmentId)));
203212
environment.firstTaskInError = event;
204213
this._environments.set(environmentId, environment);
@@ -236,6 +245,7 @@ class EnvironmentCacheService {
236245

237246
if (
238247
state === EnvironmentState.CONFIGURED &&
248+
transition?.name === EnvironmentTransitionType.CONFIGURE &&
239249
transition?.status === EcsOperationAndStepStatus.DONE_OK
240250
) {
241251
// Once the environment is configured and ongoing transition is done, we can set the isDeploying to false
@@ -253,9 +263,9 @@ class EnvironmentCacheService {
253263
this.addOrUpdateEnvironment(cachedEnvironment, false);
254264

255265
if (
266+
state === EnvironmentState.DONE &&
256267
transition?.name === EnvironmentTransitionType.DESTROY &&
257268
transition?.status === EcsOperationAndStepStatus.DONE_OK &&
258-
state === EnvironmentState.DONE &&
259269
!cachedEnvironment.deploymentError
260270
) {
261271
// That is, if the environment successfully ended the DESTROY transition

Control/lib/typedefs/TaskEvent.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
* TaskEvent type definition as parsed following the received message from the ECS Kafka task topic
1919
* The parsing is done based on the object received from ECS in `events.proto` definition
2020
*
21+
* @property {SourceEventTypes} source - the source of the event, in this case ECS
2122
* @property {String} id - task id, unique
2223
* @property {String} taskId - task id, unique
2324
* @property {String} name - task name, based on the name of the task class and adapted in short form

Control/lib/typedefs/odc/OdcDeviceInfo.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
* This is parsed object by ECS and not the same as the one sent by ODC to ECS. For example:
2020
* * ODC sends 'id' as uint64 but ECS parses it to 'taskId' as string
2121
*
22+
* @property {SourceEventTypes} source - the source of the event, in this case ODC
2223
* @property {String} taskId - ODC 'id' but renamed by ECS to 'taskId'
2324
* @property {String} state
2425
* @property {String} epnState

Control/lib/typedefs/odc/OdcDeviceInfoEvent.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
* This is parsed object by ECS and not the same as the one sent by ODC to ECS. For example:
2020
* * ODC sends 'id' as uint64 but ECS parses it to 'taskId' as string
2121
*
22+
* @property {String} source - has the value 'ODC' to identify the source of the event
2223
* @property {String} taskId - ODC 'id' but renamed by ECS to 'taskId'
2324
* @property {String} state
2425
* @property {String} epnState

Control/public/pages/Environment/components/environmentComponentsSummary.js

Lines changed: 61 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ const UNKNOWN = 'UNKNOWN';
2929
export const environmentComponentsSummary = (environmentInfo) => {
3030
const odcState = environmentInfo?.hardware?.epn?.info?.state ?? UNKNOWN;
3131
const ddsState = environmentInfo?.hardware?.epn?.info?.ddsSessionStatus ?? UNKNOWN;
32-
const { currentTransition = undefined, state } = environmentInfo;
32+
const { currentTransition = undefined, state, firstTaskInError } = environmentInfo;
3333

3434
const odcStateStyle = ODC_STATE_COLOR[odcState] ? `.${ODC_STATE_COLOR[odcState]}` : '';
3535
const ddsStateStyle = ODC_STATE_COLOR[ddsState] ? `.${ODC_STATE_COLOR[ddsState]}` : '';
@@ -39,11 +39,17 @@ export const environmentComponentsSummary = (environmentInfo) => {
3939
? `.${ALIECS_TRANSITION_COLOR[currentTransition] ? ALIECS_TRANSITION_COLOR[currentTransition] : ''}`
4040
: `.${ALIECS_STATE_COLOR[state] ? ALIECS_STATE_COLOR[state] : ''}`,
4141
};
42-
return miniCard(_getTitle(currentTransition), [
43-
h('.flex-column', [
44-
h(`${ecsData.style}`, ecsData.info),
45-
h(`${odcStateStyle}`, 'ODC state: ', odcState),
46-
h(`${ddsStateStyle}`, 'DDS state: ', ddsState),
42+
43+
return h('.flex-row.g2', [
44+
miniCard(_getTitle(currentTransition), [
45+
h('.flex-column', [
46+
h(`${ecsData.style}`, ecsData.info),
47+
h(`${odcStateStyle}`, 'ODC state: ', odcState),
48+
h(`${ddsStateStyle}`, 'DDS state: ', ddsState),
49+
]),
50+
]),
51+
firstTaskInError && miniCard(h('h5.danger','First Task In Critical Error'), [
52+
_firstTaskInErrorDisplay(firstTaskInError)
4753
]),
4854
]);
4955
};
@@ -64,3 +70,52 @@ const _getTitle = (currentTransition) =>
6470
h('h5.flex-column.flex-center', 'Components State')
6571
]
6672
);
73+
74+
/**
75+
* @private
76+
* Builds the display of the first task in error: a danger-styled column that starts with the source of the event and continues with the ODC device fields when the source is 'ODC', or with the ECS task fields for any other source
77+
* @param {TaskEvent | OdcDeviceInfoEvent} taskEvent - the task event with error information
78+
* @returns {vnode} - display of the task event in case of error
79+
*/
80+
const _firstTaskInErrorDisplay = (taskEvent) => {
81+
return h('.flex-column.danger',
82+
[
83+
// NOTE(review): `taskEvent.source` is read without optional chaining here, while the comparison below uses `taskEvent?.source`;
// the caller visible in this file only invokes this function when `firstTaskInError` is truthy
h('span', `Source: ${taskEvent.source}`),
84+
...(taskEvent?.source === 'ODC' // same value as `SourceEventTypes.ODC`; every other source is displayed as an ECS task
85+
? _odcDeviceEventInErrorDisplay(taskEvent)
86+
: _ecsTaskEventInErrorDisplay(taskEvent))
87+
]
88+
);
89+
};
90+
91+
/**
92+
* @private
93+
* Creates the list of elements describing an ECS task event in error: its id, name, host and status, one `span` per field
94+
* @param {TaskEvent} [taskEvent={}] - the task event with error information; defaults to an empty object, in which case every field is rendered as `undefined`
95+
* @returns {Array<vnode>} - elements to be spread into the display of the task event in case of error
96+
*/
97+
const _ecsTaskEventInErrorDisplay = (taskEvent = {}) => {
98+
const { name, hostname, id, status } = taskEvent;
99+
return [
100+
h('span', `ID: ${id}`),
101+
h('span', `Name: ${name}`),
102+
h('span', `Host: ${hostname}`),
103+
h('span', `Status: ${status}`),
104+
];
105+
};
106+
107+
/**
108+
* @private
109+
* Creates the list of elements describing an ODC device event in error: its id, host and path, followed by the error message when one is present
110+
* @param {OdcDeviceInfoEvent} [odcDeviceEvent={}] - the ODC device event with error information; defaults to an empty object
111+
* @returns {Array<vnode>} - elements to be spread into the display of the ODC device event in case of error; the last entry is the falsy `error` value itself when the event carries no error
112+
*/
113+
const _odcDeviceEventInErrorDisplay = (odcDeviceEvent = {}) => {
114+
// NOTE(review): the ERROR payload example documented for `odc.deviceStateChanged` carries `taskId` and `host`
// rather than `id` and `hostname` - confirm that the adapted event actually exposes the names destructured here
const { id, hostname, path, error } = odcDeviceEvent;
115+
return [
116+
h('span', `ID: ${id}`),
117+
h('span', `Host: ${hostname}`),
118+
h('span', `Path: ${path}`),
119+
// Only rendered when the event has an error message
error && h('.danger', `Error: ${error}`)
120+
];
121+
};

Control/test/lib/services/environment/mocha-environment-cache.service.test.js

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ describe(`'EnvironmentCacheService' - test suite`, () => {
6969
isDeploying: undefined,
7070
deploymentError: undefined,
7171
state: 'inactive',
72-
events: []
72+
events: [],
73+
firstTaskInError: undefined,
7374
});
7475
assert.strictEqual(broadcastServiceMock.broadcast.callCount, 1);
7576
});
@@ -88,6 +89,47 @@ describe(`'EnvironmentCacheService' - test suite`, () => {
8889

8990
assert.ok(environmentCacheService._lastUpdate >= beforeUpdate);
9091
});
92+
93+
it('should preserve the `firstTaskInError` field when updating an existing environment', () => {
94+
const firstTaskInError = {
95+
environmentId: 'env123',
96+
state: 'ERROR',
97+
taskid: 1,
98+
name: 'task1',
99+
hostname: 'host1',
100+
className: 'class1',
101+
isCritical: false,
102+
};
103+
104+
const initialEnvironment = {
105+
id: 'env123',
106+
state: 'RUNNING',
107+
firstTaskInError: firstTaskInError,
108+
};
109+
110+
environmentCacheService.addOrUpdateEnvironment(initialEnvironment);
111+
112+
assert.strictEqual(environmentCacheService._environments.size, 1);
113+
assert.deepStrictEqual(
114+
environmentCacheService._environments.get('env123').firstTaskInError,
115+
firstTaskInError
116+
);
117+
118+
const updatedEnvironment = {
119+
id: 'env123',
120+
state: 'CONFIGURED',
121+
someOtherField: 'newValue',
122+
};
123+
124+
environmentCacheService.addOrUpdateEnvironment(updatedEnvironment);
125+
126+
assert.strictEqual(environmentCacheService._environments.size, 1);
127+
const cachedEnv = environmentCacheService._environments.get('env123');
128+
assert.strictEqual(cachedEnv.state, 'CONFIGURED');
129+
assert.strictEqual(cachedEnv.someOtherField, 'newValue');
130+
assert.deepStrictEqual(cachedEnv.firstTaskInError, firstTaskInError,
131+
'firstTaskInError should be preserved after update');
132+
});
91133
});
92134

93135
describe('`get environments` method', () => {
@@ -332,12 +374,12 @@ describe(`'EnvironmentCacheService' - test suite`, () => {
332374
environmentCacheService.addOrUpdateEnvironment(initialEnvironment);
333375
const firstTaskInErrorEventSent = {
334376
environmentId: 'env1',
335-
state: 'ERROR',
377+
state: 'ERROR_CRITICAL',
336378
taskid: 1,
337379
name: 'task1',
338380
hostname: 'host1',
339381
className: 'class1',
340-
isCritical: false,
382+
isCritical: true,
341383
};
342384
eventEmitter.emit(TASKS_TRACK, {
343385
timestamp: Date.now(),

0 commit comments

Comments
 (0)