Skip to content

Commit ce8e798

Browse files
authored
Driver Upgrade Timeout (ROCm#304)
1 parent 8abb340 commit ce8e798

11 files changed

Lines changed: 173 additions & 25 deletions

File tree

api/v1alpha1/deviceconfig_types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,7 @@ type ModuleStatus struct {
455455
KernelVersion string `json:"kernelVersion,omitempty"`
456456
LastTransitionTime string `json:"lastTransitionTime,omitempty"`
457457
Status UpgradeState `json:"status,omitempty"`
458+
UpgradeStartTime string `json:"upgradeStartTime,omitempty"`
458459
}
459460

460461
// DeviceConfigStatus defines the observed state of Module.

bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ metadata:
3030
}
3131
]
3232
capabilities: Basic Install
33-
createdAt: "2024-12-14T11:49:07Z"
33+
createdAt: "2024-12-18T02:20:28Z"
3434
operatorframework.io/suggested-namespace: kube-amd-gpu
3535
operators.operatorframework.io/builder: operator-sdk-v1.32.0
3636
operators.operatorframework.io/project_layout: go.kubebuilder.io/v3

bundle/manifests/amd.com_deviceconfigs.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,8 @@ spec:
564564
description: UpgradeState captures the state of the upgrade
565565
process on a node
566566
type: string
567+
upgradeStartTime:
568+
type: string
567569
type: object
568570
description: NodeModuleStatus contains per node status of driver module
569571
installation

config/crd/bases/amd.com_deviceconfigs.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,8 @@ spec:
560560
description: UpgradeState captures the state of the upgrade
561561
process on a node
562562
type: string
563+
upgradeStartTime:
564+
type: string
563565
type: object
564566
description: NodeModuleStatus contains per node status of driver module
565567
installation

helm-charts-k8s/Chart.lock

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ dependencies:
66
repository: file://./charts/kmm
77
version: v1.0.0
88
digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597
9-
generated: "2024-12-14T12:29:21.567399824Z"
9+
generated: "2024-12-18T02:20:17.803725581Z"

helm-charts-k8s/crds/deviceconfig-crd.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,8 @@ spec:
565565
description: UpgradeState captures the state of the upgrade process
566566
on a node
567567
type: string
568+
upgradeStartTime:
569+
type: string
568570
type: object
569571
description: NodeModuleStatus contains per node status of driver module
570572
installation

helm-charts-openshift/Chart.lock

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ dependencies:
66
repository: file://./charts/kmm
77
version: v1.0.0
88
digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
9-
generated: "2024-12-14T11:49:06.370435251Z"
9+
generated: "2024-12-18T02:20:27.600366626Z"

helm-charts-openshift/crds/deviceconfig-crd.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,8 @@ spec:
565565
description: UpgradeState captures the state of the upgrade process
566566
on a node
567567
type: string
568+
upgradeStartTime:
569+
type: string
568570
type: object
569571
description: NodeModuleStatus contains per node status of driver module
570572
installation

internal/controllers/device_config_reconciler.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -479,14 +479,27 @@ func (dcrh *deviceConfigReconcilerHelper) getDeviceConfigOwnedKMMModule(ctx cont
479479

480480
func (dcrh *deviceConfigReconcilerHelper) updateDeviceConfigNodeStatus(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error {
481481
logger := log.FromContext(ctx)
482+
previousUpgradeTimes := make(map[string]string)
483+
// Persist the UpgradeStartTime
484+
for nodeName, moduleStatus := range devConfig.Status.NodeModuleStatus {
485+
previousUpgradeTimes[nodeName] = moduleStatus.UpgradeStartTime
486+
}
482487
devConfig.Status.NodeModuleStatus = map[string]amdv1alpha1.ModuleStatus{}
483488

484489
// for each node, fetch its status of modules configured by given DeviceConfig
485490
for _, node := range nodes.Items {
486491
// if there is no module configured for given node
487-
// the info under that node name will have only status
492+
// the info under that node name will have only status and upgrade start time
488493
// then it will be clear to see which node didn't get module configured
489-
devConfig.Status.NodeModuleStatus[node.Name] = amdv1alpha1.ModuleStatus{Status: dcrh.upgradeMgrHandler.GetNodeStatus(node.Name)}
494+
upgradeStartTime := previousUpgradeTimes[node.Name]
495+
496+
currentStatus := dcrh.upgradeMgrHandler.GetNodeStatus(node.Name)
497+
if currentStatus == amdv1alpha1.UpgradeStateFailed || currentStatus == amdv1alpha1.UpgradeStateCordonFailed || currentStatus == amdv1alpha1.UpgradeStateUncordonFailed || currentStatus == amdv1alpha1.UpgradeStateDrainFailed || currentStatus == amdv1alpha1.UpgradeStateRebootFailed || currentStatus == amdv1alpha1.UpgradeStateComplete || currentStatus == amdv1alpha1.UpgradeStateInstallComplete {
498+
upgradeStartTime = ""
499+
} else if upgradeStartTime == "" {
500+
upgradeStartTime = dcrh.upgradeMgrHandler.GetNodeUpgradeStartTime(node.Name)
501+
}
502+
devConfig.Status.NodeModuleStatus[node.Name] = amdv1alpha1.ModuleStatus{Status: dcrh.upgradeMgrHandler.GetNodeStatus(node.Name), UpgradeStartTime: upgradeStartTime}
490503

491504
nmc := kmmv1beta1.NodeModulesConfig{}
492505
err := dcrh.client.Get(ctx, types.NamespacedName{Name: node.Name}, &nmc)
@@ -507,6 +520,7 @@ func (dcrh *deviceConfigReconcilerHelper) updateDeviceConfigNodeStatus(ctx conte
507520
KernelVersion: module.Config.KernelVersion,
508521
LastTransitionTime: module.LastTransitionTime.String(),
509522
Status: dcrh.upgradeMgrHandler.GetNodeStatus(node.Name),
523+
UpgradeStartTime: upgradeStartTime,
510524
}
511525
}
512526
}

internal/controllers/mock_upgrademgr.go

Lines changed: 70 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)