@@ -207,14 +207,15 @@ def detect(self) -> Devices | None: # noqa: PLR0915
207207 dev_mem_used = byte_to_mebibyte ( # byte to MiB
208208 dev_mem_info .used ,
209209 )
210- dev_mem_ecc_errors = pynvml .nvmlDeviceGetMemoryErrorCounter (
211- dev ,
212- pynvml .NVML_MEMORY_ERROR_TYPE_UNCORRECTED ,
213- pynvml .NVML_VOLATILE_ECC ,
214- pynvml .NVML_MEMORY_LOCATION_DRAM ,
215- )
216- if dev_mem_ecc_errors > 0 :
217- dev_mem_status = DeviceMemoryStatusEnum .UNHEALTHY
210+ if not envs .GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK :
211+ dev_mem_ecc_errors = pynvml .nvmlDeviceGetMemoryErrorCounter (
212+ dev ,
213+ pynvml .NVML_MEMORY_ERROR_TYPE_UNCORRECTED ,
214+ pynvml .NVML_VOLATILE_ECC ,
215+ pynvml .NVML_MEMORY_LOCATION_DRAM ,
216+ )
217+ if dev_mem_ecc_errors > 0 :
218+ dev_mem_status = DeviceMemoryStatusEnum .UNHEALTHY
218219 if dev_mem == 0 :
219220 dev_mem , dev_mem_used = get_memory ()
220221
@@ -287,14 +288,17 @@ def detect(self) -> Devices | None: # noqa: PLR0915
287288 mdev_mem_used = byte_to_mebibyte ( # byte to MiB
288289 mdev_mem_info .used ,
289290 )
290- mdev_mem_ecc_errors = pynvml .nvmlDeviceGetMemoryErrorCounter (
291- mdev ,
292- pynvml .NVML_MEMORY_ERROR_TYPE_UNCORRECTED ,
293- pynvml .NVML_AGGREGATE_ECC ,
294- pynvml .NVML_MEMORY_LOCATION_SRAM ,
295- )
296- if mdev_mem_ecc_errors > 0 :
297- mdev_mem_status = DeviceMemoryStatusEnum .UNHEALTHY
291+ if not envs .GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK :
292+ mdev_mem_ecc_errors = (
293+ pynvml .nvmlDeviceGetMemoryErrorCounter (
294+ mdev ,
295+ pynvml .NVML_MEMORY_ERROR_TYPE_UNCORRECTED ,
296+ pynvml .NVML_AGGREGATE_ECC ,
297+ pynvml .NVML_MEMORY_LOCATION_SRAM ,
298+ )
299+ )
300+ if mdev_mem_ecc_errors > 0 :
301+ mdev_mem_status = DeviceMemoryStatusEnum .UNHEALTHY
298302
299303 mdev_appendix = {
300304 "arch_family" : _get_arch_family (dev_cc_t ),
0 commit comments