Skip to content

Commit c5ac425

Browse files
committed
refactor: disable ecc errors check default
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent f4b8c8d commit c5ac425

9 files changed

Lines changed: 114 additions & 88 deletions

File tree

gpustack_runtime/detector/amd.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -179,25 +179,27 @@ def detect(self) -> Devices | None:
179179
dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
180180
dev_mem = dev_gpu_vram_usage.get("vram_total")
181181
dev_mem_used = dev_gpu_vram_usage.get("vram_used")
182-
dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count(
183-
dev,
184-
pyamdsmi.AmdSmiGpuBlock.UMC,
185-
)
186-
if dev_ecc_count.get("uncorrectable_count", 0) > 0:
187-
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
182+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
183+
dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count(
184+
dev,
185+
pyamdsmi.AmdSmiGpuBlock.UMC,
186+
)
187+
if dev_ecc_count.get("uncorrectable_count", 0) > 0:
188+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
188189
except pyamdsmi.AmdSmiException:
189190
dev_mem = byte_to_mebibyte( # byte to MiB
190191
pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
191192
)
192193
dev_mem_used = byte_to_mebibyte( # byte to MiB
193194
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
194195
)
195-
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
196-
dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
197-
dev_idx,
198-
)
199-
if dev_ecc_count.uncorrectable_err > 0:
200-
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
196+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
197+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
198+
dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
199+
dev_idx,
200+
)
201+
if dev_ecc_count.uncorrectable_err > 0:
202+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
201203

202204
dev_power = None
203205
dev_power_used = None

gpustack_runtime/detector/ascend.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -402,19 +402,20 @@ def _get_device_memory_status(dev_card_id, dev_device_id) -> DeviceMemoryStatusE
402402
DeviceMemoryStatusEnum indicating the ECC status.
403403
404404
"""
405-
for dev_mem_type in [pydcmi.DCMI_DEVICE_TYPE_HBM, pydcmi.DCMI_DEVICE_TYPE_DDR]:
406-
with contextlib.suppress(pydcmi.DCMIError):
407-
dev_ecc_info = pydcmi.dcmi_get_device_ecc_info(
408-
dev_card_id,
409-
dev_device_id,
410-
dev_mem_type,
411-
)
412-
if dev_ecc_info.enable_flag and (
413-
dev_ecc_info.single_bit_error_cnt > 0
414-
or dev_ecc_info.double_bit_error_cnt > 0
415-
):
416-
return DeviceMemoryStatusEnum.UNHEALTHY
417-
return DeviceMemoryStatusEnum.HEALTHY
405+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
406+
for dev_mem_type in [pydcmi.DCMI_DEVICE_TYPE_HBM, pydcmi.DCMI_DEVICE_TYPE_DDR]:
407+
with contextlib.suppress(pydcmi.DCMIError):
408+
dev_ecc_info = pydcmi.dcmi_get_device_ecc_info(
409+
dev_card_id,
410+
dev_device_id,
411+
dev_mem_type,
412+
)
413+
if dev_ecc_info.enable_flag and (
414+
dev_ecc_info.single_bit_error_cnt > 0
415+
or dev_ecc_info.double_bit_error_cnt > 0
416+
):
417+
return DeviceMemoryStatusEnum.UNHEALTHY
418+
return DeviceMemoryStatusEnum.HEALTHY
418419

419420
return DeviceMemoryStatusEnum.HEALTHY
420421

gpustack_runtime/detector/hygon.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -157,12 +157,13 @@ def detect(self) -> Devices | None:
157157
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
158158
)
159159
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
160-
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
161-
dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
162-
dev_idx,
163-
)
164-
if dev_ecc_count.uncorrectable_err > 0:
165-
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
160+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
161+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
162+
dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
163+
dev_idx,
164+
)
165+
if dev_ecc_count.uncorrectable_err > 0:
166+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
166167

167168
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
168169
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)

gpustack_runtime/detector/iluvatar.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -138,17 +138,18 @@ def detect(self) -> Devices | None:
138138
dev_mem = 0
139139
dev_mem_used = 0
140140
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
141-
with contextlib.suppress(pyixml.NVMLError):
142-
dev_mem_info = pyixml.nvmlDeviceGetMemoryInfo(dev)
143-
dev_mem = byte_to_mebibyte( # byte to MiB
144-
dev_mem_info.total,
145-
)
146-
dev_mem_used = byte_to_mebibyte( # byte to MiB
147-
dev_mem_info.used,
148-
)
149-
dev_health = pyixml.ixmlDeviceGetHealth(dev)
150-
if dev_health != pyixml.IXML_HEALTH_OK:
151-
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
141+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
142+
with contextlib.suppress(pyixml.NVMLError):
143+
dev_mem_info = pyixml.nvmlDeviceGetMemoryInfo(dev)
144+
dev_mem = byte_to_mebibyte( # byte to MiB
145+
dev_mem_info.total,
146+
)
147+
dev_mem_used = byte_to_mebibyte( # byte to MiB
148+
dev_mem_info.used,
149+
)
150+
dev_health = pyixml.ixmlDeviceGetHealth(dev)
151+
if dev_health != pyixml.IXML_HEALTH_OK:
152+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
152153

153154
dev_cores_util = None
154155
with contextlib.suppress(pyixml.NVMLError):

gpustack_runtime/detector/metax.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,10 +148,11 @@ def detect(self) -> Devices | None:
148148
dev_mem_info.vramUse,
149149
)
150150
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
151-
with contextlib.suppress(pymxsml.MXSMLError):
152-
dev_ecc_errors = pymxsml.mxSmlGetTotalEccErrors(dev_idx)
153-
if dev_ecc_errors.dramUE > 0:
154-
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
151+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
152+
with contextlib.suppress(pymxsml.MXSMLError):
153+
dev_ecc_errors = pymxsml.mxSmlGetTotalEccErrors(dev_idx)
154+
if dev_ecc_errors.dramUE > 0:
155+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
155156

156157
dev_temp = (
157158
pymxsml.mxSmlGetTemperatureInfo(

gpustack_runtime/detector/mthreads.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -150,14 +150,15 @@ def detect(self) -> Devices | None:
150150
dev_mem_used = byte_to_mebibyte( # byte to MiB
151151
pymtml.mtmlMemoryGetUsed(devmem),
152152
)
153-
dev_mem_ecc_errors = pymtml.mtmlMemoryGetEccErrorCounter(
154-
devmem,
155-
pymtml.MTML_MEMORY_ERROR_TYPE_UNCORRECTED,
156-
pymtml.MTML_VOLATILE_ECC,
157-
pymtml.MTML_MEMORY_LOCATION_DRAM,
158-
)
159-
if dev_mem_ecc_errors > 0:
160-
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
153+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
154+
dev_mem_ecc_errors = pymtml.mtmlMemoryGetEccErrorCounter(
155+
devmem,
156+
pymtml.MTML_MEMORY_ERROR_TYPE_UNCORRECTED,
157+
pymtml.MTML_VOLATILE_ECC,
158+
pymtml.MTML_MEMORY_LOCATION_DRAM,
159+
)
160+
if dev_mem_ecc_errors > 0:
161+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
161162

162163
dev_cores_util = None
163164
dev_temp = None

gpustack_runtime/detector/nvidia.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -207,14 +207,15 @@ def detect(self) -> Devices | None: # noqa: PLR0915
207207
dev_mem_used = byte_to_mebibyte( # byte to MiB
208208
dev_mem_info.used,
209209
)
210-
dev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
211-
dev,
212-
pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
213-
pynvml.NVML_VOLATILE_ECC,
214-
pynvml.NVML_MEMORY_LOCATION_DRAM,
215-
)
216-
if dev_mem_ecc_errors > 0:
217-
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
210+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
211+
dev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
212+
dev,
213+
pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
214+
pynvml.NVML_VOLATILE_ECC,
215+
pynvml.NVML_MEMORY_LOCATION_DRAM,
216+
)
217+
if dev_mem_ecc_errors > 0:
218+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
218219
if dev_mem == 0:
219220
dev_mem, dev_mem_used = get_memory()
220221

@@ -287,14 +288,17 @@ def detect(self) -> Devices | None: # noqa: PLR0915
287288
mdev_mem_used = byte_to_mebibyte( # byte to MiB
288289
mdev_mem_info.used,
289290
)
290-
mdev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
291-
mdev,
292-
pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
293-
pynvml.NVML_AGGREGATE_ECC,
294-
pynvml.NVML_MEMORY_LOCATION_SRAM,
295-
)
296-
if mdev_mem_ecc_errors > 0:
297-
mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
291+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
292+
mdev_mem_ecc_errors = (
293+
pynvml.nvmlDeviceGetMemoryErrorCounter(
294+
mdev,
295+
pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
296+
pynvml.NVML_AGGREGATE_ECC,
297+
pynvml.NVML_MEMORY_LOCATION_SRAM,
298+
)
299+
)
300+
if mdev_mem_ecc_errors > 0:
301+
mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
298302

299303
mdev_appendix = {
300304
"arch_family": _get_arch_family(dev_cc_t),

gpustack_runtime/detector/thead.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -199,14 +199,15 @@ def detect(self) -> Devices | None:
199199
dev_mem_used = byte_to_mebibyte( # byte to MiB
200200
dev_mem_info.used,
201201
)
202-
dev_mem_ecc_errors = pyhgml.hgmlDeviceGetMemoryErrorCounter(
203-
dev,
204-
pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
205-
pyhgml.HGML_VOLATILE_ECC,
206-
pyhgml.HGML_MEMORY_LOCATION_DRAM,
207-
)
208-
if dev_mem_ecc_errors > 0:
209-
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
202+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
203+
dev_mem_ecc_errors = pyhgml.hgmlDeviceGetMemoryErrorCounter(
204+
dev,
205+
pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
206+
pyhgml.HGML_VOLATILE_ECC,
207+
pyhgml.HGML_MEMORY_LOCATION_DRAM,
208+
)
209+
if dev_mem_ecc_errors > 0:
210+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
210211

211212
dev_is_vgpu = False
212213
if dev_bdf:
@@ -270,14 +271,17 @@ def detect(self) -> Devices | None:
270271
mdev_mem_used = byte_to_mebibyte( # byte to MiB
271272
mdev_mem_info.used,
272273
)
273-
mdev_mem_ecc_errors = pyhgml.hgmlDeviceGetMemoryErrorCounter(
274-
mdev,
275-
pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
276-
pyhgml.HGML_AGGREGATE_ECC,
277-
pyhgml.HGML_MEMORY_LOCATION_SRAM,
278-
)
279-
if mdev_mem_ecc_errors > 0:
280-
mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
274+
if not envs.GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK:
275+
mdev_mem_ecc_errors = (
276+
pyhgml.hgmlDeviceGetMemoryErrorCounter(
277+
mdev,
278+
pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
279+
pyhgml.HGML_AGGREGATE_ECC,
280+
pyhgml.HGML_MEMORY_LOCATION_SRAM,
281+
)
282+
)
283+
if mdev_mem_ecc_errors > 0:
284+
mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
281285

282286
mdev_appendix = {
283287
"vgpu": True,

gpustack_runtime/envs.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,18 +36,23 @@
3636
"""
3737
GPUSTACK_RUNTIME_DETECT_NO_PCI_CHECK: bool = False
3838
"""
39-
Enable no PCI check during detection.
39+
Set to true to disable the PCI check during detection.
4040
Useful for WSL environments, where PCI information may not be available.
4141
"""
4242
GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL: bool = False
4343
"""
44-
Enable only using management libraries calls during detection.
44+
Set to true to disable toolkit calls during detection.
4545
Device detection typically involves calling platform-side management libraries and platform-side toolkit to retrieve extra information.
4646
For example, during NVIDIA detection, the NVML and CUDA are called, with CUDA used to retrieve GPU cores.
4747
However, if certain toolchains are not correctly installed in the environment,
4848
such as the NVIDIA Fabric Manager being missing, calling CUDA can block.
4949
Enabling this parameter can prevent blocking events.
5050
"""
51+
GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK: bool = True
52+
"""
53+
Set to true (the default) to disable the ECC error check during detection,
54+
which is used to determine the health status of the device.
55+
"""
5156
GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY: dict[str, str] | None = None
5257
"""
5358
The detected backend mapping to resource keys,
@@ -393,6 +398,12 @@
393398
"0",
394399
),
395400
),
401+
"GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK": lambda: to_bool(
402+
getenv(
403+
"GPUSTACK_RUNTIME_DETECT_NO_HEALTH_CHECK",
404+
"1",
405+
),
406+
),
396407
"GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY": lambda: to_dict(
397408
getenv(
398409
"GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY",

0 commit comments

Comments
 (0)