-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmetrics.py
More file actions
90 lines (70 loc) · 2.4 KB
/
metrics.py
File metadata and controls
90 lines (70 loc) · 2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
CPU/GPU Metrics
===============
Get information relating to the usage of the CPU and GPU (where applicable)
"""
import contextlib
import logging
import psutil
from .pynvml import (
nvmlDeviceGetComputeRunningProcesses,
nvmlDeviceGetCount,
nvmlDeviceGetGraphicsRunningProcesses,
nvmlDeviceGetHandleByIndex,
nvmlDeviceGetMemoryInfo,
nvmlDeviceGetUtilizationRates,
nvmlInit,
nvmlShutdown,
)
logger = logging.getLogger(__name__)
def get_process_memory(processes: list[psutil.Process]) -> float:
    """
    Get the resident set size

    Sums ``memory_info().rss`` over all of the given processes, dividing each
    value by 1024 twice (bytes to MiB, as psutil reports ``rss`` in bytes).

    Parameters
    ----------
    processes : list[psutil.Process]
        processes whose memory usage should be accumulated

    Returns
    -------
    float
        total resident set size of the processes which could be queried,
        ``0.0`` if there were none
    """
    rss: float = 0.0
    for process in processes:
        # Best-effort: a process which can no longer be queried (for example
        # because it has exited) is skipped rather than failing the whole sum.
        with contextlib.suppress(Exception):
            rss += process.memory_info().rss / 1024 / 1024
    return rss
def get_process_cpu(
    processes: list[psutil.Process], interval: float | None = None
) -> float:
    """
    Get the CPU usage

    Sums ``cpu_percent(interval=interval)`` over all of the given processes.

    If first time being called, use a small interval to collect initial CPU metrics.

    Parameters
    ----------
    processes : list[psutil.Process]
        processes whose CPU usage should be accumulated
    interval : float | None, optional
        passed unchanged to each process's ``cpu_percent`` call, so it is
        applied once per process rather than once overall

    Returns
    -------
    float
        summed CPU percentage of the processes which could be queried,
        ``0.0`` if there were none
    """
    cpu_percent: float = 0.0
    for process in processes:
        # Best-effort: a process which can no longer be queried (for example
        # because it has exited) is skipped rather than failing the whole sum.
        with contextlib.suppress(Exception):
            cpu_percent += process.cpu_percent(interval=interval)
    return cpu_percent
def is_gpu_used(handle, processes: list[psutil.Process]) -> bool:
    """
    Report whether any of the given processes is running on a GPU

    The PIDs of ``processes`` are compared against the PIDs NVML lists as
    compute and graphics processes for the device behind ``handle``.

    Parameters
    ----------
    handle
        NVML device handle, as returned by ``nvmlDeviceGetHandleByIndex``
    processes : list[psutil.Process]
        processes to look for on the device

    Returns
    -------
    bool
        True if at least one PID appears in both sets
    """
    wanted_pids = {proc.pid for proc in processes}

    device_pids = {
        entry.pid for entry in nvmlDeviceGetComputeRunningProcesses(handle)
    }
    device_pids.update(
        entry.pid for entry in nvmlDeviceGetGraphicsRunningProcesses(handle)
    )

    # A shared PID means one of our processes is using this device.
    return not device_pids.isdisjoint(wanted_pids)
def get_gpu_metrics(processes: list[psutil.Process]) -> dict[str, float]:
    """
    Get GPU metrics

    For every NVML device that is in use by one of the given processes,
    record its utilisation and memory figures.

    Parameters
    ----------
    processes : list[psutil.Process]
        processes whose GPU usage is of interest

    Returns
    -------
    dict[str, float]
        metrics keyed by ``resources/gpu.utilisation.percent.<index>`` and
        ``resources/gpu.memory.percent.<index>``. Any NVML failure is
        swallowed, so the result is empty (or partial) rather than an error.
    """
    gpu_metrics: dict[str, float] = {}

    # Best-effort: any NVML error simply yields whatever metrics were
    # gathered up to that point.
    with contextlib.suppress(Exception):
        nvmlInit()
        try:
            for i in range(nvmlDeviceGetCount()):
                handle = nvmlDeviceGetHandleByIndex(i)
                if not is_gpu_used(handle, processes):
                    continue

                utilisation_percent = nvmlDeviceGetUtilizationRates(handle).gpu
                memory = nvmlDeviceGetMemoryInfo(handle)
                # NOTE(review): this is the percentage of memory that is
                # *free*, not used - confirm this is what consumers of
                # "gpu.memory.percent" expect before changing it.
                memory_percent = 100 * memory.free / memory.total

                gpu_metrics[f"resources/gpu.utilisation.percent.{i}"] = (
                    utilisation_percent
                )
                gpu_metrics[f"resources/gpu.memory.percent.{i}"] = memory_percent
        finally:
            # Always release NVML once it has been initialised, even if a
            # query above raised; previously the shutdown was skipped on error.
            nvmlShutdown()
    return gpu_metrics