Skip to content

Commit ef08f3e

Browse files
authored
Add ability to specify GPUs by UUID prefix (mlco2#923)
* Add the ability to pass UUID prefixes as a way of specifying which GPUs to track. This is to address mlco2#873. Prior to this change you could only pass an index into the number of GPUs on the system. Now you can pass a UUID prefix, including the 'MIG-' prefix per https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-environment-variables if desired. Note that I have not been able to test this on a real-life repo. The reporter of mlco2#873 was not able to provide a repro. * Update logging and documentation for GPU identifier prefix matching. * Update the rst parameters file for GPU prefix matching and generate the matching html file.
1 parent c436078 commit ef08f3e

7 files changed

Lines changed: 114 additions & 60 deletions

File tree

codecarbon/core/config.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import configparser
22
import os
33
from pathlib import Path
4-
from typing import List
4+
from typing import List, Union
55

66
from codecarbon.external.logger import logger
77

@@ -44,24 +44,33 @@ def parse_env_config() -> dict:
4444
}
4545

4646

47-
def parse_gpu_ids(gpu_ids_str: str) -> List[int]:
47+
def parse_gpu_ids(gpu_ids: Union[str, List[int]]) -> List[str]:
4848
"""
49-
Transforms the potential gpu_ids string into a list of int values
49+
Transforms the potential gpu_ids into a list of string id values.
50+
5051
5152
Args:
52-
gpu_ids_str (str): The config file or environment variable value for `gpu_ids`
53-
which is read as a string and should be parsed into a list of ints
53+
gpu_ids: The config file or environment variable value for `gpu_ids`
5454
5555
Returns:
56-
list[int]: The list of GPU ids available declared by the user.
56+
list[str]: The list of GPU ids available.
5757
Potentially empty.
5858
"""
59-
if not isinstance(gpu_ids_str, str):
60-
return gpu_ids_str
61-
62-
gpu_ids_str = "".join(c for c in gpu_ids_str if (c.isalnum() or c == ","))
63-
str_ids = [gpu_id for gpu_id in gpu_ids_str.split(",") if gpu_id]
64-
return list(map(int, str_ids))
59+
if isinstance(gpu_ids, str):
60+
# Allow '-' in id strings since UUIDs may include them.
61+
gpu_ids = "".join(c for c in gpu_ids if (c.isalnum() or c in ("-", ",")))
62+
str_ids = [gpu_id for gpu_id in gpu_ids.split(",") if gpu_id]
63+
return str_ids
64+
65+
elif isinstance(gpu_ids, list) and all(
66+
isinstance(gpu_id, int) for gpu_id in gpu_ids
67+
):
68+
return list(map(str, gpu_ids))
69+
70+
else:
71+
logger.warning(
72+
"Invalid gpu_ids format. Expected a string or a list of ints/strings."
73+
)
6574

6675

6776
def get_hierarchical_config():

codecarbon/core/resource_tracker.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -183,18 +183,11 @@ def set_CPU_tracking(self):
183183
def set_GPU_tracking(self):
184184
logger.info("[setup] GPU Tracking...")
185185
if self.tracker._gpu_ids:
186-
# If _gpu_ids is a string or a list of int, parse it to a list of ints
187-
if isinstance(self.tracker._gpu_ids, str) or (
188-
isinstance(self.tracker._gpu_ids, list)
189-
and all(isinstance(gpu_id, int) for gpu_id in self.tracker._gpu_ids)
190-
):
191-
self.tracker._gpu_ids: List[int] = parse_gpu_ids(self.tracker._gpu_ids)
186+
self.tracker._gpu_ids = parse_gpu_ids(self.tracker._gpu_ids)
187+
if self.tracker._gpu_ids:
192188
self.tracker._conf["gpu_ids"] = self.tracker._gpu_ids
193189
self.tracker._conf["gpu_count"] = len(self.tracker._gpu_ids)
194-
else:
195-
logger.warning(
196-
"Invalid gpu_ids format. Expected a string or a list of ints."
197-
)
190+
198191
if gpu.is_gpu_details_available():
199192
logger.info("Tracking Nvidia GPU via pynvml")
200193
gpu_devices = GPU.from_utils(self.tracker._gpu_ids)

codecarbon/external/hardware.py

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -102,25 +102,43 @@ def _get_gpu_ids(self) -> Iterable[int]:
102102
Get the Ids of the GPUs that we will monitor
103103
:return: list of ids
104104
"""
105-
gpu_ids = []
106105
if self.gpu_ids is not None:
107-
# Check that the provided GPU ids are valid
108-
if not set(self.gpu_ids).issubset(set(range(self.num_gpus))):
109-
logger.warning(
110-
f"Unknown GPU ids {gpu_ids}, only {self.num_gpus} GPUs available."
111-
)
112-
# Keep only the GPUs that are in the provided list
113-
for gpu_id in range(self.num_gpus):
114-
if gpu_id in self.gpu_ids:
115-
gpu_ids.append(gpu_id)
106+
uuids_to_ids = {
107+
gpu.get("uuid"): gpu.get("gpu_index")
108+
for gpu in self.devices.get_gpu_static_info()
109+
}
110+
monitored_gpu_ids = []
111+
112+
for gpu_id in self.gpu_ids:
113+
found_gpu_id = False
114+
# Does it look like an index into the number of GPUs on the system?
115+
if isinstance(gpu_id, int) or gpu_id.isdigit():
116+
gpu_id = int(gpu_id)
117+
if 0 <= gpu_id < self.num_gpus:
118+
monitored_gpu_ids.append(gpu_id)
119+
found_gpu_id = True
120+
# Does it match a prefix of any UUID on the system after stripping any 'MIG-'
121+
# id prefix per https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-environment-variables ?
116122
else:
117-
logger.info(
118-
f"GPU number {gpu_id} will not be monitored, at your request."
123+
stripped_gpu_id_str = gpu_id.lstrip("MIG-")
124+
for uuid, id in uuids_to_ids.items():
125+
if uuid.startswith(stripped_gpu_id_str):
126+
logger.debug(
127+
f"Matching GPU ID {stripped_gpu_id_str} (originally {gpu_id}) against {uuid} for GPU index {id}"
128+
)
129+
monitored_gpu_ids.append(id)
130+
found_gpu_id = True
131+
break
132+
if not found_gpu_id:
133+
logger.warning(
134+
f"GPU with ID '{gpu_id}' not found or invalid. It will be ignored."
119135
)
120-
self.gpu_ids = gpu_ids
136+
137+
monitored_gpu_ids = sorted(list(set(monitored_gpu_ids)))
138+
self.gpu_ids = monitored_gpu_ids
139+
return monitored_gpu_ids
121140
else:
122-
gpu_ids = set(range(self.num_gpus))
123-
return gpu_ids
141+
return list(range(self.num_gpus))
124142

125143
def total_power(self) -> Power:
126144
return self._total_power
@@ -135,7 +153,7 @@ def from_utils(cls, gpu_ids: Optional[List] = None) -> "GPU":
135153
new_gpu_ids = gpus._get_gpu_ids()
136154
if len(new_gpu_ids) < gpus.num_gpus:
137155
logger.warning(
138-
f"You have {gpus.num_gpus} GPUs but we will monitor only {len(gpu_ids)} of them. Check your configuration."
156+
f"You have {gpus.num_gpus} GPUs but we will monitor only {len(new_gpu_ids)} ({new_gpu_ids}) of them. Check your configuration."
139157
)
140158
return cls(gpu_ids=new_gpu_ids)
141159

docs/edit/parameters.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@ Input Parameters
2525
- | ``machine`` measure the power consumption of the entire machine (default)
2626
| ``process`` try to isolate the tracked processes
2727
* - gpu_ids
28-
- User-specified known gpu ids to track, defaults to ``None``
28+
- | Comma-separated list of GPU ids to track, defaults to ``None``
29+
| These can either be integer indexes of GPUs on the system, or prefixes
30+
| to match against GPU identifiers as described `here <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-environment-variables>`_
2931
* - log_level
3032
- | Global codecarbon log level (by order of verbosity): "debug", "info" (defaults),
3133
| "warning", "error", or "critical"

docs/parameters.html

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,12 @@ <h2>Input Parameters<a class="headerlink" href="#input-parameters" title="Link t
131131
</td>
132132
</tr>
133133
<tr class="row-even"><td><p>gpu_ids</p></td>
134-
<td><p>User-specified known gpu ids to track, defaults to <code class="docutils literal notranslate"><span class="pre">None</span></code></p></td>
134+
<td><div class="line-block">
135+
<div class="line">Comma-separated list of GPU ids to track, defaults to <code class="docutils literal notranslate"><span class="pre">None</span></code></div>
136+
<div class="line">These can either be integer indexes of GPUs on the system, or prefixes</div>
137+
<div class="line">to match against GPU identifiers as described <a class="reference external" href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-environment-variables">here</a></div>
138+
</div>
139+
</td>
135140
</tr>
136141
<tr class="row-odd"><td><p>log_level</p></td>
137142
<td><div class="line-block">

tests/test_config.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,16 @@ def test_clean_env_key(self):
3232

3333
def test_parse_gpu_ids(self):
3434
for ids, target in [
35-
("0,1,2", [0, 1, 2]),
36-
("[0, 1, 2", [0, 1, 2]),
37-
("(0, 1, 2)", [0, 1, 2]),
38-
("[1]", [1]),
39-
("1", [1]),
40-
("0", [0]),
35+
("0,1,2", ["0", "1", "2"]),
36+
("[0, 1, 2", ["0", "1", "2"]),
37+
("(0, 1, 2)", ["0", "1", "2"]),
38+
("[1]", ["1"]),
39+
("1", ["1"]),
40+
("0", ["0"]),
41+
("MIG-f1e", ["MIG-f1e"]),
4142
("", []),
4243
([], []),
43-
([1, 2, 3], [1, 2, 3]),
44-
(1, 1),
44+
([1, 2, 3], ["1", "2", "3"]),
4545
]:
4646
self.assertEqual(parse_gpu_ids(ids), target)
4747

@@ -101,6 +101,7 @@ def test_read_confs(self):
101101
"USER": "useless key",
102102
"CODECARBON_ENV_OVERWRITE": "SUCCESS:overwritten",
103103
"CODECARBON_ENV_NEW_KEY": "cool value",
104+
"CODECARBON_ALLOW_MULTIPLE_RUNS": "True",
104105
},
105106
)
106107
def test_read_confs_and_parse_envs(self):
@@ -145,9 +146,8 @@ def test_empty_conf(self):
145146
"builtins.open", new_callable=get_custom_mock_open(global_conf, local_conf)
146147
):
147148
conf = dict(get_hierarchical_config())
148-
target = {
149-
"allow_multiple_runs": "True"
150-
} # allow_multiple_runs is a default value
149+
# allow_multiple_runs is set in pytest.ini and not mocked, so it's visible here.
150+
target = {"allow_multiple_runs": "True"}
151151
self.assertDictEqual(conf, target)
152152

153153
@mock.patch.dict(
@@ -190,7 +190,7 @@ def test_full_hierarchy(self):
190190
self.assertEqual(tracker._force_ram_power, 50.5)
191191
self.assertEqual(tracker._output_dir, "/success/overwritten")
192192
self.assertEqual(tracker._emissions_endpoint, "http://testhost:2000")
193-
self.assertEqual(tracker._gpu_ids, [0, 1])
193+
self.assertEqual(tracker._gpu_ids, ["0", "1"])
194194
self.assertEqual(tracker._co2_signal_api_token, "signal-token")
195195
self.assertEqual(tracker._project_name, "test-project")
196196
self.assertTrue(tracker._save_to_file)
@@ -206,7 +206,7 @@ def test_gpu_ids_from_env(self):
206206
tracker = EmissionsTracker(
207207
project_name="test-project", allow_multiple_runs=True
208208
)
209-
self.assertEqual(tracker._gpu_ids, [2, 3])
209+
self.assertEqual(tracker._gpu_ids, ["2", "3"])
210210

211211
@mock.patch.dict(
212212
os.environ,
@@ -220,7 +220,7 @@ def test_too_much_gpu_ids_in_env(self):
220220
tracker = EmissionsTracker(
221221
project_name="test-project", allow_multiple_runs=True
222222
)
223-
self.assertEqual(tracker._gpu_ids, [99])
223+
self.assertEqual(tracker._gpu_ids, ["99"])
224224
gpu_count = 0
225225
for hardware in tracker._hardware:
226226
if isinstance(hardware, GPU):

tests/test_gpu.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def setup_method(self):
5050
self.DETAILS = {
5151
"handle_0": {
5252
"name": b"GeForce GTX 1080",
53-
"uuid": b"uuid#1",
53+
"uuid": b"uuid-1",
5454
"memory": real_pynvml.c_nvmlMemory_t(1024, 100, 924),
5555
"temperature": 75,
5656
"power_usage": 26,
@@ -66,7 +66,7 @@ def setup_method(self):
6666
},
6767
"handle_1": {
6868
"name": b"GeForce GTX 1080",
69-
"uuid": b"uuid#2",
69+
"uuid": b"uuid-2",
7070
"memory": real_pynvml.c_nvmlMemory_t(1024, 200, 824),
7171
"temperature": 79,
7272
"power_usage": 29,
@@ -84,7 +84,7 @@ def setup_method(self):
8484
self.expected = [
8585
{
8686
"name": "GeForce GTX 1080",
87-
"uuid": "uuid#1",
87+
"uuid": "uuid-1",
8888
"total_memory": 1024,
8989
"free_memory": 100,
9090
"used_memory": 924,
@@ -102,7 +102,7 @@ def setup_method(self):
102102
},
103103
{
104104
"name": "GeForce GTX 1080",
105-
"uuid": "uuid#2",
105+
"uuid": "uuid-2",
106106
"total_memory": 1024,
107107
"free_memory": 200,
108108
"used_memory": 824,
@@ -146,14 +146,14 @@ def test_static_gpu_info(self):
146146
expected = [
147147
{
148148
"name": "GeForce GTX 1080",
149-
"uuid": "uuid#1",
149+
"uuid": "uuid-1",
150150
"total_memory": 1024,
151151
"power_limit": 149,
152152
"gpu_index": 0,
153153
},
154154
{
155155
"name": "GeForce GTX 1080",
156-
"uuid": "uuid#2",
156+
"uuid": "uuid-2",
157157
"total_memory": 1024,
158158
"power_limit": 149,
159159
"gpu_index": 1,
@@ -311,6 +311,33 @@ def mock_nvmlDeviceGetTotalEnergyConsumption(handle):
311311
expected_power = gpu2_power
312312
tc.assertAlmostEqual(expected_power.kW, gpu.total_power().kW)
313313

314+
def test_get_gpu_ids(self):
315+
"""
316+
Check parsing of gpu_ids in various forms.
317+
"""
318+
# Prepare
319+
from codecarbon.external.hardware import GPU
320+
321+
for test_ids, expected_ids in [
322+
([0, 1], [0, 1]),
323+
([0, 1, 2], [0, 1]),
324+
([2], []),
325+
(["0", "1"], [0, 1]),
326+
# Only two GPUS in the system, so ignore the third (index 2)
327+
(["0", "1", "2"], [0, 1]),
328+
(["2"], []),
329+
# Check UUID-to-index mapping
330+
(["uuid-1"], [0]),
331+
(["uuid-1", "uuid-2"], [0, 1]),
332+
(["uuid-3"], []),
333+
# Check UUID-to-index mapping when we need to strip the prefix
334+
(["MIG-uuid-1"], [0]),
335+
(["MIG-uuid-3"], []),
336+
]:
337+
gpu = GPU(test_ids)
338+
result = gpu._get_gpu_ids()
339+
assert result == expected_ids
340+
314341

315342
class TestGpuNotAvailable:
316343
def setup_method(self):

0 commit comments

Comments
 (0)