-
Notifications
You must be signed in to change notification settings - Fork 277
Fix PinnedMemoryResource IPC NUMA ID derivation #1699
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
069bdaa
61810bd
e55a26b
2902578
2dd3d1a
c7f85ff
4c36462
629e142
217dbf1
885985a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,25 +1,31 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from cuda.bindings cimport cydriver | ||
| from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions | ||
| from cuda.core._memory._memory_pool cimport ( | ||
| _MemPool, MP_init_create_pool, MP_raise_release_threshold, | ||
| ) | ||
| from cuda.core._memory cimport _ipc | ||
| from cuda.core._memory._ipc cimport IPCAllocationHandle | ||
| from cuda.core._resource_handles cimport ( | ||
| as_cu, | ||
| get_device_mempool, | ||
| ) | ||
| from cuda.core._utils.cuda_utils cimport ( | ||
| check_or_create_options, | ||
| HANDLE_RETURN, | ||
| ) | ||
| from cpython.mem cimport PyMem_Malloc, PyMem_Free | ||
|
|
||
| from dataclasses import dataclass | ||
| import multiprocessing | ||
| import platform # no-cython-lint | ||
| import uuid | ||
|
|
||
| from cuda.core._utils.cuda_utils import check_multiprocessing_start_method | ||
| from cuda.core._resource_handles cimport as_cu | ||
|
|
||
| __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] | ||
|
|
||
|
|
@@ -122,27 +128,26 @@ cdef class DeviceMemoryResource(_MemPool): | |
| associated MMR. | ||
| """ | ||
|
|
||
| def __init__(self, device_id: Device | int, options=None): | ||
| from .._device import Device | ||
| cdef int dev_id = Device(device_id).device_id | ||
| cdef DeviceMemoryResourceOptions opts = check_or_create_options( | ||
| DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", | ||
| keep_none=True | ||
| ) | ||
| cdef _MemPoolOptions opts_base = _MemPoolOptions() | ||
|
|
||
| cdef bint ipc_enabled = False | ||
| if opts: | ||
| ipc_enabled = opts.ipc_enabled | ||
| if ipc_enabled and not _ipc.is_supported(): | ||
| raise RuntimeError("IPC is not available on {platform.system()}") | ||
| opts_base._max_size = opts.max_size | ||
| opts_base._use_current = False | ||
| opts_base._ipc_enabled = ipc_enabled | ||
| opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE | ||
| opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED | ||
| def __cinit__(self, *args, **kwargs): | ||
| self._dev_id = cydriver.CU_DEVICE_INVALID | ||
| self._peer_accessible_by = () | ||
|
|
||
| super().__init__(dev_id, opts_base) | ||
| def __init__(self, device_id: Device | int, options=None): | ||
| _DMR_init(self, device_id, options) | ||
|
|
||
| def __dealloc__(self): | ||
| try: | ||
| self.close() | ||
| except Exception: | ||
| pass | ||
|
|
||
| def close(self): | ||
| """Close the memory resource, revoking peer access before destruction.""" | ||
| # nvbug 5698116: clear peer access before pool destruction; also | ||
| # needed for non-owned (default) pools to undo modifications. | ||
| if self._peer_accessible_by: | ||
| _DMR_set_peer_accessible_by(self, []) | ||
| super().close() | ||
|
|
||
| def __reduce__(self): | ||
| return DeviceMemoryResource.from_registry, (self.uuid,) | ||
|
|
@@ -215,6 +220,37 @@ cdef class DeviceMemoryResource(_MemPool): | |
| raise RuntimeError("Memory resource is not IPC-enabled") | ||
| return self._ipc_data._alloc_handle | ||
|
|
||
| @property | ||
| def device_id(self) -> int: | ||
| """The associated device ordinal.""" | ||
| return self._dev_id | ||
|
|
||
| @property | ||
| def peer_accessible_by(self): | ||
| """ | ||
| Get or set the devices that can access allocations from this memory | ||
| pool. Access can be modified at any time and affects all allocations | ||
| from this memory pool. | ||
|
|
||
| Returns a tuple of sorted device IDs that currently have peer access to | ||
| allocations from this memory pool. | ||
|
|
||
| When setting, accepts a sequence of Device objects or device IDs. | ||
| Setting to an empty sequence revokes all peer access. | ||
|
|
||
| Examples | ||
| -------- | ||
| >>> dmr = DeviceMemoryResource(0) | ||
| >>> dmr.peer_accessible_by = [1] # Grant access to device 1 | ||
| >>> assert dmr.peer_accessible_by == (1,) | ||
| >>> dmr.peer_accessible_by = [] # Revoke access | ||
| """ | ||
| return self._peer_accessible_by | ||
|
|
||
| @peer_accessible_by.setter | ||
| def peer_accessible_by(self, devices): | ||
| _DMR_set_peer_accessible_by(self, devices) | ||
|
|
||
| @property | ||
| def is_device_accessible(self) -> bool: | ||
| """Return True. This memory resource provides device-accessible buffers.""" | ||
|
|
@@ -226,6 +262,82 @@ cdef class DeviceMemoryResource(_MemPool): | |
| return False | ||
|
|
||
|
|
||
| cdef inline _DMR_set_peer_accessible_by(DeviceMemoryResource self, devices): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like something that should almost certainly never be invoked in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, yeah this was a mess. I removed all peer-access-clearing logic from Python. The real problem was that certain tests were using the default memory pool, which is persistent and should keep its peer access settings. Those tests incorrectly assumed the beginning state should be "no peer access." I updated those to simply use fresh pools. |
||
| from .._device import Device | ||
|
|
||
| cdef set[int] target_ids = {Device(dev).device_id for dev in devices} | ||
| target_ids.discard(self._dev_id) | ||
| this_dev = Device(self._dev_id) | ||
| cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)] | ||
| if bad: | ||
| raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}") | ||
| cdef set[int] cur_ids = set(self._peer_accessible_by) | ||
| cdef set[int] to_add = target_ids - cur_ids | ||
| cdef set[int] to_rm = cur_ids - target_ids | ||
| cdef size_t count = len(to_add) + len(to_rm) | ||
| cdef cydriver.CUmemAccessDesc* access_desc = NULL | ||
| cdef size_t i = 0 | ||
|
|
||
| if count > 0: | ||
| access_desc = <cydriver.CUmemAccessDesc*>PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc)) | ||
| if access_desc == NULL: | ||
| raise MemoryError("Failed to allocate memory for access descriptors") | ||
|
|
||
| try: | ||
| for dev_id in to_add: | ||
| access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE | ||
| access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE | ||
| access_desc[i].location.id = dev_id | ||
| i += 1 | ||
|
|
||
| for dev_id in to_rm: | ||
| access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE | ||
| access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE | ||
| access_desc[i].location.id = dev_id | ||
| i += 1 | ||
|
|
||
| with nogil: | ||
| HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count)) | ||
| finally: | ||
| if access_desc != NULL: | ||
| PyMem_Free(access_desc) | ||
|
|
||
| self._peer_accessible_by = tuple(target_ids) | ||
|
|
||
|
|
||
| cdef inline _DMR_init(DeviceMemoryResource self, device_id, options): | ||
| from .._device import Device | ||
| cdef int dev_id = Device(device_id).device_id | ||
| cdef DeviceMemoryResourceOptions opts = check_or_create_options( | ||
| DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", | ||
| keep_none=True | ||
| ) | ||
| cdef bint ipc_enabled = False | ||
| cdef size_t max_size = 0 | ||
|
|
||
| self._dev_id = dev_id | ||
|
|
||
| if opts is not None: | ||
| ipc_enabled = opts.ipc_enabled | ||
| if ipc_enabled and not _ipc.is_supported(): | ||
| raise RuntimeError(f"IPC is not available on {platform.system()}") | ||
| max_size = opts.max_size | ||
|
|
||
| if opts is None: | ||
| self._h_pool = get_device_mempool(dev_id) | ||
| self._mempool_owned = False | ||
| MP_raise_release_threshold(self) | ||
| else: | ||
| MP_init_create_pool( | ||
| self, | ||
| cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, | ||
| dev_id, | ||
| cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, | ||
| ipc_enabled, | ||
| max_size, | ||
| ) | ||
|
|
||
|
|
||
| # Note: this is referenced in instructions to debug nvbug 5698116. | ||
| cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id): | ||
| """ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're not supposed to call methods in
`__dealloc__`: https://cython.readthedocs.io/en/stable/src/userguide/special_methods.html#finalization-methods-dealloc-and-del — `_peer_accessible_by` is a Python object, and `_DMR_set_peer_accessible_by` looks like it's modifying the object based on the way it's being invoked. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds like a job for
`__del__`, and also adding a context manager. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was to work around a driver bug (5698116). When a mempool is destroyed and a new one allocated, if the new pool happens to reuse the previous memory address, it incorrectly inherits the peer access attributes of the destroyed pool. This code is all about clearing the peer access attributes on destruction to avoid that.
The real fix is in C++ under
`resource_handles.*` and should only be applied to owned pools created by cuda.core. The changes here, in Python, were a misadventure. This was resetting peer access for the shared default pool, which is not being destroyed but just released.