From 73d2d892eeddf871241316330e497c2c54fae914 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 4 May 2026 14:30:40 -0700 Subject: [PATCH 1/8] cuda.core: convert peer_accessible_by to a live MutableSet view DeviceMemoryResource.peer_accessible_by previously returned a sorted tuple[int, ...] backed by a Python-level cache, which was prone to divergence from driver state across multiple wrappers around the same memory pool. The setter accepted Device | int and emitted a single batched cuMemPoolSetAccess covering the diff against the cache. This commit replaces the property with a live driver-backed view: - Adds PeerAccessibleBySetProxy in _memory/_peer_access_utils.py, a collections.abc.MutableSet whose reads call cuMemPoolGetAccess and whose writes call cuMemPoolSetAccess. Iteration yields Device objects; add, discard, and __contains__ accept either a Device or a device-ordinal int. The proxy is constructed fresh on every property access, so there is nothing to cache or pickle. - Drops the _peer_accessible_by cache field (and its initializations in __cinit__, _DMR_init, and from_allocation_handle), eliminating the owned/non-owned read split. All pools now share the same code path and always query the driver. - All bulk operations on the proxy (update, |=, &=, -=, ^=, clear, pop) issue exactly one cuMemPoolSetAccess call. Peer-access transitions can take seconds per pool because every existing memory mapping is updated, so coalescing into a single driver call lets the toolkit handle the mappings in parallel. The property setter (mr.peer_accessible_by = [...]) preserves its original single-call behavior via the same shared planner path. - Single-element add validates can_access_peer through plan_peer_access_update, matching the existing setter contract. This is a breaking change captured in the v1.0.0 release notes. Callers comparing against tuples must update to set comparisons (mr.peer_accessible_by == {Device(0)}). Existing tests are migrated; new tests for set-interface conformance are intentionally deferred to a follow-up. Co-authored-by: Cursor --- .../core/_memory/_device_memory_resource.pxd | 1 - .../core/_memory/_device_memory_resource.pyx | 172 +++++++++------ .../cuda/core/_memory/_peer_access_utils.py | 205 +++++++++++++++++- cuda_core/docs/source/api_private.rst | 1 + cuda_core/docs/source/release/1.0.0-notes.rst | 13 ++ .../tests/memory_ipc/test_peer_access.py | 18 +- cuda_core/tests/test_memory_peer_access.py | 26 +-- 7 files changed, 347 insertions(+), 89 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd index a7f3bfd9585..0b7cd941e18 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd @@ -9,7 +9,6 @@ from cuda.core._memory._ipc cimport IPCDataForMR cdef class DeviceMemoryResource(_MemPool): cdef: int _dev_id - object _peer_accessible_by cpdef DMR_mempool_get_access(DeviceMemoryResource, int) diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx index 62df3cfb305..48beea80b71 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -25,7 +25,11 @@ import multiprocessing import platform # no-cython-lint import uuid -from ._peer_access_utils import plan_peer_access_update +from ._peer_access_utils import ( + PeerAccessibleBySetProxy, + _resolve_peer_device_id, + plan_peer_access_update, +) from cuda.core._utils.cuda_utils import check_multiprocessing_start_method __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] @@ -131,7 +135,6 @@ cdef class DeviceMemoryResource(_MemPool): def __cinit__(self, *args, **kwargs): self._dev_id = cydriver.CU_DEVICE_INVALID - self._peer_accessible_by = None def __init__(self, device_id: Device | int, options=None): _DMR_init(self, device_id, options) @@ -191,7 +194,6 @@ cdef class DeviceMemoryResource(_MemPool): _ipc.MP_from_allocation_handle(cls, alloc_handle)) from .._device import Device mr._dev_id = Device(device_id).device_id - mr._peer_accessible_by = () return mr @property @@ -213,34 +215,60 @@ cdef class DeviceMemoryResource(_MemPool): @property def peer_accessible_by(self): """ - Get or set the devices that can access allocations from this memory - pool. Access can be modified at any time and affects all allocations + Live driver-backed set view of the devices that can access allocations from this memory pool. - Returns a tuple of sorted device IDs that currently have peer access to - allocations from this memory pool. + Returns a :class:`PeerAccessibleBySetProxy` (a + :class:`collections.abc.MutableSet`) whose reads call + ``cuMemPoolGetAccess`` and whose writes call ``cuMemPoolSetAccess``. + Iteration yields :class:`Device` objects; ``add``, ``discard``, and + ``__contains__`` accept either a :class:`Device` or a device-ordinal + ``int``. There is no in-memory cache, so the view always reflects the + current driver state and stays consistent across multiple wrappers + around the same pool. - When setting, accepts a sequence of :obj:`~_device.Device` objects or device IDs. - Setting to an empty sequence revokes all peer access. + When setting, accepts an iterable of :obj:`~_device.Device` objects or + device IDs. Setting replaces the full set in a single batched driver call. - For non-owned pools (the default or current device pool), the state - is always queried from the driver to reflect changes made by other - wrappers or direct driver calls. + Bulk operations (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``, + and the property setter) each issue exactly one ``cuMemPoolSetAccess`` + call so the toolkit can update existing memory mappings in parallel. Examples -------- >>> dmr = DeviceMemoryResource(0) - >>> dmr.peer_accessible_by = [1] # Grant access to device 1 - >>> assert dmr.peer_accessible_by == (1,) - >>> dmr.peer_accessible_by = [] # Revoke access + >>> dmr.peer_accessible_by.add(1) # grant access to device 1 + >>> assert dmr.peer_accessible_by == {Device(1)} + >>> dmr.peer_accessible_by |= {Device(2)} # batched grant via |= + >>> dmr.peer_accessible_by = [] # revoke all in one call """ - if not self._mempool_owned: - _DMR_query_peer_access(self) - return self._peer_accessible_by + return PeerAccessibleBySetProxy(self) @peer_accessible_by.setter def peer_accessible_by(self, devices): - _DMR_set_peer_accessible_by(self, devices) + _DMR_replace_peer_accessible_by(self, devices) + + def _query_peer_access_ids(self): + """Return the current peer device IDs as a sorted tuple of ints. + + Always queries the driver via ``cuMemPoolGetAccess`` for every visible + device. Used by :class:`PeerAccessibleBySetProxy` for ``__iter__`` and + ``__len__``. + """ + return _DMR_query_peer_access_ids(self) + + def _peer_access_includes(self, int dev_id) -> bool: + """Return True if peer access from ``dev_id`` is currently granted.""" + return _DMR_peer_access_includes(self, dev_id) + + def _apply_peer_access_diff(self, to_add, to_remove): + """Issue a single ``cuMemPoolSetAccess`` for the given add/remove deltas. + + ``to_add`` and ``to_remove`` are iterables of device-ordinal ints. + Both must already be filtered (no owner, no overlap, no duplicates). + Used by :class:`PeerAccessibleBySetProxy` for batched writes. + """ + _DMR_apply_peer_access_diff(self, tuple(to_add), tuple(to_remove)) @property def is_device_accessible(self) -> bool: @@ -253,8 +281,8 @@ cdef class DeviceMemoryResource(_MemPool): return False -cdef inline _DMR_query_peer_access(DeviceMemoryResource self): - """Query the driver for the actual peer access state of this pool.""" +cdef inline tuple _DMR_query_peer_access_ids(DeviceMemoryResource self): + """Return the current peer device IDs as a sorted tuple of ints.""" cdef int total cdef cydriver.CUmemAccess_flags flags cdef cydriver.CUmemLocation location @@ -273,59 +301,74 @@ cdef inline _DMR_query_peer_access(DeviceMemoryResource self): if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE: peers.append(dev_id) - self._peer_accessible_by = tuple(sorted(peers)) + return tuple(sorted(peers)) -cdef inline _DMR_set_peer_accessible_by(DeviceMemoryResource self, devices): - from .._device import Device +cdef inline bint _DMR_peer_access_includes(DeviceMemoryResource self, int dev_id): + """Return True if peer access from ``dev_id`` is currently granted.""" + cdef cydriver.CUmemAccess_flags flags + cdef cydriver.CUmemLocation location - this_dev = Device(self._dev_id) - cdef object resolve_device_id = lambda dev: Device(dev).device_id - cdef object plan - cdef tuple target_ids - cdef tuple to_add - cdef tuple to_rm - if not self._mempool_owned: - _DMR_query_peer_access(self) - plan = plan_peer_access_update( - owner_device_id=self._dev_id, - current_peer_ids=self._peer_accessible_by, - requested_devices=devices, - resolve_device_id=resolve_device_id, - can_access_peer=this_dev.can_access_peer, - ) - target_ids = plan.target_ids - to_add = plan.to_add - to_rm = plan.to_remove - cdef size_t count = len(to_add) + len(to_rm) + location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + location.id = dev_id + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(self._h_pool), &location)) + return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + + +cdef inline _DMR_apply_peer_access_diff( + DeviceMemoryResource self, tuple to_add, tuple to_remove +): + """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas.""" + cdef size_t count = len(to_add) + len(to_remove) cdef cydriver.CUmemAccessDesc* access_desc = NULL cdef size_t i = 0 - if count > 0: - access_desc = PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc)) - if access_desc == NULL: - raise MemoryError("Failed to allocate memory for access descriptors") + if count == 0: + return + + access_desc = PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc)) + if access_desc == NULL: + raise MemoryError("Failed to allocate memory for access descriptors") + + try: + for dev_id in to_add: + access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_desc[i].location.id = dev_id + i += 1 + for dev_id in to_remove: + access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE + access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_desc[i].location.id = dev_id + i += 1 + + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count)) + finally: + if access_desc != NULL: + PyMem_Free(access_desc) - try: - for dev_id in to_add: - access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE - access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id - i += 1 - for dev_id in to_rm: - access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE - access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id - i += 1 +cdef inline _DMR_replace_peer_accessible_by(DeviceMemoryResource self, devices): + """Replace the full peer-access set in a single batched driver call. - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count)) - finally: - if access_desc != NULL: - PyMem_Free(access_desc) + Backs the ``mr.peer_accessible_by = [...]`` setter. Uses the same planner + as the proxy's bulk ops; the only difference is that adds and removes are + derived from the symmetric difference between current driver state and the + requested target set. + """ + from .._device import Device - self._peer_accessible_by = tuple(target_ids) + this_dev = Device(self._dev_id) + plan = plan_peer_access_update( + owner_device_id=self._dev_id, + current_peer_ids=_DMR_query_peer_access_ids(self), + requested_devices=devices, + resolve_device_id=_resolve_peer_device_id, + can_access_peer=this_dev.can_access_peer, + ) + _DMR_apply_peer_access_diff(self, plan.to_add, plan.to_remove) cdef inline _DMR_init(DeviceMemoryResource self, device_id, options): @@ -351,7 +394,6 @@ cdef inline _DMR_init(DeviceMemoryResource self, device_id, options): self._mempool_owned = False MP_raise_release_threshold(self) else: - self._peer_accessible_by = () MP_init_create_pool( self, cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.py b/cuda_core/cuda/core/_memory/_peer_access_utils.py index e08de69f2c7..4589612ff62 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.py +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.py @@ -4,8 +4,13 @@ from __future__ import annotations -from collections.abc import Callable, Iterable +from collections.abc import Callable, Iterable, MutableSet from dataclasses import dataclass +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core._device import Device + from cuda.core._memory._device_memory_resource import DeviceMemoryResource @dataclass(frozen=True) @@ -57,3 +62,201 @@ def plan_peer_access_update( to_add=tuple(sorted(target_id_set - current_ids)), to_remove=tuple(sorted(current_ids - target_id_set)), ) + + +def _resolve_peer_device_id(value: Device | int) -> int: + """Coerce ``Device | int`` into a device-ordinal int.""" + from cuda.core._device import Device + + return Device(value).device_id + + +class PeerAccessibleBySetProxy(MutableSet): + """Live driver-backed view of the peer devices granted access to a memory pool. + + Reads (``__contains__``, ``__iter__``, ``len(...)``) call ``cuMemPoolGetAccess``; + writes (``add``, ``discard``, and bulk ops) call ``cuMemPoolSetAccess``. There + is no in-memory mirror, so the view always reflects the current driver state + and stays consistent across multiple wrappers around the same pool. + + Iteration yields :class:`~cuda.core.Device` objects. ``add``, ``discard``, and + ``__contains__`` accept either a :class:`~cuda.core.Device` or a device-ordinal + ``int``; the owner device is silently ignored when supplied. + + All bulk operations (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``) + issue exactly one ``cuMemPoolSetAccess`` call. This matters: peer-access + transitions can take seconds per pool because every existing memory mapping + is updated, so coalescing into a single driver call lets the toolkit handle + the mappings in parallel. + """ + + __slots__ = ("_mr",) + + def __init__(self, mr: DeviceMemoryResource): + self._mr = mr + + @classmethod + def _from_iterable(cls, it): + # Binary set operators (&, |, -, ^) collect their result through + # _from_iterable. Returning a plain set lets the user reason about + # the result independently of any pool's driver state. + return set(it) + + # --- abstract MutableSet methods --- + + def __contains__(self, value) -> bool: + try: + dev_id = _resolve_peer_device_id(value) + except (TypeError, ValueError): + return False + if dev_id == self._mr._dev_id: + return False + return self._mr._peer_access_includes(dev_id) + + def __iter__(self): + from cuda.core._device import Device + + return iter(Device(dev_id) for dev_id in self._mr._query_peer_access_ids()) + + def __len__(self) -> int: + return len(self._mr._query_peer_access_ids()) + + def add(self, value: Device | int) -> None: + """Grant peer access from ``value`` to allocations in this pool.""" + self._apply([value], ()) + + def discard(self, value: Device | int) -> None: + """Revoke peer access from ``value`` to allocations in this pool.""" + try: + dev_id = _resolve_peer_device_id(value) + except (TypeError, ValueError): + return + self._apply((), [dev_id]) + + # --- bulk overrides: one driver call per op --- + + def clear(self) -> None: + """Revoke all peer access in a single driver call.""" + self._apply((), self._mr._query_peer_access_ids()) + + def update(self, *others) -> None: + """Grant peer access to every device in ``others`` in one driver call.""" + to_add = [] + for other in others: + to_add.extend(other) + if to_add: + self._apply(to_add, ()) + + def difference_update(self, *others) -> None: + """Revoke peer access for every device in ``others`` in one driver call.""" + revoke_ids: set[int] = set() + for other in others: + for value in other: + try: + revoke_ids.add(_resolve_peer_device_id(value)) + except (TypeError, ValueError): + continue + current = set(self._mr._query_peer_access_ids()) + to_remove = revoke_ids & current + if to_remove: + self._apply((), to_remove) + + def intersection_update(self, *others) -> None: + """Restrict peer access to the intersection in a single driver call.""" + keep_ids: set[int] | None = None + for other in others: + ids: set[int] = set() + for value in other: + try: + ids.add(_resolve_peer_device_id(value)) + except (TypeError, ValueError): + continue + keep_ids = ids if keep_ids is None else keep_ids & ids + if keep_ids is None: + return # ``set.intersection_update()`` with no args is a no-op + current = set(self._mr._query_peer_access_ids()) + to_remove = current - keep_ids + if to_remove: + self._apply((), to_remove) + + def symmetric_difference_update(self, other) -> None: + """Toggle peer access for every device in ``other`` in one driver call.""" + toggle_ids: set[int] = set() + for value in other: + try: + toggle_ids.add(_resolve_peer_device_id(value)) + except (TypeError, ValueError): + continue + current = set(self._mr._query_peer_access_ids()) + to_add = toggle_ids - current + to_remove = toggle_ids & current + if to_add or to_remove: + self._apply(to_add, to_remove) + + def __ior__(self, other): + self.update(other) + return self + + def __iand__(self, other): + self.intersection_update(other) + return self + + def __isub__(self, other): + if other is self: + self.clear() + else: + self.difference_update(other) + return self + + def __ixor__(self, other): + self.symmetric_difference_update(other) + return self + + def __repr__(self) -> str: + return f"PeerAccessibleBySetProxy({set(self)!r})" + + # --- internal: route every write through one batched driver call --- + + def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> None: + """Compute the diff and issue a single ``cuMemPoolSetAccess``. + + ``additions`` and ``removals`` are user-supplied (``Device | int``); + only the owner device is filtered out. Adds are validated through + :meth:`Device.can_access_peer` via :func:`plan_peer_access_update`; + removals bypass that check (revoking is always permitted). + """ + from cuda.core._device import Device + + owner_id = self._mr._dev_id + owner = Device(owner_id) + current = self._mr._query_peer_access_ids() + + # Plan additions through the existing helper (validates can_access_peer). + plan = plan_peer_access_update( + owner_device_id=owner_id, + current_peer_ids=current, + # union of (current set + requested adds) so the planner emits + # exactly the to_add deltas for these additions, no removals. + requested_devices=[*current, *additions], + resolve_device_id=_resolve_peer_device_id, + can_access_peer=owner.can_access_peer, + ) + to_add = plan.to_add + + # Removals: resolve, drop owner and unknowns, intersect with current. + current_set = set(current) + revoke_ids: set[int] = set() + for value in removals: + try: + dev_id = _resolve_peer_device_id(value) + except (TypeError, ValueError): + continue + if dev_id == owner_id: + continue + if dev_id in current_set: + revoke_ids.add(dev_id) + to_remove = tuple(sorted(revoke_ids)) + + if not to_add and not to_remove: + return + self._mr._apply_peer_access_diff(to_add, to_remove) diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index 141773967e8..edb113dbd25 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -17,6 +17,7 @@ CUDA runtime :toctree: generated/ typing.DevicePointerT + _memory._peer_access_utils.PeerAccessibleBySetProxy _memory._virtual_memory_resource.VirtualMemoryAllocationTypeT _memory._virtual_memory_resource.VirtualMemoryLocationTypeT _memory._virtual_memory_resource.VirtualMemoryGranularityT diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 3f61a30ec1e..5625cfde879 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -113,6 +113,19 @@ Breaking changes ``CUgraphConditionalHandle`` value. Previously, ``.handle`` had to be extracted explicitly. +- :attr:`DeviceMemoryResource.peer_accessible_by` now returns a live + driver-backed :class:`collections.abc.MutableSet` view + (:class:`~_memory._peer_access_utils.PeerAccessibleBySetProxy`) instead of a + sorted ``tuple[int, ...]``. Iteration yields :obj:`~_device.Device` objects; + ``add``, ``discard``, and ``__contains__`` accept either a + :obj:`~_device.Device` or a device-ordinal ``int``. Callers comparing against + tuples must update to set comparisons (e.g. + ``mr.peer_accessible_by == {Device(0)}``). The property setter + (``mr.peer_accessible_by = [...]``) is unchanged. Bulk operations on the + proxy (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``) each issue + exactly one ``cuMemPoolSetAccess`` call so the toolkit can update existing + memory mappings in parallel. + Fixes and enhancements ----------------------- diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index a3f83986701..f2623dd89e0 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -29,7 +29,7 @@ def test_main(self, mempool_device_x2): options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mr = DeviceMemoryResource(dev1, options=options) mr.peer_accessible_by = [dev0] - assert mr.peer_accessible_by == (0,) + assert mr.peer_accessible_by == {dev0} # Spawn child process process = mp.Process(target=self.child_main, args=(mr,)) @@ -38,18 +38,18 @@ def test_main(self, mempool_device_x2): assert process.exitcode == 0 # Verify parent's MR still has peer access set (independent state) - assert mr.peer_accessible_by == (0,) + assert mr.peer_accessible_by == {dev0} mr.close() def child_main(self, mr): Device(1).set_current() assert mr.is_mapped is True assert mr.device_id == 1 - assert mr.peer_accessible_by == () + assert mr.peer_accessible_by == set() mr.peer_accessible_by = [0] - assert mr.peer_accessible_by == (0,) + assert mr.peer_accessible_by == {Device(0)} mr.peer_accessible_by = [] - assert mr.peer_accessible_by == () + assert mr.peer_accessible_by == set() mr.close() @@ -70,9 +70,9 @@ def test_main(self, mempool_device_x2, grant_access_in_parent): mr = DeviceMemoryResource(dev1, options=options) if grant_access_in_parent: mr.peer_accessible_by = [dev0] - assert mr.peer_accessible_by == (0,) + assert mr.peer_accessible_by == {dev0} else: - assert mr.peer_accessible_by == () + assert mr.peer_accessible_by == set() buffer = mr.allocate(NBYTES) pgen = PatternGen(dev1, NBYTES) pgen.fill_buffer(buffer, seed=False) @@ -108,14 +108,14 @@ def child_main(self, mr, buffer): # Test 3: Set peer access and verify buffer becomes accessible dev1.set_current() mr.peer_accessible_by = [0] - assert mr.peer_accessible_by == (0,) + assert mr.peer_accessible_by == {dev0} dev0.set_current() PatternGen(dev0, NBYTES).verify_buffer(buffer, seed=False) # Test 4: Revoke peer access and verify buffer becomes inaccessible dev1.set_current() mr.peer_accessible_by = [] - assert mr.peer_accessible_by == () + assert mr.peer_accessible_by == set() dev0.set_current() with pytest.raises(CUDAError, match="CUDA_ERROR_INVALID_VALUE"): PatternGen(dev0, NBYTES).verify_buffer(buffer, seed=False) diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index 1a790645864..c73ba3ffd1c 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -1,10 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 import pytest from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer -from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core._utils.cuda_utils import CUDAError NBYTES = 1024 @@ -100,18 +100,18 @@ def verify_state(state, pattern_seed): transitions = [(s0, s1) for s0 in states for s1 in states if s0 != s1] for init_state, final_state in transitions: dmrs[0].peer_accessible_by = init_state - assert dmrs[0].peer_accessible_by == init_state + assert dmrs[0].peer_accessible_by == {Device(i) for i in init_state} verify_state(init_state, pattern_seed) pattern_seed += 1 dmrs[0].peer_accessible_by = final_state - assert dmrs[0].peer_accessible_by == final_state + assert dmrs[0].peer_accessible_by == {Device(i) for i in final_state} verify_state(final_state, pattern_seed) pattern_seed += 1 def test_peer_access_shared_pool_queries_driver(mempool_device_x2): - """Non-owned pools always query the driver for peer access state.""" + """All pools always query the driver, so wrappers see consistent state.""" dev0, dev1 = mempool_device_x2 # Grant peer access via one wrapper; a second wrapper must see it. @@ -122,18 +122,18 @@ def test_peer_access_shared_pool_queries_driver(mempool_device_x2): # Revoke via dmr2; dmr1 must reflect the change immediately. dmr2.peer_accessible_by = [] - assert dmr1.peer_accessible_by == () + assert dmr1.peer_accessible_by == set() # Re-grant via dmr1. A fresh wrapper that has never read the # property must still query the driver before computing diffs # in the setter, so setting [] must discover and revoke the access. dmr1.peer_accessible_by = [dev1] dmr3 = DeviceMemoryResource(dev0) - assert dmr1.peer_accessible_by == (dev1.device_id,) - assert dmr2.peer_accessible_by == (dev1.device_id,) - assert dmr3.peer_accessible_by == (dev1.device_id,) + assert dmr1.peer_accessible_by == {dev1} + assert dmr2.peer_accessible_by == {dev1} + assert dmr3.peer_accessible_by == {dev1} dmr3.peer_accessible_by = [] - assert DeviceMemoryResource(dev0).peer_accessible_by == () - assert dmr1.peer_accessible_by == () - assert dmr2.peer_accessible_by == () - assert dmr3.peer_accessible_by == () + assert DeviceMemoryResource(dev0).peer_accessible_by == set() + assert dmr1.peer_accessible_by == set() + assert dmr2.peer_accessible_by == set() + assert dmr3.peer_accessible_by == set() From e34aacf0176cfe1883df25fb4bcb6c540600d244 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 4 May 2026 15:25:15 -0700 Subject: [PATCH 2/8] cuda.core: move peer-access internals into a single _peer_access_utils.pyx The previous commit left DeviceMemoryResource carrying three pass-through def methods (_query_peer_access_ids, _peer_access_includes, _apply_peer_access_diff) whose only purpose was to give the pure-Python proxy in _peer_access_utils.py a way to call cdef helpers in _device_memory_resource.pyx. These methods served no public role and cluttered the class API. Promote _peer_access_utils.py to a Cython module so the proxy and the driver-touching helpers can live together: - Convert _peer_access_utils.py to _peer_access_utils.pyx. cimports cydriver and DeviceMemoryResource from the .pxd; uses nogil and direct CUmemAccessDesc packing identically to before. - Move _DMR_query_peer_access_ids, _DMR_peer_access_includes, _DMR_apply_peer_access_diff, and _DMR_replace_peer_accessible_by from _device_memory_resource.pyx into the new module as cdef helpers (and a cpdef replace_peer_accessible_by entry point used by the property setter). - Drop the three pass-through def methods from DeviceMemoryResource. The class is left with the property getter and setter only; everything else is module-level in _peer_access_utils. - The proxy now calls the module-level cdef helpers directly instead of routing through methods on mr. No behavior change. The public surface (PeerAccessibleBySetProxy, plan_peer_access_update, normalize_peer_access_targets, PeerAccessPlan) is preserved at the same import paths. Co-authored-by: Cursor --- .../core/_memory/_device_memory_resource.pyx | 151 ++---------------- ...access_utils.py => _peer_access_utils.pyx} | 145 ++++++++++++++--- 2 files changed, 134 insertions(+), 162 deletions(-) rename cuda_core/cuda/core/_memory/{_peer_access_utils.py => _peer_access_utils.pyx} (62%) diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx index 48beea80b71..703e37b7bf0 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -18,18 +18,12 @@ from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, ) -from cpython.mem cimport PyMem_Malloc, PyMem_Free - from dataclasses import dataclass import multiprocessing import platform # no-cython-lint import uuid -from ._peer_access_utils import ( - PeerAccessibleBySetProxy, - _resolve_peer_device_id, - plan_peer_access_update, -) +from ._peer_access_utils import PeerAccessibleBySetProxy, replace_peer_accessible_by from cuda.core._utils.cuda_utils import check_multiprocessing_start_method __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] @@ -215,60 +209,27 @@ cdef class DeviceMemoryResource(_MemPool): @property def peer_accessible_by(self): """ - Live driver-backed set view of the devices that can access allocations + Get or set the devices that can access allocations from this memory + pool. Access can be modified at any time and affects all allocations from this memory pool. - Returns a :class:`PeerAccessibleBySetProxy` (a - :class:`collections.abc.MutableSet`) whose reads call - ``cuMemPoolGetAccess`` and whose writes call ``cuMemPoolSetAccess``. - Iteration yields :class:`Device` objects; ``add``, ``discard``, and - ``__contains__`` accept either a :class:`Device` or a device-ordinal - ``int``. There is no in-memory cache, so the view always reflects the - current driver state and stays consistent across multiple wrappers - around the same pool. - - When setting, accepts an iterable of :obj:`~_device.Device` objects or - device IDs. Setting replaces the full set in a single batched driver call. - - Bulk operations (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``, - and the property setter) each issue exactly one ``cuMemPoolSetAccess`` - call so the toolkit can update existing memory mappings in parallel. + Returns a set-like proxy of :obj:`~_device.Device` objects that manages + peer access. Inputs are accepted as either :obj:`~_device.Device` + objects or device-ordinal :class:`int` values. Examples -------- >>> dmr = DeviceMemoryResource(0) - >>> dmr.peer_accessible_by.add(1) # grant access to device 1 - >>> assert dmr.peer_accessible_by == {Device(1)} - >>> dmr.peer_accessible_by |= {Device(2)} # batched grant via |= - >>> dmr.peer_accessible_by = [] # revoke all in one call + >>> dmr.peer_accessible_by = {1} # grant access to device 1 + >>> assert 1 in dmr.peer_accessible_by + >>> dmr.peer_accessible_by.add(2) # update access to include device 2 + >>> dmr.peer_accessible_by = [] # revoke peer access """ return PeerAccessibleBySetProxy(self) @peer_accessible_by.setter def peer_accessible_by(self, devices): - _DMR_replace_peer_accessible_by(self, devices) - - def _query_peer_access_ids(self): - """Return the current peer device IDs as a sorted tuple of ints. - - Always queries the driver via ``cuMemPoolGetAccess`` for every visible - device. Used by :class:`PeerAccessibleBySetProxy` for ``__iter__`` and - ``__len__``. - """ - return _DMR_query_peer_access_ids(self) - - def _peer_access_includes(self, int dev_id) -> bool: - """Return True if peer access from ``dev_id`` is currently granted.""" - return _DMR_peer_access_includes(self, dev_id) - - def _apply_peer_access_diff(self, to_add, to_remove): - """Issue a single ``cuMemPoolSetAccess`` for the given add/remove deltas. - - ``to_add`` and ``to_remove`` are iterables of device-ordinal ints. - Both must already be filtered (no owner, no overlap, no duplicates). - Used by :class:`PeerAccessibleBySetProxy` for batched writes. - """ - _DMR_apply_peer_access_diff(self, tuple(to_add), tuple(to_remove)) + replace_peer_accessible_by(self, devices) @property def is_device_accessible(self) -> bool: @@ -281,96 +242,6 @@ cdef class DeviceMemoryResource(_MemPool): return False -cdef inline tuple _DMR_query_peer_access_ids(DeviceMemoryResource self): - """Return the current peer device IDs as a sorted tuple of ints.""" - cdef int total - cdef cydriver.CUmemAccess_flags flags - cdef cydriver.CUmemLocation location - cdef list peers = [] - - with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) - - location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - for dev_id in range(total): - if dev_id == self._dev_id: - continue - location.id = dev_id - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(self._h_pool), &location)) - if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE: - peers.append(dev_id) - - return tuple(sorted(peers)) - - -cdef inline bint _DMR_peer_access_includes(DeviceMemoryResource self, int dev_id): - """Return True if peer access from ``dev_id`` is currently granted.""" - cdef cydriver.CUmemAccess_flags flags - cdef cydriver.CUmemLocation location - - location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - location.id = dev_id - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(self._h_pool), &location)) - return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE - - -cdef inline _DMR_apply_peer_access_diff( - DeviceMemoryResource self, tuple to_add, tuple to_remove -): - """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas.""" - cdef size_t count = len(to_add) + len(to_remove) - cdef cydriver.CUmemAccessDesc* access_desc = NULL - cdef size_t i = 0 - - if count == 0: - return - - access_desc = PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc)) - if access_desc == NULL: - raise MemoryError("Failed to allocate memory for access descriptors") - - try: - for dev_id in to_add: - access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE - access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id - i += 1 - for dev_id in to_remove: - access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE - access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id - i += 1 - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count)) - finally: - if access_desc != NULL: - PyMem_Free(access_desc) - - -cdef inline _DMR_replace_peer_accessible_by(DeviceMemoryResource self, devices): - """Replace the full peer-access set in a single batched driver call. - - Backs the ``mr.peer_accessible_by = [...]`` setter. Uses the same planner - as the proxy's bulk ops; the only difference is that adds and removes are - derived from the symmetric difference between current driver state and the - requested target set. - """ - from .._device import Device - - this_dev = Device(self._dev_id) - plan = plan_peer_access_update( - owner_device_id=self._dev_id, - current_peer_ids=_DMR_query_peer_access_ids(self), - requested_devices=devices, - resolve_device_id=_resolve_peer_device_id, - can_access_peer=this_dev.can_access_peer, - ) - _DMR_apply_peer_access_diff(self, plan.to_add, plan.to_remove) - - cdef inline _DMR_init(DeviceMemoryResource self, device_id, options): from .._device import Device cdef int dev_id = Device(device_id).device_id diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.py b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx similarity index 62% rename from cuda_core/cuda/core/_memory/_peer_access_utils.py rename to cuda_core/cuda/core/_memory/_peer_access_utils.pyx index 4589612ff62..22f055dcfb6 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.py +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx @@ -8,9 +8,14 @@ from dataclasses import dataclass from typing import TYPE_CHECKING +from cuda.bindings cimport cydriver +from cuda.core._memory._device_memory_resource cimport DeviceMemoryResource +from cuda.core._resource_handles cimport as_cu +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cpython.mem cimport PyMem_Malloc, PyMem_Free + if TYPE_CHECKING: from cuda.core._device import Device - from cuda.core._memory._device_memory_resource import DeviceMemoryResource @dataclass(frozen=True) @@ -64,13 +69,107 @@ def plan_peer_access_update( ) -def _resolve_peer_device_id(value: Device | int) -> int: +def _resolve_peer_device_id(value): """Coerce ``Device | int`` into a device-ordinal int.""" from cuda.core._device import Device return Device(value).device_id +# ---- driver-touching helpers (cdef inline, called from .pyx code) ----------- + +cdef inline tuple _query_peer_access_ids(DeviceMemoryResource mr): + """Return the current peer device IDs as a sorted tuple of ints.""" + cdef int total + cdef cydriver.CUmemAccess_flags flags + cdef cydriver.CUmemLocation location + cdef list peers = [] + + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) + + location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + for dev_id in range(total): + if dev_id == mr._dev_id: + continue + location.id = dev_id + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(mr._h_pool), &location)) + if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE: + peers.append(dev_id) + + return tuple(sorted(peers)) + + +cdef inline bint _peer_access_includes(DeviceMemoryResource mr, int dev_id): + """Return True if peer access from ``dev_id`` is currently granted.""" + cdef cydriver.CUmemAccess_flags flags + cdef cydriver.CUmemLocation location + + location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + location.id = dev_id + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(mr._h_pool), &location)) + return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + + +cdef inline _apply_peer_access_diff( + DeviceMemoryResource mr, tuple to_add, tuple to_remove +): + """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas.""" + cdef size_t count = len(to_add) + len(to_remove) + cdef cydriver.CUmemAccessDesc* access_desc = NULL + cdef size_t i = 0 + + if count == 0: + return + + access_desc = PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc)) + if access_desc == NULL: + raise MemoryError("Failed to allocate memory for access descriptors") + + try: + for dev_id in to_add: + access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_desc[i].location.id = dev_id + i += 1 + for dev_id in to_remove: + access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE + access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_desc[i].location.id = dev_id + i += 1 + + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(mr._h_pool), access_desc, count)) + finally: + if access_desc != NULL: + PyMem_Free(access_desc) + + +cpdef replace_peer_accessible_by(DeviceMemoryResource mr, devices): + """Replace the full peer-access set in a single batched driver call. + + Backs the ``mr.peer_accessible_by = [...]`` setter. Uses the same planner + as the proxy's bulk ops; the only difference is that adds and removes are + derived from the symmetric difference between current driver state and the + requested target set. + """ + from cuda.core._device import Device + + this_dev = Device(mr._dev_id) + plan = plan_peer_access_update( + owner_device_id=mr._dev_id, + current_peer_ids=_query_peer_access_ids(mr), + requested_devices=devices, + resolve_device_id=_resolve_peer_device_id, + can_access_peer=this_dev.can_access_peer, + ) + _apply_peer_access_diff(mr, plan.to_add, plan.to_remove) + + +# ---- Python MutableSet proxy ------------------------------------------------ + class PeerAccessibleBySetProxy(MutableSet): """Live driver-backed view of the peer devices granted access to a memory pool. @@ -92,7 +191,7 @@ class PeerAccessibleBySetProxy(MutableSet): __slots__ = ("_mr",) - def __init__(self, mr: DeviceMemoryResource): + def __init__(self, mr): self._mr = mr @classmethod @@ -109,23 +208,24 @@ def __contains__(self, value) -> bool: dev_id = _resolve_peer_device_id(value) except (TypeError, ValueError): return False - if dev_id == self._mr._dev_id: + cdef DeviceMemoryResource mr = self._mr + if dev_id == mr._dev_id: return False - return self._mr._peer_access_includes(dev_id) + return _peer_access_includes(mr, dev_id) def __iter__(self): from cuda.core._device import Device - return iter(Device(dev_id) for dev_id in self._mr._query_peer_access_ids()) + return iter(Device(dev_id) for dev_id in _query_peer_access_ids(self._mr)) def __len__(self) -> int: - return len(self._mr._query_peer_access_ids()) + return len(_query_peer_access_ids(self._mr)) - def add(self, value: Device | int) -> None: + def add(self, value) -> None: """Grant peer access from ``value`` to allocations in this pool.""" self._apply([value], ()) - def discard(self, value: Device | int) -> None: + def discard(self, value) -> None: """Revoke peer access from ``value`` to allocations in this pool.""" try: dev_id = _resolve_peer_device_id(value) @@ -137,7 +237,7 @@ def discard(self, value: Device | int) -> None: def clear(self) -> None: """Revoke all peer access in a single driver call.""" - self._apply((), self._mr._query_peer_access_ids()) + self._apply((), _query_peer_access_ids(self._mr)) def update(self, *others) -> None: """Grant peer access to every device in ``others`` in one driver call.""" @@ -149,23 +249,23 @@ def update(self, *others) -> None: def difference_update(self, *others) -> None: """Revoke peer access for every device in ``others`` in one driver call.""" - revoke_ids: set[int] = set() + revoke_ids = set() for other in others: for value in other: try: revoke_ids.add(_resolve_peer_device_id(value)) except (TypeError, ValueError): continue - current = set(self._mr._query_peer_access_ids()) + current = set(_query_peer_access_ids(self._mr)) to_remove = revoke_ids & current if to_remove: self._apply((), to_remove) def intersection_update(self, *others) -> None: """Restrict peer access to the intersection in a single driver call.""" - keep_ids: set[int] | None = None + keep_ids = None for other in others: - ids: set[int] = set() + ids = set() for value in other: try: ids.add(_resolve_peer_device_id(value)) @@ -174,20 +274,20 @@ def intersection_update(self, *others) -> None: keep_ids = ids if keep_ids is None else keep_ids & ids if keep_ids is None: return # ``set.intersection_update()`` with no args is a no-op - current = set(self._mr._query_peer_access_ids()) + current = set(_query_peer_access_ids(self._mr)) to_remove = current - keep_ids if to_remove: self._apply((), to_remove) def symmetric_difference_update(self, other) -> None: """Toggle peer access for every device in ``other`` in one driver call.""" - toggle_ids: set[int] = set() + toggle_ids = set() for value in other: try: toggle_ids.add(_resolve_peer_device_id(value)) except (TypeError, ValueError): continue - current = set(self._mr._query_peer_access_ids()) + current = set(_query_peer_access_ids(self._mr)) to_add = toggle_ids - current to_remove = toggle_ids & current if to_add or to_remove: @@ -217,7 +317,7 @@ def __repr__(self) -> str: # --- internal: route every write through one batched driver call --- - def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> None: + def _apply(self, additions, removals) -> None: """Compute the diff and issue a single ``cuMemPoolSetAccess``. ``additions`` and ``removals`` are user-supplied (``Device | int``); @@ -227,9 +327,10 @@ def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> Non """ from cuda.core._device import Device - owner_id = self._mr._dev_id + cdef DeviceMemoryResource mr = self._mr + owner_id = mr._dev_id owner = Device(owner_id) - current = self._mr._query_peer_access_ids() + current = _query_peer_access_ids(mr) # Plan additions through the existing helper (validates can_access_peer). plan = plan_peer_access_update( @@ -245,7 +346,7 @@ def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> Non # Removals: resolve, drop owner and unknowns, intersect with current. current_set = set(current) - revoke_ids: set[int] = set() + revoke_ids = set() for value in removals: try: dev_id = _resolve_peer_device_id(value) @@ -259,4 +360,4 @@ def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> Non if not to_add and not to_remove: return - self._mr._apply_peer_access_diff(to_add, to_remove) + _apply_peer_access_diff(mr, to_add, to_remove) From 123e057276abaaea32986e3aa090f52f8be4a71e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 4 May 2026 16:30:10 -0700 Subject: [PATCH 3/8] cuda.core: tighten peer-access query loop and 1.0.0 release note Refactor _query_peer_access_ids so the entire driver loop runs inside a single nogil block instead of acquiring/releasing the GIL once per device. The flag query now uses a cached as_cu(mr._h_pool) handle and fills a libcpp.vector[int]; because range(total) ascends, the result is already sorted and the trailing sorted() call is dropped. Also tighten the peer_accessible_by entry in 1.0.0-notes.rst: the breaking-change blurb only needs to state the type/element change, so remove the implementation-flavored details about input acceptance and batched cuMemPoolSetAccess calls. Co-authored-by: Cursor --- .../cuda/core/_memory/_peer_access_utils.pyx | 35 ++++++++++++------- cuda_core/docs/source/release/1.0.0-notes.rst | 15 ++------ 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx index 22f055dcfb6..e613a545b3d 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx @@ -13,6 +13,7 @@ from cuda.core._memory._device_memory_resource cimport DeviceMemoryResource from cuda.core._resource_handles cimport as_cu from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cpython.mem cimport PyMem_Malloc, PyMem_Free +from libcpp.vector cimport vector if TYPE_CHECKING: from cuda.core._device import Device @@ -79,26 +80,34 @@ def _resolve_peer_device_id(value): # ---- driver-touching helpers (cdef inline, called from .pyx code) ----------- cdef inline tuple _query_peer_access_ids(DeviceMemoryResource mr): - """Return the current peer device IDs as a sorted tuple of ints.""" + """Return the current peer device IDs as a sorted tuple of ints. + + The full driver loop runs inside a single ``nogil`` block. Because + ``range(total)`` ascends, the result is already sorted. + """ cdef int total + cdef int dev_id + cdef int owner_id = mr._dev_id cdef cydriver.CUmemAccess_flags flags cdef cydriver.CUmemLocation location - cdef list peers = [] + cdef cydriver.CUmemoryPool h_pool = as_cu(mr._h_pool) + cdef vector[int] peers + cdef size_t i, n + + location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE with nogil: HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) + for dev_id in range(total): + if dev_id == owner_id: + continue + location.id = dev_id + HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, h_pool, &location)) + if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE: + peers.push_back(dev_id) - location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - for dev_id in range(total): - if dev_id == mr._dev_id: - continue - location.id = dev_id - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(mr._h_pool), &location)) - if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE: - peers.append(dev_id) - - return tuple(sorted(peers)) + n = peers.size() + return tuple(peers[i] for i in range(n)) cdef inline bint _peer_access_includes(DeviceMemoryResource mr, int dev_id): diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 5625cfde879..7d0f3cf50b4 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -113,18 +113,9 @@ Breaking changes ``CUgraphConditionalHandle`` value. Previously, ``.handle`` had to be extracted explicitly. -- :attr:`DeviceMemoryResource.peer_accessible_by` now returns a live - driver-backed :class:`collections.abc.MutableSet` view - (:class:`~_memory._peer_access_utils.PeerAccessibleBySetProxy`) instead of a - sorted ``tuple[int, ...]``. Iteration yields :obj:`~_device.Device` objects; - ``add``, ``discard``, and ``__contains__`` accept either a - :obj:`~_device.Device` or a device-ordinal ``int``. Callers comparing against - tuples must update to set comparisons (e.g. - ``mr.peer_accessible_by == {Device(0)}``). The property setter - (``mr.peer_accessible_by = [...]``) is unchanged. Bulk operations on the - proxy (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``) each issue - exactly one ``cuMemPoolSetAccess`` call so the toolkit can update existing - memory mappings in parallel. +- :attr:`DeviceMemoryResource.peer_accessible_by` now returns a + :class:`collections.abc.MutableSet` of :obj:`~_device.Device` objects instead + of a sorted ``tuple[int, ...]``. The property setter is unchanged. Fixes and enhancements From 3e9c5dd7699cadbdec0c20999ebbcd8f6ac16d9e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 4 May 2026 16:52:42 -0700 Subject: [PATCH 4/8] cuda.core: cover PeerAccessibleBySetProxy interface, batching, and edge cases Existing peer-access tests covered the integration path well (real copies across peers, the full transition matrix, shared-pool consistency) but only touched ``in``, ``==``, and the property setter on the new set proxy. After the v1.0.0 break that surfaced ~25 ``MutableSet`` methods, nothing was pinning the type-coercion contract, the owner-filtering behavior, the ``KeyError``/value-error paths, or the "one ``cuMemPoolSetAccess`` per bulk op" performance invariant. Add the following coverage in ``test_memory_peer_access.py``: - A ``MutableSet`` conformance test using a relaxed ``assert_mutable_set_interface`` mode that admits subjects holding at most one insertable element. CI maxes at two GPUs (one peer), so the multi-element protocol pass cannot run there. The new ``support_multi_insert=False`` path takes one insertable item plus two non-member sentinels and exercises every ``MutableSet`` method (``add``/``discard``/``remove``/``pop``/``clear``/``update``, comparisons, isdisjoint, subset/superset, binary and in-place operators, ``__iter__``/``__len__``/``__repr__``). - ``Device``/``int`` interchangeability on ``add``/``discard``/``__contains__``. - The owner-device filtering contract on every write (silent no-op). - Error paths: ``add(out_of_range)`` and ``add(non_coercible)`` raise while the lenient ``discard``/``__contains__`` paths swallow the same inputs; ``remove(non_member)`` raises ``KeyError``. - "Live driver view" semantics: a proxy obtained before another wrapper modifies the pool reflects the change with no refresh step. - ``__iter__`` ordering is ascending by ``device_id`` and elements are ``Device`` instances; ``__repr__`` includes the class name and tracks live contents; the getter returns the documented proxy type. - A batching spy that monkeypatches the module-level ``_apply_peer_access_diff`` and asserts that every bulk op (``|=``/``&=``/``-=``/``^=``/``update``/``difference_update``/ ``clear``) and the property setter issues at most one driver call, zero when the diff is empty. To make the spy possible, ``_apply_peer_access_diff`` is now a Python-visible ``def`` wrapper around a renamed ``_apply_peer_access_diff_cython`` ``cdef inline``. The proxy and the property setter still call ``_apply_peer_access_diff`` by bare name, which Cython resolves through the module's globals at runtime, so a ``monkeypatch.setattr(_peer_access_utils, "_apply_peer_access_diff", ...)`` intercepts them. The extra Python-level dispatch is negligible next to ``cuMemPoolSetAccess`` itself. Co-authored-by: Cursor --- .../cuda/core/_memory/_peer_access_utils.pyx | 13 +- .../helpers/collection_interface_testers.py | 197 +++++++++++++- cuda_core/tests/test_memory_peer_access.py | 253 +++++++++++++++++- 3 files changed, 458 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx index e613a545b3d..b1f92452f4b 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx @@ -122,7 +122,7 @@ cdef inline bint _peer_access_includes(DeviceMemoryResource mr, int dev_id): return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE -cdef inline _apply_peer_access_diff( +cdef inline _apply_peer_access_diff_cython( DeviceMemoryResource mr, tuple to_add, tuple to_remove ): """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas.""" @@ -156,6 +156,17 @@ cdef inline _apply_peer_access_diff( PyMem_Free(access_desc) +def _apply_peer_access_diff(mr, to_add, to_remove): + """Python-visible wrapper around the cdef inline implementation. + + Every write path on :class:`PeerAccessibleBySetProxy` and the + ``peer_accessible_by`` setter route through this function so tests can + intercept it via ``monkeypatch.setattr`` on the module to assert the + "exactly one ``cuMemPoolSetAccess`` per bulk op" batching contract. + """ + _apply_peer_access_diff_cython(mr, tuple(to_add), tuple(to_remove)) + + cpdef replace_peer_accessible_by(DeviceMemoryResource mr, devices): """Replace the full peer-access set in a single batched driver call. diff --git a/cuda_core/tests/helpers/collection_interface_testers.py b/cuda_core/tests/helpers/collection_interface_testers.py index 5197e475c18..d719c47aa3f 100644 --- a/cuda_core/tests/helpers/collection_interface_testers.py +++ b/cuda_core/tests/helpers/collection_interface_testers.py @@ -8,17 +8,50 @@ import pytest -def assert_mutable_set_interface(subject, items): +def assert_mutable_set_interface( + subject, + items, + *, + non_members=None, + support_multi_insert=True, +): """Exercise every MutableSet method on *subject* against a reference set. + Two modes are supported: + + - ``support_multi_insert=True`` (default): the standard protocol pass that + inserts up to five distinct elements simultaneously. Use this whenever + the subject can hold an arbitrary number of items. + - ``support_multi_insert=False``: a reduced pass for proxies whose backing + store admits at most one insertable element at a time (e.g. a peer-access + view on a 2-GPU system). The subject only ever holds ``{}`` or ``{a}``, + and ``non_members`` supplies sentinel values used as the *other* side of + comparisons, ``isdisjoint``, subset/superset, and binary/in-place + operators. Every ``MutableSet`` method is still exercised at least once. + Parameters ---------- subject : MutableSet An **empty** mutable-set-like object to test. items : sequence - At least five distinct, hashable objects valid for insertion into - *subject*. + Distinct, hashable objects valid for insertion into *subject*. With + ``support_multi_insert=True`` at least five are required; with + ``support_multi_insert=False`` exactly one is needed (extras ignored). + non_members : sequence, optional + Distinct, hashable values that compare equal across set semantics but + are guaranteed *never* to be inserted into *subject* (typically because + the backing store rejects them). Required and used only when + ``support_multi_insert=False``; at least two are needed there. """ + if support_multi_insert: + _assert_mutable_set_interface_multi(subject, items) + else: + if non_members is None: + raise TypeError("non_members is required when support_multi_insert=False") + _assert_mutable_set_interface_single(subject, items, non_members) + + +def _assert_mutable_set_interface_multi(subject, items): assert len(items) >= 5 a, b, c, d, e = items[:5] ref = set() @@ -140,3 +173,161 @@ def assert_mutable_set_interface(subject, items): r = repr(subject) assert isinstance(r, str) assert len(r) > 0 + + +def _assert_mutable_set_interface_single(subject, items, non_members): + """Reduced protocol pass for subjects that admit at most one insertable element. + + Invariants: + - ``a`` is the lone insertable element; the subject only ever holds ``{}`` + or ``{a}``. + - ``x``, ``y`` are sentinels guaranteed *not* to be inserted; they appear + only on the right-hand side of operators and comparisons. They must + compare correctly under set semantics (i.e. equality and hashing). + """ + assert len(items) >= 1 + assert len(non_members) >= 2 + a = items[0] + x, y = non_members[:2] + assert a != x and a != y and x != y, "items[0] and non_members[:2] must be distinct" + + ref = set() + + # -- ABC conformance -- + assert isinstance(subject, Set) + assert isinstance(subject, MutableSet) + + # -- empty state -- + assert len(subject) == 0 + assert subject == ref + assert subject == set() + assert list(subject) == [] + + # -- add -- + subject.add(a) + ref.add(a) + assert subject == ref + assert a in subject + assert x not in subject + assert len(subject) == 1 + + # add duplicate is a no-op + subject.add(a) + assert subject == ref + assert len(subject) == 1 + + # -- comparison with plain set -- + assert subject == {a} + assert subject != {a, x} + assert subject != set() + + # -- isdisjoint -- + assert subject.isdisjoint({x, y}) + assert not subject.isdisjoint({a, x}) + + # -- subset / superset -- + assert subject <= {a} + assert subject <= {a, x} + assert not (subject <= set()) + assert subject < {a, x} + assert not (subject < {a}) + assert {a, x} >= subject + assert {a, x} > subject + + # -- binary operators (results are plain sets, never insert into subject) -- + assert subject & {a, x} == {a} + assert subject & {x} == set() + assert subject | {x} == {a, x} + assert subject - {a} == set() + assert subject - {x} == {a} + assert subject ^ {x} == {a, x} + assert subject ^ {a} == set() + + # -- discard non-member is a no-op -- + subject.discard(x) + assert subject == ref + + # -- discard member -- + subject.discard(a) + ref.discard(a) + assert subject == ref + + # -- remove member -- + subject.add(a) + ref.add(a) + subject.remove(a) + ref.remove(a) + assert subject == ref + + # -- remove non-member raises -- + with pytest.raises(KeyError): + subject.remove(x) + + # -- pop empty raises -- + with pytest.raises(KeyError): + subject.pop() + + # -- pop populated -- + subject.add(a) + ref.add(a) + popped = subject.pop() + ref.discard(popped) + assert popped == a + assert popped not in subject + assert subject == ref + + # -- in-place union (|=) covers single insert via bulk path -- + subject |= {a} + ref |= {a} + assert subject == ref + + # -- in-place intersection (&=) keeps the lone member -- + subject &= {a, x} + ref &= {a, x} + assert subject == ref + + # -- in-place intersection (&=) drops the lone member -- + subject &= {x, y} + ref &= {x, y} + assert subject == ref + + # -- in-place union via bulk path again, ahead of -= and ^= -- + subject |= {a} + ref |= {a} + assert subject == ref + + # -- in-place difference (-=) on non-member is a no-op -- + subject -= {x} + ref -= {x} + assert subject == ref + + # -- in-place difference (-=) on member empties the subject -- + subject -= {a} + ref -= {a} + assert subject == ref + + # -- in-place symmetric difference (^=): toggle in then out -- + subject ^= {a} + ref ^= {a} + assert subject == ref + subject ^= {a} + ref ^= {a} + assert subject == ref + + # -- clear -- + subject.add(a) + ref.add(a) + subject.clear() + ref.clear() + assert subject == ref + assert len(subject) == 0 + + # -- __iter__ on populated subject -- + subject.add(a) + ref.add(a) + assert set(subject) == ref + + # -- __repr__ -- + r = repr(subject) + assert isinstance(r, str) + assert len(r) > 0 diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index c73ba3ffd1c..94517004a77 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -3,8 +3,11 @@ import pytest from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer +from helpers.collection_interface_testers import assert_mutable_set_interface -from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, system +from cuda.core._memory import _peer_access_utils +from cuda.core._memory._peer_access_utils import PeerAccessibleBySetProxy from cuda.core._utils.cuda_utils import CUDAError NBYTES = 1024 @@ -137,3 +140,251 @@ def test_peer_access_shared_pool_queries_driver(mempool_device_x2): assert dmr1.peer_accessible_by == set() assert dmr2.peer_accessible_by == set() assert dmr3.peer_accessible_by == set() + + +# --------------------------------------------------------------------------- +# Set-proxy interface coverage +# +# These tests exercise the ``PeerAccessibleBySetProxy`` surface added in +# v1.0.0. They run against ``mempool_device_x2`` because every CI machine has +# at most 2 GPUs, which means at most one valid peer device. The relaxed +# ``support_multi_insert=False`` path on ``assert_mutable_set_interface`` +# threads that single insertable element through the full ``MutableSet`` +# protocol. +# --------------------------------------------------------------------------- + + +@pytest.fixture +def isolated_dmr_x2(mempool_device_x2): + """Owned-pool DMR on dev0 + the lone peer device (dev1). + + The owned pool guarantees a clean, empty initial peer-access state so the + proxy tests are not polluted by other tests sharing a default pool. + """ + dev0, dev1 = mempool_device_x2 + dmr = DeviceMemoryResource(dev0, DeviceMemoryResourceOptions()) + dmr.peer_accessible_by = [] + return dmr, dev0, dev1 + + +def test_peer_accessible_by_mutable_set_interface(isolated_dmr_x2): + """Run the full MutableSet protocol against a single-peer driver-backed view. + + On a 2-GPU box the proxy can only ever hold ``{dev1}``. The relaxed helper + uses ``dev0`` (the owner, which the proxy refuses to insert) as the + non-member sentinel; this exercises every ``MutableSet`` method while + respecting the proxy's "insert at most one" hardware reality. + """ + dmr, dev0, dev1 = isolated_dmr_x2 + assert_mutable_set_interface( + dmr.peer_accessible_by, + items=[dev1], + non_members=[dev0, dev0.device_id], + support_multi_insert=False, + ) + + +def test_peer_accessible_by_accepts_int_and_device(isolated_dmr_x2): + """``add``/``discard``/``__contains__`` accept ``Device`` and ``int`` interchangeably.""" + dmr, dev0, dev1 = isolated_dmr_x2 + + dmr.peer_accessible_by.add(dev1.device_id) + assert dmr.peer_accessible_by == {dev1} + assert dev1 in dmr.peer_accessible_by + assert dev1.device_id in dmr.peer_accessible_by + + dmr.peer_accessible_by.discard(dev1) + assert dmr.peer_accessible_by == set() + + dmr.peer_accessible_by.add(dev1) + assert dmr.peer_accessible_by == {dev1} + dmr.peer_accessible_by.discard(dev1.device_id) + assert dmr.peer_accessible_by == set() + + +def test_peer_accessible_by_silently_ignores_owner(isolated_dmr_x2): + """The owner device is silently filtered on every write; ``__contains__`` returns False.""" + dmr, dev0, dev1 = isolated_dmr_x2 + + # add/discard on owner is a no-op (no error, no state change) + dmr.peer_accessible_by.add(dev0) + dmr.peer_accessible_by.add(dev0.device_id) + assert dmr.peer_accessible_by == set() + dmr.peer_accessible_by.discard(dev0) + dmr.peer_accessible_by.discard(dev0.device_id) + assert dmr.peer_accessible_by == set() + + # __contains__ on owner is False (matches set semantics, never raises) + assert dev0 not in dmr.peer_accessible_by + assert dev0.device_id not in dmr.peer_accessible_by + + # Owner mixed into bulk ops is filtered, the peer is still added/removed + dmr.peer_accessible_by |= {dev0, dev1} + assert dmr.peer_accessible_by == {dev1} + dmr.peer_accessible_by -= {dev0, dev1} + assert dmr.peer_accessible_by == set() + + +def test_peer_accessible_by_rejects_invalid_inputs(isolated_dmr_x2): + """``add`` raises on out-of-range/unsupported inputs; lenient methods do not.""" + dmr, dev0, dev1 = isolated_dmr_x2 + bad_id = system.get_num_devices() # one past the last valid device ordinal + + # add: validates strictly, propagates errors from Device(bad_id) + with pytest.raises((ValueError, CUDAError)): + dmr.peer_accessible_by.add(bad_id) + # Non-coercible inputs surface whatever Device(value) raises (TypeError or + # ValueError depending on Cython's int coercion path). + with pytest.raises((TypeError, ValueError)): + dmr.peer_accessible_by.add("not-a-device") + + # discard: silently ignores non-coercible values (matches set.discard) + dmr.peer_accessible_by.discard("not-a-device") + assert dmr.peer_accessible_by == set() + + # __contains__: returns False on non-coercible values, never raises + assert "not-a-device" not in dmr.peer_accessible_by + + # remove on a non-member raises KeyError (inherited from MutableSet) + with pytest.raises(KeyError): + dmr.peer_accessible_by.remove(dev1) + + +def test_peer_accessible_by_no_cache_across_proxies(mempool_device_x2): + """Updates via one wrapper are immediately visible through any other proxy.""" + dev0, dev1 = mempool_device_x2 + dmr_a = DeviceMemoryResource(dev0) + dmr_b = DeviceMemoryResource(dev0) + dmr_a.peer_accessible_by = [] + + proxy = dmr_a.peer_accessible_by # acquired before the change below + dmr_b.peer_accessible_by.add(dev1) + # The proxy must reflect the new driver state, not a snapshot. + assert dev1 in proxy + assert proxy == {dev1} + + dmr_b.peer_accessible_by.clear() + assert proxy == set() + + +def test_peer_accessible_by_iteration_order_is_sorted(mempool_device_x2): + """``__iter__`` yields peers in ascending device-ordinal order.""" + dev0, dev1 = mempool_device_x2 + dmr = DeviceMemoryResource(dev0, DeviceMemoryResourceOptions()) + dmr.peer_accessible_by = [dev1] + devices = list(dmr.peer_accessible_by) + ids = [d.device_id for d in devices] + assert ids == sorted(ids) + assert all(isinstance(d, Device) for d in devices) + + +def test_peer_accessible_by_repr(isolated_dmr_x2): + """``repr`` includes the class name and reflects the live contents.""" + dmr, dev0, dev1 = isolated_dmr_x2 + empty_repr = repr(dmr.peer_accessible_by) + assert "PeerAccessibleBySetProxy" in empty_repr + assert "set()" in empty_repr + + dmr.peer_accessible_by.add(dev1) + populated_repr = repr(dmr.peer_accessible_by) + assert "PeerAccessibleBySetProxy" in populated_repr + # Don't pin the exact device repr; just confirm content changed. + assert populated_repr != empty_repr + + +def test_peer_accessible_by_returns_proxy_type(isolated_dmr_x2): + """The getter returns the documented proxy type (anchors the public contract).""" + dmr, dev0, dev1 = isolated_dmr_x2 + assert isinstance(dmr.peer_accessible_by, PeerAccessibleBySetProxy) + + +# --------------------------------------------------------------------------- +# Batching contract: every bulk op must issue at most one cuMemPoolSetAccess +# +# Spying via ``monkeypatch.setattr`` on the module-level +# ``_apply_peer_access_diff`` works because the proxy and the property setter +# call it by bare name, which Cython resolves through the module's globals at +# runtime (the wrapper is a plain ``def``, not a ``cdef inline``). +# --------------------------------------------------------------------------- + + +class _DiffSpy: + """Counts batched driver calls and forwards each call to the real wrapper.""" + + def __init__(self, real): + self._real = real + self.calls = [] + + def __call__(self, mr, to_add, to_remove): + # Materialize tuples once so the recorded args reflect what the proxy + # passed (they may be sets / lists of mixed Device/int upstream). + recorded_add = tuple(to_add) + recorded_remove = tuple(to_remove) + self.calls.append((recorded_add, recorded_remove)) + self._real(mr, to_add, to_remove) + + +@pytest.fixture +def diff_spy(monkeypatch): + spy = _DiffSpy(_peer_access_utils._apply_peer_access_diff) + monkeypatch.setattr(_peer_access_utils, "_apply_peer_access_diff", spy) + return spy + + +def test_peer_accessible_by_setter_batches_one_call(diff_spy, isolated_dmr_x2): + """``mr.peer_accessible_by = [...]`` issues exactly one driver call (or zero on no-op).""" + dmr, dev0, dev1 = isolated_dmr_x2 + dmr.peer_accessible_by = [dev1] + assert len(diff_spy.calls) == 1 + assert dev1.device_id in diff_spy.calls[-1][0] + + # Reassigning the same set is a no-op (zero driver calls). + diff_spy.calls.clear() + dmr.peer_accessible_by = [dev1] + assert diff_spy.calls == [] + + # Revoking everything is a single call (one removal). + dmr.peer_accessible_by = [] + assert len(diff_spy.calls) == 1 + assert dev1.device_id in diff_spy.calls[-1][1] + + +def test_peer_accessible_by_bulk_ops_batch_one_call(diff_spy, isolated_dmr_x2): + """``|=``, ``&=``, ``-=``, ``^=``, ``clear``, ``update`` each issue at most one call.""" + dmr, dev0, dev1 = isolated_dmr_x2 + + dmr.peer_accessible_by |= {dev1} + assert len(diff_spy.calls) == 1 + diff_spy.calls.clear() + + # &= keeping the lone member: no driver call (no diff). + dmr.peer_accessible_by &= {dev1} + assert diff_spy.calls == [] + + # &= dropping the lone member: one removal. + dmr.peer_accessible_by &= {dev0} + assert len(diff_spy.calls) == 1 + diff_spy.calls.clear() + + # ^= toggling the lone peer in then out: two ops, one call each. + dmr.peer_accessible_by ^= {dev1} + assert len(diff_spy.calls) == 1 + dmr.peer_accessible_by ^= {dev1} + assert len(diff_spy.calls) == 2 + diff_spy.calls.clear() + + # update() with the peer already absent: one add. + dmr.peer_accessible_by.update([dev1]) + assert len(diff_spy.calls) == 1 + diff_spy.calls.clear() + + # clear() with one member: one removal. + dmr.peer_accessible_by.clear() + assert len(diff_spy.calls) == 1 + diff_spy.calls.clear() + + # Already-empty bulk ops are no-ops (nothing to add or remove). + dmr.peer_accessible_by.clear() + dmr.peer_accessible_by.difference_update([dev1]) + dmr.peer_accessible_by -= {dev1} + assert diff_spy.calls == [] From 0e4a0832df45dd8f52b4c848ace4e4209048462d Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 4 May 2026 17:04:15 -0700 Subject: [PATCH 5/8] cuda.core: filter empty-delta calls in peer-access batching spy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Augmented assignment on the ``peer_accessible_by`` property (``dmr.peer_accessible_by |= {...}``) is two trips through the proxy/setter pair, not one: Python fetches the proxy, the proxy mutates itself in place via ``__ior__``, and Python then assigns the (already-mutated) proxy back through the setter. That trailing setter call computes the diff against current driver state, finds it empty, and short-circuits inside the ``cdef inline`` before issuing any ``cuMemPoolSetAccess`` work — so the *driver-level* contract ("one batched call per bulk op") still holds, but the wrapper is invoked twice, which the spy was counting. Also, the fixture's ``dmr.peer_accessible_by = []`` reset on an already empty pool is itself an empty-delta wrapper call. Filter the recorded calls down to those with non-empty deltas (the ones that translate to real driver work) and switch the bulk-ops test to use a locally bound proxy so augmented assignment goes through ``__ior__`` once with no extra setter invocation. The setter test stays on ``dmr.peer_accessible_by = ...`` because that is the public API contract under test there. Co-authored-by: Cursor --- cuda_core/tests/test_memory_peer_access.py | 51 +++++++++++++++------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index 94517004a77..b5e0e47dffe 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -309,18 +309,26 @@ def test_peer_accessible_by_returns_proxy_type(isolated_dmr_x2): class _DiffSpy: - """Counts batched driver calls and forwards each call to the real wrapper.""" + """Counts batched driver calls and forwards each call to the real wrapper. + + Only invocations with non-empty deltas are recorded in :attr:`calls`, + because those are the ones that translate to actual ``cuMemPoolSetAccess`` + work. The wrapper still gets called with empty deltas in a few places + (e.g. the property setter when the requested target matches current + driver state), but the underlying ``cdef inline`` short-circuits before + issuing the driver call, so those invocations do not count against the + "one call per bulk op" contract. + """ def __init__(self, real): self._real = real self.calls = [] def __call__(self, mr, to_add, to_remove): - # Materialize tuples once so the recorded args reflect what the proxy - # passed (they may be sets / lists of mixed Device/int upstream). recorded_add = tuple(to_add) recorded_remove = tuple(to_remove) - self.calls.append((recorded_add, recorded_remove)) + if recorded_add or recorded_remove: + self.calls.append((recorded_add, recorded_remove)) self._real(mr, to_add, to_remove) @@ -350,41 +358,54 @@ def test_peer_accessible_by_setter_batches_one_call(diff_spy, isolated_dmr_x2): def test_peer_accessible_by_bulk_ops_batch_one_call(diff_spy, isolated_dmr_x2): - """``|=``, ``&=``, ``-=``, ``^=``, ``clear``, ``update`` each issue at most one call.""" + """``|=``/``&=``/``-=``/``^=``/``update``/``clear`` each issue at most one driver call. + + .. note:: + ``proxy = dmr.peer_accessible_by; proxy |= {...}`` is exactly one driver + call. Augmented assignment directly on the property + (``dmr.peer_accessible_by |= {...}``) is two operations because Python + fetches the proxy, mutates it, and then assigns the (already-mutated) + proxy back through the setter; that final assignment is a no-op at the + driver level (deltas come out empty), but it is two trips through the + proxy/setter machinery. The spy filters out empty-delta calls so the + invariant under test is "actual driver work" rather than that + Python-level quirk. + """ dmr, dev0, dev1 = isolated_dmr_x2 + proxy = dmr.peer_accessible_by # use the local-binding form for predictable counting - dmr.peer_accessible_by |= {dev1} + proxy |= {dev1} assert len(diff_spy.calls) == 1 diff_spy.calls.clear() # &= keeping the lone member: no driver call (no diff). - dmr.peer_accessible_by &= {dev1} + proxy &= {dev1} assert diff_spy.calls == [] # &= dropping the lone member: one removal. - dmr.peer_accessible_by &= {dev0} + proxy &= {dev0} assert len(diff_spy.calls) == 1 diff_spy.calls.clear() # ^= toggling the lone peer in then out: two ops, one call each. - dmr.peer_accessible_by ^= {dev1} + proxy ^= {dev1} assert len(diff_spy.calls) == 1 - dmr.peer_accessible_by ^= {dev1} + proxy ^= {dev1} assert len(diff_spy.calls) == 2 diff_spy.calls.clear() # update() with the peer already absent: one add. - dmr.peer_accessible_by.update([dev1]) + proxy.update([dev1]) assert len(diff_spy.calls) == 1 diff_spy.calls.clear() # clear() with one member: one removal. - dmr.peer_accessible_by.clear() + proxy.clear() assert len(diff_spy.calls) == 1 diff_spy.calls.clear() # Already-empty bulk ops are no-ops (nothing to add or remove). - dmr.peer_accessible_by.clear() - dmr.peer_accessible_by.difference_update([dev1]) - dmr.peer_accessible_by -= {dev1} + proxy.clear() + proxy.difference_update([dev1]) + proxy -= {dev1} assert diff_spy.calls == [] From e89c19bf286381921b62b6e0d61a84353d6c3227 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 4 May 2026 17:12:21 -0700 Subject: [PATCH 6/8] cuda.core: spy on the driver call directly to assert peer-access batching Move the actual ``cuMemPoolSetAccess`` invocation (descriptor-array build + driver call) into a thin Python-visible ``def _set_pool_access`` in ``_peer_access_utils.pyx``. ``_apply_peer_access_diff`` now does only the empty-diff short-circuit and delegates the work to ``_set_pool_access``, which Cython resolves through the module globals at runtime so tests can intercept it via ``monkeypatch.setattr``. Replace the previous internal-wrapper spy with a driver-call spy that counts every real ``cuMemPoolSetAccess`` invocation. Earlier no-op layers (e.g. the augmented-assignment-on-property pattern that writes an already-mutated proxy back through the setter) short-circuit before reaching ``_set_pool_access``, so the recorded count is exactly the number of driver calls. The empty-delta filter and the local-binding workaround in the bulk-ops test are gone; we now also assert that ``dmr.peer_accessible_by |= {...}`` directly on the property is still exactly one driver call. Co-authored-by: Cursor --- .../cuda/core/_memory/_peer_access_utils.pyx | 35 +++--- cuda_core/tests/test_memory_peer_access.py | 107 +++++++++--------- 2 files changed, 76 insertions(+), 66 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx index b1f92452f4b..ed26a4f0f2a 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx @@ -122,17 +122,22 @@ cdef inline bint _peer_access_includes(DeviceMemoryResource mr, int dev_id): return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE -cdef inline _apply_peer_access_diff_cython( - DeviceMemoryResource mr, tuple to_add, tuple to_remove -): - """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas.""" +def _set_pool_access(mr, tuple to_add, tuple to_remove): + """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas. + + The thin Python-callable layer that wraps the actual driver call: building + the ``CUmemAccessDesc`` array and invoking ``cuMemPoolSetAccess`` happens + in here. Tests monkeypatch this on the module to spy on real driver work + without intercepting earlier no-op paths. + + Preconditions: ``len(to_add) + len(to_remove) > 0`` (the caller is + responsible for skipping empty diffs). + """ + cdef DeviceMemoryResource mr_typed = mr cdef size_t count = len(to_add) + len(to_remove) cdef cydriver.CUmemAccessDesc* access_desc = NULL cdef size_t i = 0 - if count == 0: - return - access_desc = PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc)) if access_desc == NULL: raise MemoryError("Failed to allocate memory for access descriptors") @@ -150,21 +155,25 @@ cdef inline _apply_peer_access_diff_cython( i += 1 with nogil: - HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(mr._h_pool), access_desc, count)) + HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(mr_typed._h_pool), access_desc, count)) finally: if access_desc != NULL: PyMem_Free(access_desc) def _apply_peer_access_diff(mr, to_add, to_remove): - """Python-visible wrapper around the cdef inline implementation. + """Apply a peer-access diff in at most one driver call. Every write path on :class:`PeerAccessibleBySetProxy` and the - ``peer_accessible_by`` setter route through this function so tests can - intercept it via ``monkeypatch.setattr`` on the module to assert the - "exactly one ``cuMemPoolSetAccess`` per bulk op" batching contract. + ``peer_accessible_by`` setter routes through this function. Empty diffs + short-circuit here so the driver-level helper :func:`_set_pool_access` is + only invoked when there is actual work for ``cuMemPoolSetAccess`` to do. """ - _apply_peer_access_diff_cython(mr, tuple(to_add), tuple(to_remove)) + add_tuple = tuple(to_add) + remove_tuple = tuple(to_remove) + if not add_tuple and not remove_tuple: + return + _set_pool_access(mr, add_tuple, remove_tuple) cpdef replace_peer_accessible_by(DeviceMemoryResource mr, devices): diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index b5e0e47dffe..4ec24eabfc7 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -308,16 +308,15 @@ def test_peer_accessible_by_returns_proxy_type(isolated_dmr_x2): # --------------------------------------------------------------------------- -class _DiffSpy: - """Counts batched driver calls and forwards each call to the real wrapper. - - Only invocations with non-empty deltas are recorded in :attr:`calls`, - because those are the ones that translate to actual ``cuMemPoolSetAccess`` - work. The wrapper still gets called with empty deltas in a few places - (e.g. the property setter when the requested target matches current - driver state), but the underlying ``cdef inline`` short-circuits before - issuing the driver call, so those invocations do not count against the - "one call per bulk op" contract. +class _DriverCallSpy: + """Records every actual ``cuMemPoolSetAccess`` invocation. + + Spies on :func:`cuda.core._memory._peer_access_utils._set_pool_access` — + the thin Python-visible wrapper that builds the descriptor array and + issues the single driver call. Earlier no-op layers (e.g. the + augmented-assignment-on-property quirk that reassigns an already-mutated + proxy back through the setter) short-circuit before reaching here, so the + recorded count is exactly the number of real ``cuMemPoolSetAccess`` calls. """ def __init__(self, real): @@ -325,87 +324,89 @@ def __init__(self, real): self.calls = [] def __call__(self, mr, to_add, to_remove): - recorded_add = tuple(to_add) - recorded_remove = tuple(to_remove) - if recorded_add or recorded_remove: - self.calls.append((recorded_add, recorded_remove)) + self.calls.append((tuple(to_add), tuple(to_remove))) self._real(mr, to_add, to_remove) @pytest.fixture -def diff_spy(monkeypatch): - spy = _DiffSpy(_peer_access_utils._apply_peer_access_diff) - monkeypatch.setattr(_peer_access_utils, "_apply_peer_access_diff", spy) +def driver_spy(monkeypatch): + spy = _DriverCallSpy(_peer_access_utils._set_pool_access) + monkeypatch.setattr(_peer_access_utils, "_set_pool_access", spy) return spy -def test_peer_accessible_by_setter_batches_one_call(diff_spy, isolated_dmr_x2): +def test_peer_accessible_by_setter_batches_one_call(driver_spy, isolated_dmr_x2): """``mr.peer_accessible_by = [...]`` issues exactly one driver call (or zero on no-op).""" dmr, dev0, dev1 = isolated_dmr_x2 dmr.peer_accessible_by = [dev1] - assert len(diff_spy.calls) == 1 - assert dev1.device_id in diff_spy.calls[-1][0] + assert len(driver_spy.calls) == 1 + assert dev1.device_id in driver_spy.calls[-1][0] # Reassigning the same set is a no-op (zero driver calls). - diff_spy.calls.clear() + driver_spy.calls.clear() dmr.peer_accessible_by = [dev1] - assert diff_spy.calls == [] + assert driver_spy.calls == [] # Revoking everything is a single call (one removal). dmr.peer_accessible_by = [] - assert len(diff_spy.calls) == 1 - assert dev1.device_id in diff_spy.calls[-1][1] - - -def test_peer_accessible_by_bulk_ops_batch_one_call(diff_spy, isolated_dmr_x2): - """``|=``/``&=``/``-=``/``^=``/``update``/``clear`` each issue at most one driver call. - - .. note:: - ``proxy = dmr.peer_accessible_by; proxy |= {...}`` is exactly one driver - call. Augmented assignment directly on the property - (``dmr.peer_accessible_by |= {...}``) is two operations because Python - fetches the proxy, mutates it, and then assigns the (already-mutated) - proxy back through the setter; that final assignment is a no-op at the - driver level (deltas come out empty), but it is two trips through the - proxy/setter machinery. The spy filters out empty-delta calls so the - invariant under test is "actual driver work" rather than that - Python-level quirk. + assert len(driver_spy.calls) == 1 + assert dev1.device_id in driver_spy.calls[-1][1] + + +def test_peer_accessible_by_bulk_ops_batch_one_call(driver_spy, isolated_dmr_x2): + """Every bulk op issues exactly one ``cuMemPoolSetAccess`` (or zero on no-op). + + Covers ``|=``, ``&=``, ``^=``, ``update``, ``difference_update``, and + ``clear``. Both operand styles are exercised: a locally bound proxy + (``proxy |= {...}``) and augmented assignment directly on the property + (``dmr.peer_accessible_by |= {...}``). The latter trips Python's + augmented-assignment-on-property pattern (fetch proxy, mutate, write + back through setter) but the trailing setter call discovers an empty + diff and short-circuits before reaching the driver, so the count is + still one. """ dmr, dev0, dev1 = isolated_dmr_x2 - proxy = dmr.peer_accessible_by # use the local-binding form for predictable counting + proxy = dmr.peer_accessible_by proxy |= {dev1} - assert len(diff_spy.calls) == 1 - diff_spy.calls.clear() + assert len(driver_spy.calls) == 1 + driver_spy.calls.clear() # &= keeping the lone member: no driver call (no diff). proxy &= {dev1} - assert diff_spy.calls == [] + assert driver_spy.calls == [] # &= dropping the lone member: one removal. proxy &= {dev0} - assert len(diff_spy.calls) == 1 - diff_spy.calls.clear() + assert len(driver_spy.calls) == 1 + driver_spy.calls.clear() # ^= toggling the lone peer in then out: two ops, one call each. proxy ^= {dev1} - assert len(diff_spy.calls) == 1 + assert len(driver_spy.calls) == 1 proxy ^= {dev1} - assert len(diff_spy.calls) == 2 - diff_spy.calls.clear() + assert len(driver_spy.calls) == 2 + driver_spy.calls.clear() # update() with the peer already absent: one add. proxy.update([dev1]) - assert len(diff_spy.calls) == 1 - diff_spy.calls.clear() + assert len(driver_spy.calls) == 1 + driver_spy.calls.clear() # clear() with one member: one removal. proxy.clear() - assert len(diff_spy.calls) == 1 - diff_spy.calls.clear() + assert len(driver_spy.calls) == 1 + driver_spy.calls.clear() # Already-empty bulk ops are no-ops (nothing to add or remove). proxy.clear() proxy.difference_update([dev1]) proxy -= {dev1} - assert diff_spy.calls == [] + assert driver_spy.calls == [] + + # Augmented assignment directly on the property is also one driver call: + # the proxy mutates the pool via __ior__, the setter writes back an + # already-up-to-date proxy, and the empty-diff short-circuit prevents a + # second driver call. + dmr.peer_accessible_by |= {dev1} + assert len(driver_spy.calls) == 1 From 774acf159949ce8c0182f8841d9b3144e29918bd Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 4 May 2026 17:20:31 -0700 Subject: [PATCH 7/8] cuda.core: split MutableSet conformance helpers by capacity Replace the ``support_multi_insert`` flag and ``non_members`` keyword with two purpose-built helpers: - ``assert_mutable_set_interface(subject, items)`` keeps the original signature and contract: at least five distinct insertable items, exercised against a reference set in the standard way. The graph ``AdjacencySetProxy`` test continues to use this unchanged. - ``assert_single_member_mutable_set_interface(subject, member, non_member)`` is a focused pass for proxies whose backing store admits at most one insertable element at a time (here, the peer-access view on a 2-GPU box). It threads a single member and one non-member sentinel through every ``MutableSet`` method. The two helpers share small private utilities (empty-state checks, ``__repr__`` shape) but keep their public surfaces small and linear. A capacity-one proxy is a meaningfully different contract from a general mutable set; naming that explicitly in the API reads better than a flag and avoids forcing call sites to plumb sentinels through. Co-authored-by: Cursor --- .../helpers/collection_interface_testers.py | 151 ++++++++---------- cuda_core/tests/test_memory_peer_access.py | 19 ++- 2 files changed, 77 insertions(+), 93 deletions(-) diff --git a/cuda_core/tests/helpers/collection_interface_testers.py b/cuda_core/tests/helpers/collection_interface_testers.py index d719c47aa3f..63fbed8b381 100644 --- a/cuda_core/tests/helpers/collection_interface_testers.py +++ b/cuda_core/tests/helpers/collection_interface_testers.py @@ -1,70 +1,63 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Reusable helpers to verify collections.abc protocol conformance.""" +"""Reusable helpers to verify collections.abc protocol conformance. + +Two helpers are provided for ``MutableSet``-like subjects, picked by the +capacity of the backing store: + +- :func:`assert_mutable_set_interface` is the standard pass; it requires at + least five distinct insertable items so every method (including the + multi-element bulk operators) can be exercised. +- :func:`assert_single_member_mutable_set_interface` is a focused pass for + proxies whose backing store admits at most one insertable element at a time + (for example, a peer-access view on a system with one valid peer device). + It runs every ``MutableSet`` method at least once using a single member and + one non-member sentinel. + +The two helpers are intentionally separate rather than one helper with a +mode flag: a single-member proxy is a substantially different contract +("capacity one, by hardware") and naming it explicitly in the API keeps each +helper's signature small and its assertions linear. +""" from collections.abc import MutableSet, Set import pytest -def assert_mutable_set_interface( - subject, - items, - *, - non_members=None, - support_multi_insert=True, -): - """Exercise every MutableSet method on *subject* against a reference set. +def _assert_empty(subject): + """Assertions that hold for any empty MutableSet-like subject.""" + assert isinstance(subject, Set) + assert isinstance(subject, MutableSet) + assert len(subject) == 0 + assert subject == set() + assert list(subject) == [] + + +def _assert_repr_nonempty(subject): + """``__repr__`` produces a non-empty string.""" + r = repr(subject) + assert isinstance(r, str) + assert len(r) > 0 - Two modes are supported: - - ``support_multi_insert=True`` (default): the standard protocol pass that - inserts up to five distinct elements simultaneously. Use this whenever - the subject can hold an arbitrary number of items. - - ``support_multi_insert=False``: a reduced pass for proxies whose backing - store admits at most one insertable element at a time (e.g. a peer-access - view on a 2-GPU system). The subject only ever holds ``{}`` or ``{a}``, - and ``non_members`` supplies sentinel values used as the *other* side of - comparisons, ``isdisjoint``, subset/superset, and binary/in-place - operators. Every ``MutableSet`` method is still exercised at least once. +def assert_mutable_set_interface(subject, items): + """Exercise every MutableSet method on *subject* against a reference set. Parameters ---------- subject : MutableSet An **empty** mutable-set-like object to test. items : sequence - Distinct, hashable objects valid for insertion into *subject*. With - ``support_multi_insert=True`` at least five are required; with - ``support_multi_insert=False`` exactly one is needed (extras ignored). - non_members : sequence, optional - Distinct, hashable values that compare equal across set semantics but - are guaranteed *never* to be inserted into *subject* (typically because - the backing store rejects them). Required and used only when - ``support_multi_insert=False``; at least two are needed there. + At least five distinct, hashable objects valid for insertion into + *subject*. """ - if support_multi_insert: - _assert_mutable_set_interface_multi(subject, items) - else: - if non_members is None: - raise TypeError("non_members is required when support_multi_insert=False") - _assert_mutable_set_interface_single(subject, items, non_members) - - -def _assert_mutable_set_interface_multi(subject, items): assert len(items) >= 5 a, b, c, d, e = items[:5] ref = set() - # -- ABC conformance -- - assert isinstance(subject, Set) - assert isinstance(subject, MutableSet) - - # -- empty state -- - assert len(subject) == 0 - assert subject == ref - assert subject == set() - assert list(subject) == [] + _assert_empty(subject) # -- add -- subject.add(a) @@ -169,39 +162,37 @@ def _assert_mutable_set_interface_multi(subject, items): # -- __iter__ -- assert set(subject) == ref - # -- __repr__ -- - r = repr(subject) - assert isinstance(r, str) - assert len(r) > 0 + _assert_repr_nonempty(subject) -def _assert_mutable_set_interface_single(subject, items, non_members): - """Reduced protocol pass for subjects that admit at most one insertable element. +def assert_single_member_mutable_set_interface(subject, member, non_member): + """Exercise every MutableSet method on a subject with capacity one. - Invariants: - - ``a`` is the lone insertable element; the subject only ever holds ``{}`` - or ``{a}``. - - ``x``, ``y`` are sentinels guaranteed *not* to be inserted; they appear - only on the right-hand side of operators and comparisons. They must - compare correctly under set semantics (i.e. equality and hashing). - """ - assert len(items) >= 1 - assert len(non_members) >= 2 - a = items[0] - x, y = non_members[:2] - assert a != x and a != y and x != y, "items[0] and non_members[:2] must be distinct" + Use this for proxies whose backing store admits at most one insertable + element at a time (typically because the underlying resource is bounded + by hardware, e.g. a peer-access view on a system with a single valid + peer device). The subject only ever holds ``set()`` or ``{member}``; + *non_member* supplies the right-hand side of comparisons, ``isdisjoint``, + subset/superset, and binary/in-place operators so every ``MutableSet`` + method is exercised at least once. + Parameters + ---------- + subject : MutableSet + An **empty** mutable-set-like object to test. + member : hashable + A distinct, hashable object valid for insertion into *subject*. + non_member : hashable + A distinct, hashable object that compares correctly under set + semantics but is guaranteed never to be inserted into *subject* + (typically because the backing store rejects it). + """ + assert member != non_member, "member and non_member must be distinct" + a = member + x = non_member ref = set() - # -- ABC conformance -- - assert isinstance(subject, Set) - assert isinstance(subject, MutableSet) - - # -- empty state -- - assert len(subject) == 0 - assert subject == ref - assert subject == set() - assert list(subject) == [] + _assert_empty(subject) # -- add -- subject.add(a) @@ -222,7 +213,7 @@ def _assert_mutable_set_interface_single(subject, items, non_members): assert subject != set() # -- isdisjoint -- - assert subject.isdisjoint({x, y}) + assert subject.isdisjoint({x}) assert not subject.isdisjoint({a, x}) # -- subset / superset -- @@ -287,16 +278,13 @@ def _assert_mutable_set_interface_single(subject, items, non_members): assert subject == ref # -- in-place intersection (&=) drops the lone member -- - subject &= {x, y} - ref &= {x, y} + subject &= {x} + ref &= {x} assert subject == ref - # -- in-place union via bulk path again, ahead of -= and ^= -- + # -- in-place difference (-=) on non-member is a no-op -- subject |= {a} ref |= {a} - assert subject == ref - - # -- in-place difference (-=) on non-member is a no-op -- subject -= {x} ref -= {x} assert subject == ref @@ -327,7 +315,4 @@ def _assert_mutable_set_interface_single(subject, items, non_members): ref.add(a) assert set(subject) == ref - # -- __repr__ -- - r = repr(subject) - assert isinstance(r, str) - assert len(r) > 0 + _assert_repr_nonempty(subject) diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index 4ec24eabfc7..70add843a90 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -3,7 +3,7 @@ import pytest from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer -from helpers.collection_interface_testers import assert_mutable_set_interface +from helpers.collection_interface_testers import assert_single_member_mutable_set_interface from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, system from cuda.core._memory import _peer_access_utils @@ -168,19 +168,18 @@ def isolated_dmr_x2(mempool_device_x2): def test_peer_accessible_by_mutable_set_interface(isolated_dmr_x2): - """Run the full MutableSet protocol against a single-peer driver-backed view. + """Run the MutableSet protocol against a single-peer driver-backed view. - On a 2-GPU box the proxy can only ever hold ``{dev1}``. The relaxed helper - uses ``dev0`` (the owner, which the proxy refuses to insert) as the - non-member sentinel; this exercises every ``MutableSet`` method while - respecting the proxy's "insert at most one" hardware reality. + On a 2-GPU box the proxy can only ever hold ``{dev1}`` because there is a + single valid peer. The capacity-one helper exercises every ``MutableSet`` + method using ``dev1`` as the lone insertable element and ``dev0`` (the + owner, which the proxy refuses to insert) as the non-member sentinel. """ dmr, dev0, dev1 = isolated_dmr_x2 - assert_mutable_set_interface( + assert_single_member_mutable_set_interface( dmr.peer_accessible_by, - items=[dev1], - non_members=[dev0, dev0.device_id], - support_multi_insert=False, + member=dev1, + non_member=dev0, ) From 3233538afb57e031cc0337d1bf11ea540b34efd9 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 6 May 2026 13:41:59 -0700 Subject: [PATCH 8/8] Address review feedback on peer_accessible_by proxy - Optimize add/discard to check the single target device (1 driver call) instead of scanning all devices via _query_peer_access_ids - Add test for out-of-range int in __contains__ (returns False) - Fix stale comment referencing removed support_multi_insert flag - Add #2018 link to release notes entry Co-authored-by: Cursor --- .../cuda/core/_memory/_peer_access_utils.pyx | 18 ++++++++++++++++-- cuda_core/docs/source/release/1.0.0-notes.rst | 1 + cuda_core/tests/test_memory_peer_access.py | 10 ++++++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx index ed26a4f0f2a..8086aaff170 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx @@ -252,7 +252,16 @@ class PeerAccessibleBySetProxy(MutableSet): def add(self, value) -> None: """Grant peer access from ``value`` to allocations in this pool.""" - self._apply([value], ()) + dev_id = _resolve_peer_device_id(value) + cdef DeviceMemoryResource mr = self._mr + if dev_id == mr._dev_id: + return + if _peer_access_includes(mr, dev_id): + return + from cuda.core._device import Device + if not Device(mr._dev_id).can_access_peer(dev_id): + raise ValueError(f"Device {mr._dev_id} cannot access peer: {dev_id}") + _apply_peer_access_diff(mr, (dev_id,), ()) def discard(self, value) -> None: """Revoke peer access from ``value`` to allocations in this pool.""" @@ -260,7 +269,12 @@ class PeerAccessibleBySetProxy(MutableSet): dev_id = _resolve_peer_device_id(value) except (TypeError, ValueError): return - self._apply((), [dev_id]) + cdef DeviceMemoryResource mr = self._mr + if dev_id == mr._dev_id: + return + if not _peer_access_includes(mr, dev_id): + return + _apply_peer_access_diff(mr, (), (dev_id,)) # --- bulk overrides: one driver call per op --- diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 9b7bd65ff05..747b033c117 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -123,6 +123,7 @@ Breaking changes - :attr:`DeviceMemoryResource.peer_accessible_by` now returns a :class:`collections.abc.MutableSet` of :obj:`~_device.Device` objects instead of a sorted ``tuple[int, ...]``. The property setter is unchanged. + (`#2018 `__) - ``stream`` is now a required keyword-only argument on APIs that schedule work on a stream diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index 1002d96ad0b..71beb459143 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -147,10 +147,9 @@ def test_peer_access_shared_pool_queries_driver(mempool_device_x2): # # These tests exercise the ``PeerAccessibleBySetProxy`` surface added in # v1.0.0. They run against ``mempool_device_x2`` because every CI machine has -# at most 2 GPUs, which means at most one valid peer device. The relaxed -# ``support_multi_insert=False`` path on ``assert_mutable_set_interface`` -# threads that single insertable element through the full ``MutableSet`` -# protocol. +# at most 2 GPUs, which means at most one valid peer device. The +# ``assert_single_member_mutable_set_interface`` helper threads that single +# insertable element through the full ``MutableSet`` protocol. # --------------------------------------------------------------------------- @@ -244,6 +243,9 @@ def test_peer_accessible_by_rejects_invalid_inputs(isolated_dmr_x2): # __contains__: returns False on non-coercible values, never raises assert "not-a-device" not in dmr.peer_accessible_by + # __contains__: out-of-range int returns False, never raises + assert bad_id not in dmr.peer_accessible_by + # remove on a non-member raises KeyError (inherited from MutableSet) with pytest.raises(KeyError): dmr.peer_accessible_by.remove(dev1)