From 73d2d892eeddf871241316330e497c2c54fae914 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 4 May 2026 14:30:40 -0700
Subject: [PATCH 1/8] cuda.core: convert peer_accessible_by to a live
 MutableSet view

DeviceMemoryResource.peer_accessible_by previously returned a sorted
tuple[int, ...] backed by a Python-level cache, which was prone to
divergence from driver state across multiple wrappers around the same
memory pool. The setter accepted Device | int and emitted a single
batched cuMemPoolSetAccess covering the diff against the cache.

This commit replaces the property with a live driver-backed view:

- Adds PeerAccessibleBySetProxy in _memory/_peer_access_utils.py, a
  collections.abc.MutableSet whose reads call cuMemPoolGetAccess and
  whose writes call cuMemPoolSetAccess. Iteration yields Device
  objects; add, discard, and __contains__ accept either a Device or a
  device-ordinal int. The proxy is constructed fresh on every property
  access, so there is nothing to cache or pickle.

- Drops the _peer_accessible_by cache field (and its initializations
  in __cinit__, _DMR_init, and from_allocation_handle), eliminating
  the owned/non-owned read split. All pools now share the same code
  path and always query the driver.

- All bulk operations on the proxy (update, |=, &=, -=, ^=, clear,
  pop) issue exactly one cuMemPoolSetAccess call. Peer-access
  transitions can take seconds per pool because every existing memory
  mapping is updated, so coalescing into a single driver call lets the
  toolkit handle the mappings in parallel. The property setter
  (mr.peer_accessible_by = [...]) preserves its original single-call
  behavior via the same shared planner path.

- Single-element add validates can_access_peer through
  plan_peer_access_update, matching the existing setter contract.

This is a breaking change captured in the v1.0.0 release notes.
Callers comparing against tuples must update to set comparisons
(mr.peer_accessible_by == {Device(0)}). Existing tests are migrated;
new tests for set-interface conformance are intentionally deferred to
a follow-up.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../core/_memory/_device_memory_resource.pxd  |   1 -
 .../core/_memory/_device_memory_resource.pyx  | 172 +++++++++------
 .../cuda/core/_memory/_peer_access_utils.py   | 205 +++++++++++++++++-
 cuda_core/docs/source/api_private.rst         |   1 +
 cuda_core/docs/source/release/1.0.0-notes.rst |  13 ++
 .../tests/memory_ipc/test_peer_access.py      |  18 +-
 cuda_core/tests/test_memory_peer_access.py    |  26 +--
 7 files changed, 347 insertions(+), 89 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd
index a7f3bfd9585..0b7cd941e18 100644
--- a/cuda_core/cuda/core/_memory/_device_memory_resource.pxd
+++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd
@@ -9,7 +9,6 @@ from cuda.core._memory._ipc cimport IPCDataForMR
 cdef class DeviceMemoryResource(_MemPool):
     cdef:
         int _dev_id
-        object _peer_accessible_by
 
 
 cpdef DMR_mempool_get_access(DeviceMemoryResource, int)
diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
index 62df3cfb305..48beea80b71 100644
--- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
@@ -25,7 +25,11 @@ import multiprocessing
 import platform  # no-cython-lint
 import uuid
 
-from ._peer_access_utils import plan_peer_access_update
+from ._peer_access_utils import (
+    PeerAccessibleBySetProxy,
+    _resolve_peer_device_id,
+    plan_peer_access_update,
+)
 from cuda.core._utils.cuda_utils import check_multiprocessing_start_method
 
 __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions']
@@ -131,7 +135,6 @@ cdef class DeviceMemoryResource(_MemPool):
 
     def __cinit__(self, *args, **kwargs):
         self._dev_id = cydriver.CU_DEVICE_INVALID
-        self._peer_accessible_by = None
 
     def __init__(self, device_id: Device | int, options=None):
         _DMR_init(self, device_id, options)
@@ -191,7 +194,6 @@ cdef class DeviceMemoryResource(_MemPool):
             _ipc.MP_from_allocation_handle(cls, alloc_handle))
         from .._device import Device
         mr._dev_id = Device(device_id).device_id
-        mr._peer_accessible_by = ()
         return mr
 
     @property
@@ -213,34 +215,60 @@ cdef class DeviceMemoryResource(_MemPool):
     @property
     def peer_accessible_by(self):
         """
-        Get or set the devices that can access allocations from this memory
-        pool. Access can be modified at any time and affects all allocations
+        Live driver-backed set view of the devices that can access allocations
         from this memory pool.
 
-        Returns a tuple of sorted device IDs that currently have peer access to
-        allocations from this memory pool.
+        Returns a :class:`PeerAccessibleBySetProxy` (a
+        :class:`collections.abc.MutableSet`) whose reads call
+        ``cuMemPoolGetAccess`` and whose writes call ``cuMemPoolSetAccess``.
+        Iteration yields :class:`Device` objects; ``add``, ``discard``, and
+        ``__contains__`` accept either a :class:`Device` or a device-ordinal
+        ``int``. There is no in-memory cache, so the view always reflects the
+        current driver state and stays consistent across multiple wrappers
+        around the same pool.
 
-        When setting, accepts a sequence of :obj:`~_device.Device` objects or device IDs.
-        Setting to an empty sequence revokes all peer access.
+        When setting, accepts an iterable of :obj:`~_device.Device` objects or
+        device IDs. Setting replaces the full set in a single batched driver call.
 
-        For non-owned pools (the default or current device pool), the state
-        is always queried from the driver to reflect changes made by other
-        wrappers or direct driver calls.
+        Bulk operations (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``,
+        and the property setter) each issue exactly one ``cuMemPoolSetAccess``
+        call so the toolkit can update existing memory mappings in parallel.
 
         Examples
         --------
         >>> dmr = DeviceMemoryResource(0)
-        >>> dmr.peer_accessible_by = [1]  # Grant access to device 1
-        >>> assert dmr.peer_accessible_by == (1,)
-        >>> dmr.peer_accessible_by = []  # Revoke access
+        >>> dmr.peer_accessible_by.add(1)            # grant access to device 1
+        >>> assert dmr.peer_accessible_by == {Device(1)}
+        >>> dmr.peer_accessible_by |= {Device(2)}    # batched grant via |=
+        >>> dmr.peer_accessible_by = []              # revoke all in one call
         """
-        if not self._mempool_owned:
-            _DMR_query_peer_access(self)
-        return self._peer_accessible_by
+        return PeerAccessibleBySetProxy(self)
 
     @peer_accessible_by.setter
     def peer_accessible_by(self, devices):
-        _DMR_set_peer_accessible_by(self, devices)
+        _DMR_replace_peer_accessible_by(self, devices)
+
+    def _query_peer_access_ids(self):
+        """Return the current peer device IDs as a sorted tuple of ints.
+
+        Always queries the driver via ``cuMemPoolGetAccess`` for every visible
+        device. Used by :class:`PeerAccessibleBySetProxy` for ``__iter__`` and
+        ``__len__``.
+        """
+        return _DMR_query_peer_access_ids(self)
+
+    def _peer_access_includes(self, int dev_id) -> bool:
+        """Return True if peer access from ``dev_id`` is currently granted."""
+        return _DMR_peer_access_includes(self, dev_id)
+
+    def _apply_peer_access_diff(self, to_add, to_remove):
+        """Issue a single ``cuMemPoolSetAccess`` for the given add/remove deltas.
+
+        ``to_add`` and ``to_remove`` are iterables of device-ordinal ints.
+        Both must already be filtered (no owner, no overlap, no duplicates).
+        Used by :class:`PeerAccessibleBySetProxy` for batched writes.
+        """
+        _DMR_apply_peer_access_diff(self, tuple(to_add), tuple(to_remove))
 
     @property
     def is_device_accessible(self) -> bool:
@@ -253,8 +281,8 @@ cdef class DeviceMemoryResource(_MemPool):
         return False
 
 
-cdef inline _DMR_query_peer_access(DeviceMemoryResource self):
-    """Query the driver for the actual peer access state of this pool."""
+cdef inline tuple _DMR_query_peer_access_ids(DeviceMemoryResource self):
+    """Return the current peer device IDs as a sorted tuple of ints."""
     cdef int total
     cdef cydriver.CUmemAccess_flags flags
     cdef cydriver.CUmemLocation location
@@ -273,59 +301,74 @@ cdef inline _DMR_query_peer_access(DeviceMemoryResource self):
         if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE:
             peers.append(dev_id)
 
-    self._peer_accessible_by = tuple(sorted(peers))
+    return tuple(sorted(peers))
 
 
-cdef inline _DMR_set_peer_accessible_by(DeviceMemoryResource self, devices):
-    from .._device import Device
+cdef inline bint _DMR_peer_access_includes(DeviceMemoryResource self, int dev_id):
+    """Return True if peer access from ``dev_id`` is currently granted."""
+    cdef cydriver.CUmemAccess_flags flags
+    cdef cydriver.CUmemLocation location
 
-    this_dev = Device(self._dev_id)
-    cdef object resolve_device_id = lambda dev: Device(dev).device_id
-    cdef object plan
-    cdef tuple target_ids
-    cdef tuple to_add
-    cdef tuple to_rm
-    if not self._mempool_owned:
-        _DMR_query_peer_access(self)
-    plan = plan_peer_access_update(
-        owner_device_id=self._dev_id,
-        current_peer_ids=self._peer_accessible_by,
-        requested_devices=devices,
-        resolve_device_id=resolve_device_id,
-        can_access_peer=this_dev.can_access_peer,
-    )
-    target_ids = plan.target_ids
-    to_add = plan.to_add
-    to_rm = plan.to_remove
-    cdef size_t count = len(to_add) + len(to_rm)
+    location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+    location.id = dev_id
+    with nogil:
+        HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(self._h_pool), &location))
+    return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+
+
+cdef inline _DMR_apply_peer_access_diff(
+    DeviceMemoryResource self, tuple to_add, tuple to_remove
+):
+    """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas."""
+    cdef size_t count = len(to_add) + len(to_remove)
     cdef cydriver.CUmemAccessDesc* access_desc = NULL
     cdef size_t i = 0
 
-    if count > 0:
-        access_desc = <cydriver.CUmemAccessDesc*>PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc))
-        if access_desc == NULL:
-            raise MemoryError("Failed to allocate memory for access descriptors")
+    if count == 0:
+        return
+
+    access_desc = <cydriver.CUmemAccessDesc*>PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc))
+    if access_desc == NULL:
+        raise MemoryError("Failed to allocate memory for access descriptors")
+
+    try:
+        for dev_id in to_add:
+            access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+            access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+            access_desc[i].location.id = dev_id
+            i += 1
+        for dev_id in to_remove:
+            access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE
+            access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+            access_desc[i].location.id = dev_id
+            i += 1
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count))
+    finally:
+        if access_desc != NULL:
+            PyMem_Free(access_desc)
 
-        try:
-            for dev_id in to_add:
-                access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
-                access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-                access_desc[i].location.id = dev_id
-                i += 1
 
-            for dev_id in to_rm:
-                access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE
-                access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-                access_desc[i].location.id = dev_id
-                i += 1
+cdef inline _DMR_replace_peer_accessible_by(DeviceMemoryResource self, devices):
+    """Replace the full peer-access set in a single batched driver call.
 
-            with nogil:
-                HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count))
-        finally:
-            if access_desc != NULL:
-                PyMem_Free(access_desc)
+    Backs the ``mr.peer_accessible_by = [...]`` setter. Uses the same planner
+    as the proxy's bulk ops; the only difference is that adds and removes are
+    derived from the symmetric difference between current driver state and the
+    requested target set.
+    """
+    from .._device import Device
 
-        self._peer_accessible_by = tuple(target_ids)
+    this_dev = Device(self._dev_id)
+    plan = plan_peer_access_update(
+        owner_device_id=self._dev_id,
+        current_peer_ids=_DMR_query_peer_access_ids(self),
+        requested_devices=devices,
+        resolve_device_id=_resolve_peer_device_id,
+        can_access_peer=this_dev.can_access_peer,
+    )
+    _DMR_apply_peer_access_diff(self, plan.to_add, plan.to_remove)
 
 
 cdef inline _DMR_init(DeviceMemoryResource self, device_id, options):
@@ -351,7 +394,6 @@ cdef inline _DMR_init(DeviceMemoryResource self, device_id, options):
         self._mempool_owned = False
         MP_raise_release_threshold(self)
     else:
-        self._peer_accessible_by = ()
         MP_init_create_pool(
             self,
             cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE,
diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.py b/cuda_core/cuda/core/_memory/_peer_access_utils.py
index e08de69f2c7..4589612ff62 100644
--- a/cuda_core/cuda/core/_memory/_peer_access_utils.py
+++ b/cuda_core/cuda/core/_memory/_peer_access_utils.py
@@ -4,8 +4,13 @@
 
 from __future__ import annotations
 
-from collections.abc import Callable, Iterable
+from collections.abc import Callable, Iterable, MutableSet
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from cuda.core._device import Device
+    from cuda.core._memory._device_memory_resource import DeviceMemoryResource
 
 
 @dataclass(frozen=True)
@@ -57,3 +62,201 @@ def plan_peer_access_update(
         to_add=tuple(sorted(target_id_set - current_ids)),
         to_remove=tuple(sorted(current_ids - target_id_set)),
     )
+
+
+def _resolve_peer_device_id(value: Device | int) -> int:
+    """Coerce ``Device | int`` into a device-ordinal int."""
+    from cuda.core._device import Device
+
+    return Device(value).device_id
+
+
+class PeerAccessibleBySetProxy(MutableSet):
+    """Live driver-backed view of the peer devices granted access to a memory pool.
+
+    Reads (``__contains__``, ``__iter__``, ``len(...)``) call ``cuMemPoolGetAccess``;
+    writes (``add``, ``discard``, and bulk ops) call ``cuMemPoolSetAccess``. There
+    is no in-memory mirror, so the view always reflects the current driver state
+    and stays consistent across multiple wrappers around the same pool.
+
+    Iteration yields :class:`~cuda.core.Device` objects. ``add``, ``discard``, and
+    ``__contains__`` accept either a :class:`~cuda.core.Device` or a device-ordinal
+    ``int``; the owner device is silently ignored when supplied.
+
+    All bulk operations (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``)
+    issue exactly one ``cuMemPoolSetAccess`` call. This matters: peer-access
+    transitions can take seconds per pool because every existing memory mapping
+    is updated, so coalescing into a single driver call lets the toolkit handle
+    the mappings in parallel.
+    """
+
+    __slots__ = ("_mr",)
+
+    def __init__(self, mr: DeviceMemoryResource):
+        self._mr = mr
+
+    @classmethod
+    def _from_iterable(cls, it):
+        # Binary set operators (&, |, -, ^) collect their result through
+        # _from_iterable. Returning a plain set lets the user reason about
+        # the result independently of any pool's driver state.
+        return set(it)
+
+    # --- abstract MutableSet methods ---
+
+    def __contains__(self, value) -> bool:
+        try:
+            dev_id = _resolve_peer_device_id(value)
+        except (TypeError, ValueError):
+            return False
+        if dev_id == self._mr._dev_id:
+            return False
+        return self._mr._peer_access_includes(dev_id)
+
+    def __iter__(self):
+        from cuda.core._device import Device
+
+        return iter(Device(dev_id) for dev_id in self._mr._query_peer_access_ids())
+
+    def __len__(self) -> int:
+        return len(self._mr._query_peer_access_ids())
+
+    def add(self, value: Device | int) -> None:
+        """Grant peer access from ``value`` to allocations in this pool."""
+        self._apply([value], ())
+
+    def discard(self, value: Device | int) -> None:
+        """Revoke peer access from ``value`` to allocations in this pool."""
+        try:
+            dev_id = _resolve_peer_device_id(value)
+        except (TypeError, ValueError):
+            return
+        self._apply((), [dev_id])
+
+    # --- bulk overrides: one driver call per op ---
+
+    def clear(self) -> None:
+        """Revoke all peer access in a single driver call."""
+        self._apply((), self._mr._query_peer_access_ids())
+
+    def update(self, *others) -> None:
+        """Grant peer access to every device in ``others`` in one driver call."""
+        to_add = []
+        for other in others:
+            to_add.extend(other)
+        if to_add:
+            self._apply(to_add, ())
+
+    def difference_update(self, *others) -> None:
+        """Revoke peer access for every device in ``others`` in one driver call."""
+        revoke_ids: set[int] = set()
+        for other in others:
+            for value in other:
+                try:
+                    revoke_ids.add(_resolve_peer_device_id(value))
+                except (TypeError, ValueError):
+                    continue
+        current = set(self._mr._query_peer_access_ids())
+        to_remove = revoke_ids & current
+        if to_remove:
+            self._apply((), to_remove)
+
+    def intersection_update(self, *others) -> None:
+        """Restrict peer access to the intersection in a single driver call."""
+        keep_ids: set[int] | None = None
+        for other in others:
+            ids: set[int] = set()
+            for value in other:
+                try:
+                    ids.add(_resolve_peer_device_id(value))
+                except (TypeError, ValueError):
+                    continue
+            keep_ids = ids if keep_ids is None else keep_ids & ids
+        if keep_ids is None:
+            return  # ``set.intersection_update()`` with no args is a no-op
+        current = set(self._mr._query_peer_access_ids())
+        to_remove = current - keep_ids
+        if to_remove:
+            self._apply((), to_remove)
+
+    def symmetric_difference_update(self, other) -> None:
+        """Toggle peer access for every device in ``other`` in one driver call."""
+        toggle_ids: set[int] = set()
+        for value in other:
+            try:
+                toggle_ids.add(_resolve_peer_device_id(value))
+            except (TypeError, ValueError):
+                continue
+        current = set(self._mr._query_peer_access_ids())
+        to_add = toggle_ids - current
+        to_remove = toggle_ids & current
+        if to_add or to_remove:
+            self._apply(to_add, to_remove)
+
+    def __ior__(self, other):
+        self.update(other)
+        return self
+
+    def __iand__(self, other):
+        self.intersection_update(other)
+        return self
+
+    def __isub__(self, other):
+        if other is self:
+            self.clear()
+        else:
+            self.difference_update(other)
+        return self
+
+    def __ixor__(self, other):
+        self.symmetric_difference_update(other)
+        return self
+
+    def __repr__(self) -> str:
+        return f"PeerAccessibleBySetProxy({set(self)!r})"
+
+    # --- internal: route every write through one batched driver call ---
+
+    def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> None:
+        """Compute the diff and issue a single ``cuMemPoolSetAccess``.
+
+        ``additions`` and ``removals`` are user-supplied (``Device | int``);
+        only the owner device is filtered out. Adds are validated through
+        :meth:`Device.can_access_peer` via :func:`plan_peer_access_update`;
+        removals bypass that check (revoking is always permitted).
+        """
+        from cuda.core._device import Device
+
+        owner_id = self._mr._dev_id
+        owner = Device(owner_id)
+        current = self._mr._query_peer_access_ids()
+
+        # Plan additions through the existing helper (validates can_access_peer).
+        plan = plan_peer_access_update(
+            owner_device_id=owner_id,
+            current_peer_ids=current,
+            # union of (current set + requested adds) so the planner emits
+            # exactly the to_add deltas for these additions, no removals.
+            requested_devices=[*current, *additions],
+            resolve_device_id=_resolve_peer_device_id,
+            can_access_peer=owner.can_access_peer,
+        )
+        to_add = plan.to_add
+
+        # Removals: resolve, drop owner and unknowns, intersect with current.
+        current_set = set(current)
+        revoke_ids: set[int] = set()
+        for value in removals:
+            try:
+                dev_id = _resolve_peer_device_id(value)
+            except (TypeError, ValueError):
+                continue
+            if dev_id == owner_id:
+                continue
+            if dev_id in current_set:
+                revoke_ids.add(dev_id)
+        to_remove = tuple(sorted(revoke_ids))
+
+        if not to_add and not to_remove:
+            return
+        self._mr._apply_peer_access_diff(to_add, to_remove)
diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst
index 141773967e8..edb113dbd25 100644
--- a/cuda_core/docs/source/api_private.rst
+++ b/cuda_core/docs/source/api_private.rst
@@ -17,6 +17,7 @@ CUDA runtime
    :toctree: generated/
 
    typing.DevicePointerT
+   _memory._peer_access_utils.PeerAccessibleBySetProxy
    _memory._virtual_memory_resource.VirtualMemoryAllocationTypeT
    _memory._virtual_memory_resource.VirtualMemoryLocationTypeT
    _memory._virtual_memory_resource.VirtualMemoryGranularityT
diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst
index 3f61a30ec1e..5625cfde879 100644
--- a/cuda_core/docs/source/release/1.0.0-notes.rst
+++ b/cuda_core/docs/source/release/1.0.0-notes.rst
@@ -113,6 +113,19 @@ Breaking changes
     ``CUgraphConditionalHandle`` value. Previously, ``.handle`` had to be
     extracted explicitly.
 
+- :attr:`DeviceMemoryResource.peer_accessible_by` now returns a live
+  driver-backed :class:`collections.abc.MutableSet` view
+  (:class:`~_memory._peer_access_utils.PeerAccessibleBySetProxy`) instead of a
+  sorted ``tuple[int, ...]``. Iteration yields :obj:`~_device.Device` objects;
+  ``add``, ``discard``, and ``__contains__`` accept either a
+  :obj:`~_device.Device` or a device-ordinal ``int``. Callers comparing against
+  tuples must update to set comparisons (e.g.
+  ``mr.peer_accessible_by == {Device(0)}``). The property setter
+  (``mr.peer_accessible_by = [...]``) is unchanged. Bulk operations on the
+  proxy (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``) each issue
+  exactly one ``cuMemPoolSetAccess`` call so the toolkit can update existing
+  memory mappings in parallel.
+
 
 Fixes and enhancements
 -----------------------
diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py
index a3f83986701..f2623dd89e0 100644
--- a/cuda_core/tests/memory_ipc/test_peer_access.py
+++ b/cuda_core/tests/memory_ipc/test_peer_access.py
@@ -29,7 +29,7 @@ def test_main(self, mempool_device_x2):
         options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
         mr = DeviceMemoryResource(dev1, options=options)
         mr.peer_accessible_by = [dev0]
-        assert mr.peer_accessible_by == (0,)
+        assert mr.peer_accessible_by == {dev0}
 
         # Spawn child process
         process = mp.Process(target=self.child_main, args=(mr,))
@@ -38,18 +38,18 @@ def test_main(self, mempool_device_x2):
         assert process.exitcode == 0
 
         # Verify parent's MR still has peer access set (independent state)
-        assert mr.peer_accessible_by == (0,)
+        assert mr.peer_accessible_by == {dev0}
         mr.close()
 
     def child_main(self, mr):
         Device(1).set_current()
         assert mr.is_mapped is True
         assert mr.device_id == 1
-        assert mr.peer_accessible_by == ()
+        assert mr.peer_accessible_by == set()
         mr.peer_accessible_by = [0]
-        assert mr.peer_accessible_by == (0,)
+        assert mr.peer_accessible_by == {Device(0)}
         mr.peer_accessible_by = []
-        assert mr.peer_accessible_by == ()
+        assert mr.peer_accessible_by == set()
         mr.close()
 
 
@@ -70,9 +70,9 @@ def test_main(self, mempool_device_x2, grant_access_in_parent):
         mr = DeviceMemoryResource(dev1, options=options)
         if grant_access_in_parent:
             mr.peer_accessible_by = [dev0]
-            assert mr.peer_accessible_by == (0,)
+            assert mr.peer_accessible_by == {dev0}
         else:
-            assert mr.peer_accessible_by == ()
+            assert mr.peer_accessible_by == set()
         buffer = mr.allocate(NBYTES)
         pgen = PatternGen(dev1, NBYTES)
         pgen.fill_buffer(buffer, seed=False)
@@ -108,14 +108,14 @@ def child_main(self, mr, buffer):
         # Test 3: Set peer access and verify buffer becomes accessible
         dev1.set_current()
         mr.peer_accessible_by = [0]
-        assert mr.peer_accessible_by == (0,)
+        assert mr.peer_accessible_by == {dev0}
         dev0.set_current()
         PatternGen(dev0, NBYTES).verify_buffer(buffer, seed=False)
 
         # Test 4: Revoke peer access and verify buffer becomes inaccessible
         dev1.set_current()
         mr.peer_accessible_by = []
-        assert mr.peer_accessible_by == ()
+        assert mr.peer_accessible_by == set()
         dev0.set_current()
         with pytest.raises(CUDAError, match="CUDA_ERROR_INVALID_VALUE"):
             PatternGen(dev0, NBYTES).verify_buffer(buffer, seed=False)
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index 1a790645864..c73ba3ffd1c 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -1,10 +1,10 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import pytest
 from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer
 
-from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions
+from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions
 from cuda.core._utils.cuda_utils import CUDAError
 
 NBYTES = 1024
@@ -100,18 +100,18 @@ def verify_state(state, pattern_seed):
     transitions = [(s0, s1) for s0 in states for s1 in states if s0 != s1]
     for init_state, final_state in transitions:
         dmrs[0].peer_accessible_by = init_state
-        assert dmrs[0].peer_accessible_by == init_state
+        assert dmrs[0].peer_accessible_by == {Device(i) for i in init_state}
         verify_state(init_state, pattern_seed)
         pattern_seed += 1
 
         dmrs[0].peer_accessible_by = final_state
-        assert dmrs[0].peer_accessible_by == final_state
+        assert dmrs[0].peer_accessible_by == {Device(i) for i in final_state}
         verify_state(final_state, pattern_seed)
         pattern_seed += 1
 
 
 def test_peer_access_shared_pool_queries_driver(mempool_device_x2):
-    """Non-owned pools always query the driver for peer access state."""
+    """All pools always query the driver, so wrappers see consistent state."""
     dev0, dev1 = mempool_device_x2
 
     # Grant peer access via one wrapper; a second wrapper must see it.
@@ -122,18 +122,18 @@ def test_peer_access_shared_pool_queries_driver(mempool_device_x2):
 
     # Revoke via dmr2; dmr1 must reflect the change immediately.
     dmr2.peer_accessible_by = []
-    assert dmr1.peer_accessible_by == ()
+    assert dmr1.peer_accessible_by == set()
 
     # Re-grant via dmr1. A fresh wrapper that has never read the
     # property must still query the driver before computing diffs
     # in the setter, so setting [] must discover and revoke the access.
     dmr1.peer_accessible_by = [dev1]
     dmr3 = DeviceMemoryResource(dev0)
-    assert dmr1.peer_accessible_by == (dev1.device_id,)
-    assert dmr2.peer_accessible_by == (dev1.device_id,)
-    assert dmr3.peer_accessible_by == (dev1.device_id,)
+    assert dmr1.peer_accessible_by == {dev1}
+    assert dmr2.peer_accessible_by == {dev1}
+    assert dmr3.peer_accessible_by == {dev1}
     dmr3.peer_accessible_by = []
-    assert DeviceMemoryResource(dev0).peer_accessible_by == ()
-    assert dmr1.peer_accessible_by == ()
-    assert dmr2.peer_accessible_by == ()
-    assert dmr3.peer_accessible_by == ()
+    assert DeviceMemoryResource(dev0).peer_accessible_by == set()
+    assert dmr1.peer_accessible_by == set()
+    assert dmr2.peer_accessible_by == set()
+    assert dmr3.peer_accessible_by == set()

From e34aacf0176cfe1883df25fb4bcb6c540600d244 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 4 May 2026 15:25:15 -0700
Subject: [PATCH 2/8] cuda.core: move peer-access internals into a single
 _peer_access_utils.pyx

The previous commit left DeviceMemoryResource carrying three pass-through
def methods (_query_peer_access_ids, _peer_access_includes,
_apply_peer_access_diff) whose only purpose was to give the pure-Python
proxy in _peer_access_utils.py a way to call cdef helpers in
_device_memory_resource.pyx. These methods served no public role and
cluttered the class API.

Promote _peer_access_utils.py to a Cython module so the proxy and the
driver-touching helpers can live together:

- Convert _peer_access_utils.py to _peer_access_utils.pyx. cimports
  cydriver and DeviceMemoryResource from the .pxd; uses nogil and direct
  CUmemAccessDesc packing identically to before.

- Move _DMR_query_peer_access_ids, _DMR_peer_access_includes,
  _DMR_apply_peer_access_diff, and _DMR_replace_peer_accessible_by from
  _device_memory_resource.pyx into the new module as cdef helpers (and a
  cpdef replace_peer_accessible_by entry point used by the property
  setter).

- Drop the three pass-through def methods from DeviceMemoryResource. The
  class is left with the property getter and setter only; everything else
  is module-level in _peer_access_utils.

- The proxy now calls the module-level cdef helpers directly instead of
  routing through methods on mr.

No behavior change. The public surface (PeerAccessibleBySetProxy,
plan_peer_access_update, normalize_peer_access_targets, PeerAccessPlan)
is preserved at the same import paths.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../core/_memory/_device_memory_resource.pyx  | 151 ++----------------
 ...access_utils.py => _peer_access_utils.pyx} | 145 ++++++++++++++---
 2 files changed, 134 insertions(+), 162 deletions(-)
 rename cuda_core/cuda/core/_memory/{_peer_access_utils.py => _peer_access_utils.pyx} (62%)

diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
index 48beea80b71..703e37b7bf0 100644
--- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx
@@ -18,18 +18,12 @@ from cuda.core._utils.cuda_utils cimport (
     check_or_create_options,
     HANDLE_RETURN,
 )
-from cpython.mem cimport PyMem_Malloc, PyMem_Free
-
 from dataclasses import dataclass
 import multiprocessing
 import platform  # no-cython-lint
 import uuid
 
-from ._peer_access_utils import (
-    PeerAccessibleBySetProxy,
-    _resolve_peer_device_id,
-    plan_peer_access_update,
-)
+from ._peer_access_utils import PeerAccessibleBySetProxy, replace_peer_accessible_by
 from cuda.core._utils.cuda_utils import check_multiprocessing_start_method
 
 __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions']
@@ -215,60 +209,27 @@ cdef class DeviceMemoryResource(_MemPool):
     @property
     def peer_accessible_by(self):
         """
-        Live driver-backed set view of the devices that can access allocations
+        Get or set the devices that can access allocations from this memory
+        pool. Access can be modified at any time and affects all allocations
         from this memory pool.
 
-        Returns a :class:`PeerAccessibleBySetProxy` (a
-        :class:`collections.abc.MutableSet`) whose reads call
-        ``cuMemPoolGetAccess`` and whose writes call ``cuMemPoolSetAccess``.
-        Iteration yields :class:`Device` objects; ``add``, ``discard``, and
-        ``__contains__`` accept either a :class:`Device` or a device-ordinal
-        ``int``. There is no in-memory cache, so the view always reflects the
-        current driver state and stays consistent across multiple wrappers
-        around the same pool.
-
-        When setting, accepts an iterable of :obj:`~_device.Device` objects or
-        device IDs. Setting replaces the full set in a single batched driver call.
-
-        Bulk operations (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``,
-        and the property setter) each issue exactly one ``cuMemPoolSetAccess``
-        call so the toolkit can update existing memory mappings in parallel.
+        Returns a set-like proxy of :obj:`~_device.Device` objects that manages
+        peer access. Inputs are accepted as either :obj:`~_device.Device`
+        objects or device-ordinal :class:`int` values.
 
         Examples
         --------
         >>> dmr = DeviceMemoryResource(0)
-        >>> dmr.peer_accessible_by.add(1)            # grant access to device 1
-        >>> assert dmr.peer_accessible_by == {Device(1)}
-        >>> dmr.peer_accessible_by |= {Device(2)}    # batched grant via |=
-        >>> dmr.peer_accessible_by = []              # revoke all in one call
+        >>> dmr.peer_accessible_by = {1}   # grant access to device 1
+        >>> assert 1 in dmr.peer_accessible_by
+        >>> dmr.peer_accessible_by.add(2)  # update access to include device 2
+        >>> dmr.peer_accessible_by = []    # revoke peer access
         """
         return PeerAccessibleBySetProxy(self)
 
     @peer_accessible_by.setter
     def peer_accessible_by(self, devices):
-        _DMR_replace_peer_accessible_by(self, devices)
-
-    def _query_peer_access_ids(self):
-        """Return the current peer device IDs as a sorted tuple of ints.
-
-        Always queries the driver via ``cuMemPoolGetAccess`` for every visible
-        device. Used by :class:`PeerAccessibleBySetProxy` for ``__iter__`` and
-        ``__len__``.
-        """
-        return _DMR_query_peer_access_ids(self)
-
-    def _peer_access_includes(self, int dev_id) -> bool:
-        """Return True if peer access from ``dev_id`` is currently granted."""
-        return _DMR_peer_access_includes(self, dev_id)
-
-    def _apply_peer_access_diff(self, to_add, to_remove):
-        """Issue a single ``cuMemPoolSetAccess`` for the given add/remove deltas.
-
-        ``to_add`` and ``to_remove`` are iterables of device-ordinal ints.
-        Both must already be filtered (no owner, no overlap, no duplicates).
-        Used by :class:`PeerAccessibleBySetProxy` for batched writes.
-        """
-        _DMR_apply_peer_access_diff(self, tuple(to_add), tuple(to_remove))
+        replace_peer_accessible_by(self, devices)
 
     @property
     def is_device_accessible(self) -> bool:
@@ -281,96 +242,6 @@ cdef class DeviceMemoryResource(_MemPool):
         return False
 
 
-cdef inline tuple _DMR_query_peer_access_ids(DeviceMemoryResource self):
-    """Return the current peer device IDs as a sorted tuple of ints."""
-    cdef int total
-    cdef cydriver.CUmemAccess_flags flags
-    cdef cydriver.CUmemLocation location
-    cdef list peers = []
-
-    with nogil:
-        HANDLE_RETURN(cydriver.cuDeviceGetCount(&total))
-
-    location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-    for dev_id in range(total):
-        if dev_id == self._dev_id:
-            continue
-        location.id = dev_id
-        with nogil:
-            HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(self._h_pool), &location))
-        if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE:
-            peers.append(dev_id)
-
-    return tuple(sorted(peers))
-
-
-cdef inline bint _DMR_peer_access_includes(DeviceMemoryResource self, int dev_id):
-    """Return True if peer access from ``dev_id`` is currently granted."""
-    cdef cydriver.CUmemAccess_flags flags
-    cdef cydriver.CUmemLocation location
-
-    location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-    location.id = dev_id
-    with nogil:
-        HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(self._h_pool), &location))
-    return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
-
-
-cdef inline _DMR_apply_peer_access_diff(
-    DeviceMemoryResource self, tuple to_add, tuple to_remove
-):
-    """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas."""
-    cdef size_t count = len(to_add) + len(to_remove)
-    cdef cydriver.CUmemAccessDesc* access_desc = NULL
-    cdef size_t i = 0
-
-    if count == 0:
-        return
-
-    access_desc = <cydriver.CUmemAccessDesc*>PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc))
-    if access_desc == NULL:
-        raise MemoryError("Failed to allocate memory for access descriptors")
-
-    try:
-        for dev_id in to_add:
-            access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
-            access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-            access_desc[i].location.id = dev_id
-            i += 1
-        for dev_id in to_remove:
-            access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE
-            access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-            access_desc[i].location.id = dev_id
-            i += 1
-
-        with nogil:
-            HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count))
-    finally:
-        if access_desc != NULL:
-            PyMem_Free(access_desc)
-
-
-cdef inline _DMR_replace_peer_accessible_by(DeviceMemoryResource self, devices):
-    """Replace the full peer-access set in a single batched driver call.
-
-    Backs the ``mr.peer_accessible_by = [...]`` setter. Uses the same planner
-    as the proxy's bulk ops; the only difference is that adds and removes are
-    derived from the symmetric difference between current driver state and the
-    requested target set.
-    """
-    from .._device import Device
-
-    this_dev = Device(self._dev_id)
-    plan = plan_peer_access_update(
-        owner_device_id=self._dev_id,
-        current_peer_ids=_DMR_query_peer_access_ids(self),
-        requested_devices=devices,
-        resolve_device_id=_resolve_peer_device_id,
-        can_access_peer=this_dev.can_access_peer,
-    )
-    _DMR_apply_peer_access_diff(self, plan.to_add, plan.to_remove)
-
-
 cdef inline _DMR_init(DeviceMemoryResource self, device_id, options):
     from .._device import Device
     cdef int dev_id = Device(device_id).device_id
diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.py b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
similarity index 62%
rename from cuda_core/cuda/core/_memory/_peer_access_utils.py
rename to cuda_core/cuda/core/_memory/_peer_access_utils.pyx
index 4589612ff62..22f055dcfb6 100644
--- a/cuda_core/cuda/core/_memory/_peer_access_utils.py
+++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
@@ -8,9 +8,14 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
+from cuda.bindings cimport cydriver
+from cuda.core._memory._device_memory_resource cimport DeviceMemoryResource
+from cuda.core._resource_handles cimport as_cu
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
+
 if TYPE_CHECKING:
     from cuda.core._device import Device
-    from cuda.core._memory._device_memory_resource import DeviceMemoryResource
 
 
 @dataclass(frozen=True)
@@ -64,13 +69,107 @@ def plan_peer_access_update(
     )
 
 
-def _resolve_peer_device_id(value: Device | int) -> int:
+def _resolve_peer_device_id(value):
     """Coerce ``Device | int`` into a device-ordinal int."""
     from cuda.core._device import Device
 
     return Device(value).device_id
 
 
+# ---- driver-touching helpers (cdef inline, called from .pyx code) -----------
+
+cdef inline tuple _query_peer_access_ids(DeviceMemoryResource mr):
+    """Return the current peer device IDs as a sorted tuple of ints."""
+    cdef int total
+    cdef cydriver.CUmemAccess_flags flags
+    cdef cydriver.CUmemLocation location
+    cdef list peers = []
+
+    with nogil:
+        HANDLE_RETURN(cydriver.cuDeviceGetCount(&total))
+
+    location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+    for dev_id in range(total):
+        if dev_id == mr._dev_id:
+            continue
+        location.id = dev_id
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(mr._h_pool), &location))
+        if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE:
+            peers.append(dev_id)
+
+    return tuple(sorted(peers))
+
+
+cdef inline bint _peer_access_includes(DeviceMemoryResource mr, int dev_id):
+    """Return True if peer access from ``dev_id`` is currently granted."""
+    cdef cydriver.CUmemAccess_flags flags
+    cdef cydriver.CUmemLocation location
+
+    location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+    location.id = dev_id
+    with nogil:
+        HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(mr._h_pool), &location))
+    return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+
+
+cdef inline _apply_peer_access_diff(
+    DeviceMemoryResource mr, tuple to_add, tuple to_remove
+):
+    """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas."""
+    cdef size_t count = len(to_add) + len(to_remove)
+    cdef cydriver.CUmemAccessDesc* access_desc = NULL
+    cdef size_t i = 0
+
+    if count == 0:
+        return
+
+    access_desc = <cydriver.CUmemAccessDesc*>PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc))
+    if access_desc == NULL:
+        raise MemoryError("Failed to allocate memory for access descriptors")
+
+    try:
+        for dev_id in to_add:
+            access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+            access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+            access_desc[i].location.id = dev_id
+            i += 1
+        for dev_id in to_remove:
+            access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE
+            access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+            access_desc[i].location.id = dev_id
+            i += 1
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(mr._h_pool), access_desc, count))
+    finally:
+        if access_desc != NULL:
+            PyMem_Free(access_desc)
+
+
+cpdef replace_peer_accessible_by(DeviceMemoryResource mr, devices):
+    """Replace the full peer-access set in a single batched driver call.
+
+    Backs the ``mr.peer_accessible_by = [...]`` setter. Uses the same planner
+    as the proxy's bulk ops; the only difference is that adds and removes are
+    derived from the symmetric difference between current driver state and the
+    requested target set.
+    """
+    from cuda.core._device import Device
+
+    this_dev = Device(mr._dev_id)
+    plan = plan_peer_access_update(
+        owner_device_id=mr._dev_id,
+        current_peer_ids=_query_peer_access_ids(mr),
+        requested_devices=devices,
+        resolve_device_id=_resolve_peer_device_id,
+        can_access_peer=this_dev.can_access_peer,
+    )
+    _apply_peer_access_diff(mr, plan.to_add, plan.to_remove)
+
+
+# ---- Python MutableSet proxy ------------------------------------------------
+
 class PeerAccessibleBySetProxy(MutableSet):
     """Live driver-backed view of the peer devices granted access to a memory pool.
 
@@ -92,7 +191,7 @@ class PeerAccessibleBySetProxy(MutableSet):
 
     __slots__ = ("_mr",)
 
-    def __init__(self, mr: DeviceMemoryResource):
+    def __init__(self, mr):
         self._mr = mr
 
     @classmethod
@@ -109,23 +208,24 @@ def __contains__(self, value) -> bool:
             dev_id = _resolve_peer_device_id(value)
         except (TypeError, ValueError):
             return False
-        if dev_id == self._mr._dev_id:
+        cdef DeviceMemoryResource mr = <DeviceMemoryResource>self._mr
+        if dev_id == mr._dev_id:
             return False
-        return self._mr._peer_access_includes(dev_id)
+        return _peer_access_includes(mr, dev_id)
 
     def __iter__(self):
         from cuda.core._device import Device
 
-        return iter(Device(dev_id) for dev_id in self._mr._query_peer_access_ids())
+        return iter(Device(dev_id) for dev_id in _query_peer_access_ids(self._mr))
 
     def __len__(self) -> int:
-        return len(self._mr._query_peer_access_ids())
+        return len(_query_peer_access_ids(self._mr))
 
-    def add(self, value: Device | int) -> None:
+    def add(self, value) -> None:
         """Grant peer access from ``value`` to allocations in this pool."""
         self._apply([value], ())
 
-    def discard(self, value: Device | int) -> None:
+    def discard(self, value) -> None:
         """Revoke peer access from ``value`` to allocations in this pool."""
         try:
             dev_id = _resolve_peer_device_id(value)
@@ -137,7 +237,7 @@ def discard(self, value: Device | int) -> None:
 
     def clear(self) -> None:
         """Revoke all peer access in a single driver call."""
-        self._apply((), self._mr._query_peer_access_ids())
+        self._apply((), _query_peer_access_ids(self._mr))
 
     def update(self, *others) -> None:
         """Grant peer access to every device in ``others`` in one driver call."""
@@ -149,23 +249,23 @@ def update(self, *others) -> None:
 
     def difference_update(self, *others) -> None:
         """Revoke peer access for every device in ``others`` in one driver call."""
-        revoke_ids: set[int] = set()
+        revoke_ids = set()
         for other in others:
             for value in other:
                 try:
                     revoke_ids.add(_resolve_peer_device_id(value))
                 except (TypeError, ValueError):
                     continue
-        current = set(self._mr._query_peer_access_ids())
+        current = set(_query_peer_access_ids(self._mr))
         to_remove = revoke_ids & current
         if to_remove:
             self._apply((), to_remove)
 
     def intersection_update(self, *others) -> None:
         """Restrict peer access to the intersection in a single driver call."""
-        keep_ids: set[int] | None = None
+        keep_ids = None
         for other in others:
-            ids: set[int] = set()
+            ids = set()
             for value in other:
                 try:
                     ids.add(_resolve_peer_device_id(value))
@@ -174,20 +274,20 @@ def intersection_update(self, *others) -> None:
             keep_ids = ids if keep_ids is None else keep_ids & ids
         if keep_ids is None:
             return  # ``set.intersection_update()`` with no args is a no-op
-        current = set(self._mr._query_peer_access_ids())
+        current = set(_query_peer_access_ids(self._mr))
         to_remove = current - keep_ids
         if to_remove:
             self._apply((), to_remove)
 
     def symmetric_difference_update(self, other) -> None:
         """Toggle peer access for every device in ``other`` in one driver call."""
-        toggle_ids: set[int] = set()
+        toggle_ids = set()
         for value in other:
             try:
                 toggle_ids.add(_resolve_peer_device_id(value))
             except (TypeError, ValueError):
                 continue
-        current = set(self._mr._query_peer_access_ids())
+        current = set(_query_peer_access_ids(self._mr))
         to_add = toggle_ids - current
         to_remove = toggle_ids & current
         if to_add or to_remove:
@@ -217,7 +317,7 @@ def __repr__(self) -> str:
 
     # --- internal: route every write through one batched driver call ---
 
-    def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> None:
+    def _apply(self, additions, removals) -> None:
         """Compute the diff and issue a single ``cuMemPoolSetAccess``.
 
         ``additions`` and ``removals`` are user-supplied (``Device | int``);
@@ -227,9 +327,10 @@ def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> Non
         """
         from cuda.core._device import Device
 
-        owner_id = self._mr._dev_id
+        cdef DeviceMemoryResource mr = <DeviceMemoryResource>self._mr
+        owner_id = mr._dev_id
         owner = Device(owner_id)
-        current = self._mr._query_peer_access_ids()
+        current = _query_peer_access_ids(mr)
 
         # Plan additions through the existing helper (validates can_access_peer).
         plan = plan_peer_access_update(
@@ -245,7 +346,7 @@ def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> Non
 
         # Removals: resolve, drop owner and unknowns, intersect with current.
         current_set = set(current)
-        revoke_ids: set[int] = set()
+        revoke_ids = set()
         for value in removals:
             try:
                 dev_id = _resolve_peer_device_id(value)
@@ -259,4 +360,4 @@ def _apply(self, additions: Iterable[object], removals: Iterable[object]) -> Non
 
         if not to_add and not to_remove:
             return
-        self._mr._apply_peer_access_diff(to_add, to_remove)
+        _apply_peer_access_diff(mr, to_add, to_remove)

From 123e057276abaaea32986e3aa090f52f8be4a71e Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 4 May 2026 16:30:10 -0700
Subject: [PATCH 3/8] cuda.core: tighten peer-access query loop and 1.0.0
 release note

Refactor _query_peer_access_ids so the entire driver loop runs inside a
single nogil block instead of acquiring/releasing the GIL once per
device. The flag query now uses a cached as_cu(mr._h_pool) handle and
fills a libcpp.vector[int]; because range(total) ascends, the result is
already sorted and the trailing sorted() call is dropped.

Also tighten the peer_accessible_by entry in 1.0.0-notes.rst: the
breaking-change blurb only needs to state the type/element change, so
remove the implementation-flavored details about input acceptance and
batched cuMemPoolSetAccess calls.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../cuda/core/_memory/_peer_access_utils.pyx  | 35 ++++++++++++-------
 cuda_core/docs/source/release/1.0.0-notes.rst | 15 ++------
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
index 22f055dcfb6..e613a545b3d 100644
--- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
+++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
@@ -13,6 +13,7 @@ from cuda.core._memory._device_memory_resource cimport DeviceMemoryResource
 from cuda.core._resource_handles cimport as_cu
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
+from libcpp.vector cimport vector
 
 if TYPE_CHECKING:
     from cuda.core._device import Device
@@ -79,26 +80,34 @@ def _resolve_peer_device_id(value):
 # ---- driver-touching helpers (cdef inline, called from .pyx code) -----------
 
 cdef inline tuple _query_peer_access_ids(DeviceMemoryResource mr):
-    """Return the current peer device IDs as a sorted tuple of ints."""
+    """Return the current peer device IDs as a sorted tuple of ints.
+
+    The full driver loop runs inside a single ``nogil`` block. Because
+    ``range(total)`` ascends, the result is already sorted.
+    """
     cdef int total
+    cdef int dev_id
+    cdef int owner_id = mr._dev_id
     cdef cydriver.CUmemAccess_flags flags
     cdef cydriver.CUmemLocation location
-    cdef list peers = []
+    cdef cydriver.CUmemoryPool h_pool = as_cu(mr._h_pool)
+    cdef vector[int] peers
+    cdef size_t i, n
+
+    location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
 
     with nogil:
         HANDLE_RETURN(cydriver.cuDeviceGetCount(&total))
+        for dev_id in range(total):
+            if dev_id == owner_id:
+                continue
+            location.id = dev_id
+            HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, h_pool, &location))
+            if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE:
+                peers.push_back(dev_id)
 
-    location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-    for dev_id in range(total):
-        if dev_id == mr._dev_id:
-            continue
-        location.id = dev_id
-        with nogil:
-            HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, as_cu(mr._h_pool), &location))
-        if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE:
-            peers.append(dev_id)
-
-    return tuple(sorted(peers))
+    n = peers.size()
+    return tuple(peers[i] for i in range(n))
 
 
 cdef inline bint _peer_access_includes(DeviceMemoryResource mr, int dev_id):
diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst
index 5625cfde879..7d0f3cf50b4 100644
--- a/cuda_core/docs/source/release/1.0.0-notes.rst
+++ b/cuda_core/docs/source/release/1.0.0-notes.rst
@@ -113,18 +113,9 @@ Breaking changes
     ``CUgraphConditionalHandle`` value. Previously, ``.handle`` had to be
     extracted explicitly.
 
-- :attr:`DeviceMemoryResource.peer_accessible_by` now returns a live
-  driver-backed :class:`collections.abc.MutableSet` view
-  (:class:`~_memory._peer_access_utils.PeerAccessibleBySetProxy`) instead of a
-  sorted ``tuple[int, ...]``. Iteration yields :obj:`~_device.Device` objects;
-  ``add``, ``discard``, and ``__contains__`` accept either a
-  :obj:`~_device.Device` or a device-ordinal ``int``. Callers comparing against
-  tuples must update to set comparisons (e.g.
-  ``mr.peer_accessible_by == {Device(0)}``). The property setter
-  (``mr.peer_accessible_by = [...]``) is unchanged. Bulk operations on the
-  proxy (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``) each issue
-  exactly one ``cuMemPoolSetAccess`` call so the toolkit can update existing
-  memory mappings in parallel.
+- :attr:`DeviceMemoryResource.peer_accessible_by` now returns a
+  :class:`collections.abc.MutableSet` of :obj:`~_device.Device` objects instead
+  of a sorted ``tuple[int, ...]``. The property setter is unchanged.
 
 
 Fixes and enhancements

From 3e9c5dd7699cadbdec0c20999ebbcd8f6ac16d9e Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 4 May 2026 16:52:42 -0700
Subject: [PATCH 4/8] cuda.core: cover PeerAccessibleBySetProxy interface,
 batching, and edge cases

Existing peer-access tests covered the integration path well (real
copies across peers, the full transition matrix, shared-pool consistency)
but only touched ``in``, ``==``, and the property setter on the new set
proxy. After the v1.0.0 break that surfaced ~25 ``MutableSet`` methods,
nothing was pinning the type-coercion contract, the owner-filtering
behavior, the ``KeyError``/value-error paths, or the "one
``cuMemPoolSetAccess`` per bulk op" performance invariant.

Add the following coverage in ``test_memory_peer_access.py``:

- A ``MutableSet`` conformance test using a relaxed
  ``assert_mutable_set_interface`` mode that admits subjects holding at
  most one insertable element. CI maxes at two GPUs (one peer), so the
  multi-element protocol pass cannot run there. The new
  ``support_multi_insert=False`` path takes one insertable item plus two
  non-member sentinels and exercises every ``MutableSet`` method
  (``add``/``discard``/``remove``/``pop``/``clear``/``update``,
  comparisons, isdisjoint, subset/superset, binary and in-place
  operators, ``__iter__``/``__len__``/``__repr__``).
- ``Device``/``int`` interchangeability on ``add``/``discard``/``__contains__``.
- The owner-device filtering contract on every write (silent no-op).
- Error paths: ``add(out_of_range)`` and ``add(non_coercible)`` raise
  while the lenient ``discard``/``__contains__`` paths swallow the same
  inputs; ``remove(non_member)`` raises ``KeyError``.
- "Live driver view" semantics: a proxy obtained before another wrapper
  modifies the pool reflects the change with no refresh step.
- ``__iter__`` ordering is ascending by ``device_id`` and elements are
  ``Device`` instances; ``__repr__`` includes the class name and tracks
  live contents; the getter returns the documented proxy type.
- A batching spy that monkeypatches the module-level
  ``_apply_peer_access_diff`` and asserts that every bulk op
  (``|=``/``&=``/``-=``/``^=``/``update``/``difference_update``/
  ``clear``) and the property setter issues at most one driver call,
  zero when the diff is empty.

To make the spy possible, ``_apply_peer_access_diff`` is now a
Python-visible ``def`` wrapper around a renamed
``_apply_peer_access_diff_cython`` ``cdef inline``. The proxy and the
property setter still call ``_apply_peer_access_diff`` by bare name,
which Cython resolves through the module's globals at runtime, so a
``monkeypatch.setattr(_peer_access_utils, "_apply_peer_access_diff", ...)``
intercepts them. The extra Python-level dispatch is negligible next to
``cuMemPoolSetAccess`` itself.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../cuda/core/_memory/_peer_access_utils.pyx  |  13 +-
 .../helpers/collection_interface_testers.py   | 197 +++++++++++++-
 cuda_core/tests/test_memory_peer_access.py    | 253 +++++++++++++++++-
 3 files changed, 458 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
index e613a545b3d..b1f92452f4b 100644
--- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
+++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
@@ -122,7 +122,7 @@ cdef inline bint _peer_access_includes(DeviceMemoryResource mr, int dev_id):
     return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
 
 
-cdef inline _apply_peer_access_diff(
+cdef inline _apply_peer_access_diff_cython(
     DeviceMemoryResource mr, tuple to_add, tuple to_remove
 ):
     """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas."""
@@ -156,6 +156,17 @@ cdef inline _apply_peer_access_diff(
             PyMem_Free(access_desc)
 
 
+def _apply_peer_access_diff(mr, to_add, to_remove):
+    """Python-visible wrapper around the cdef inline implementation.
+
+    Every write path on :class:`PeerAccessibleBySetProxy` and the
+    ``peer_accessible_by`` setter route through this function so tests can
+    intercept it via ``monkeypatch.setattr`` on the module to assert the
+    "exactly one ``cuMemPoolSetAccess`` per bulk op" batching contract.
+    """
+    _apply_peer_access_diff_cython(<DeviceMemoryResource>mr, tuple(to_add), tuple(to_remove))
+
+
 cpdef replace_peer_accessible_by(DeviceMemoryResource mr, devices):
     """Replace the full peer-access set in a single batched driver call.
 
diff --git a/cuda_core/tests/helpers/collection_interface_testers.py b/cuda_core/tests/helpers/collection_interface_testers.py
index 5197e475c18..d719c47aa3f 100644
--- a/cuda_core/tests/helpers/collection_interface_testers.py
+++ b/cuda_core/tests/helpers/collection_interface_testers.py
@@ -8,17 +8,50 @@
 import pytest
 
 
-def assert_mutable_set_interface(subject, items):
+def assert_mutable_set_interface(
+    subject,
+    items,
+    *,
+    non_members=None,
+    support_multi_insert=True,
+):
     """Exercise every MutableSet method on *subject* against a reference set.
 
+    Two modes are supported:
+
+    - ``support_multi_insert=True`` (default): the standard protocol pass that
+      inserts up to five distinct elements simultaneously. Use this whenever
+      the subject can hold an arbitrary number of items.
+    - ``support_multi_insert=False``: a reduced pass for proxies whose backing
+      store admits at most one insertable element at a time (e.g. a peer-access
+      view on a 2-GPU system). The subject only ever holds ``{}`` or ``{a}``,
+      and ``non_members`` supplies sentinel values used as the *other* side of
+      comparisons, ``isdisjoint``, subset/superset, and binary/in-place
+      operators. Every ``MutableSet`` method is still exercised at least once.
+
     Parameters
     ----------
     subject : MutableSet
         An **empty** mutable-set-like object to test.
     items : sequence
-        At least five distinct, hashable objects valid for insertion into
-        *subject*.
+        Distinct, hashable objects valid for insertion into *subject*. With
+        ``support_multi_insert=True`` at least five are required; with
+        ``support_multi_insert=False`` exactly one is needed (extras ignored).
+    non_members : sequence, optional
+        Distinct, hashable values that compare equal across set semantics but
+        are guaranteed *never* to be inserted into *subject* (typically because
+        the backing store rejects them). Required and used only when
+        ``support_multi_insert=False``; at least two are needed there.
     """
+    if support_multi_insert:
+        _assert_mutable_set_interface_multi(subject, items)
+    else:
+        if non_members is None:
+            raise TypeError("non_members is required when support_multi_insert=False")
+        _assert_mutable_set_interface_single(subject, items, non_members)
+
+
+def _assert_mutable_set_interface_multi(subject, items):
     assert len(items) >= 5
     a, b, c, d, e = items[:5]
     ref = set()
@@ -140,3 +173,161 @@ def assert_mutable_set_interface(subject, items):
     r = repr(subject)
     assert isinstance(r, str)
     assert len(r) > 0
+
+
+def _assert_mutable_set_interface_single(subject, items, non_members):
+    """Reduced protocol pass for subjects that admit at most one insertable element.
+
+    Invariants:
+    - ``a`` is the lone insertable element; the subject only ever holds ``{}``
+      or ``{a}``.
+    - ``x``, ``y`` are sentinels guaranteed *not* to be inserted; they appear
+      only on the right-hand side of operators and comparisons. They must
+      compare correctly under set semantics (i.e. equality and hashing).
+    """
+    assert len(items) >= 1
+    assert len(non_members) >= 2
+    a = items[0]
+    x, y = non_members[:2]
+    assert a != x and a != y and x != y, "items[0] and non_members[:2] must be distinct"
+
+    ref = set()
+
+    # -- ABC conformance --
+    assert isinstance(subject, Set)
+    assert isinstance(subject, MutableSet)
+
+    # -- empty state --
+    assert len(subject) == 0
+    assert subject == ref
+    assert subject == set()
+    assert list(subject) == []
+
+    # -- add --
+    subject.add(a)
+    ref.add(a)
+    assert subject == ref
+    assert a in subject
+    assert x not in subject
+    assert len(subject) == 1
+
+    # add duplicate is a no-op
+    subject.add(a)
+    assert subject == ref
+    assert len(subject) == 1
+
+    # -- comparison with plain set --
+    assert subject == {a}
+    assert subject != {a, x}
+    assert subject != set()
+
+    # -- isdisjoint --
+    assert subject.isdisjoint({x, y})
+    assert not subject.isdisjoint({a, x})
+
+    # -- subset / superset --
+    assert subject <= {a}
+    assert subject <= {a, x}
+    assert not (subject <= set())
+    assert subject < {a, x}
+    assert not (subject < {a})
+    assert {a, x} >= subject
+    assert {a, x} > subject
+
+    # -- binary operators (results are plain sets, never insert into subject) --
+    assert subject & {a, x} == {a}
+    assert subject & {x} == set()
+    assert subject | {x} == {a, x}
+    assert subject - {a} == set()
+    assert subject - {x} == {a}
+    assert subject ^ {x} == {a, x}
+    assert subject ^ {a} == set()
+
+    # -- discard non-member is a no-op --
+    subject.discard(x)
+    assert subject == ref
+
+    # -- discard member --
+    subject.discard(a)
+    ref.discard(a)
+    assert subject == ref
+
+    # -- remove member --
+    subject.add(a)
+    ref.add(a)
+    subject.remove(a)
+    ref.remove(a)
+    assert subject == ref
+
+    # -- remove non-member raises --
+    with pytest.raises(KeyError):
+        subject.remove(x)
+
+    # -- pop empty raises --
+    with pytest.raises(KeyError):
+        subject.pop()
+
+    # -- pop populated --
+    subject.add(a)
+    ref.add(a)
+    popped = subject.pop()
+    ref.discard(popped)
+    assert popped == a
+    assert popped not in subject
+    assert subject == ref
+
+    # -- in-place union (|=) covers single insert via bulk path --
+    subject |= {a}
+    ref |= {a}
+    assert subject == ref
+
+    # -- in-place intersection (&=) keeps the lone member --
+    subject &= {a, x}
+    ref &= {a, x}
+    assert subject == ref
+
+    # -- in-place intersection (&=) drops the lone member --
+    subject &= {x, y}
+    ref &= {x, y}
+    assert subject == ref
+
+    # -- in-place union via bulk path again, ahead of -= and ^= --
+    subject |= {a}
+    ref |= {a}
+    assert subject == ref
+
+    # -- in-place difference (-=) on non-member is a no-op --
+    subject -= {x}
+    ref -= {x}
+    assert subject == ref
+
+    # -- in-place difference (-=) on member empties the subject --
+    subject -= {a}
+    ref -= {a}
+    assert subject == ref
+
+    # -- in-place symmetric difference (^=): toggle in then out --
+    subject ^= {a}
+    ref ^= {a}
+    assert subject == ref
+    subject ^= {a}
+    ref ^= {a}
+    assert subject == ref
+
+    # -- clear --
+    subject.add(a)
+    ref.add(a)
+    subject.clear()
+    ref.clear()
+    assert subject == ref
+    assert len(subject) == 0
+
+    # -- __iter__ on populated subject --
+    subject.add(a)
+    ref.add(a)
+    assert set(subject) == ref
+
+    # -- __repr__ --
+    r = repr(subject)
+    assert isinstance(r, str)
+    assert len(r) > 0
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index c73ba3ffd1c..94517004a77 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -3,8 +3,11 @@
 
 import pytest
 from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer
+from helpers.collection_interface_testers import assert_mutable_set_interface
 
-from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions
+from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, system
+from cuda.core._memory import _peer_access_utils
+from cuda.core._memory._peer_access_utils import PeerAccessibleBySetProxy
 from cuda.core._utils.cuda_utils import CUDAError
 
 NBYTES = 1024
@@ -137,3 +140,251 @@ def test_peer_access_shared_pool_queries_driver(mempool_device_x2):
     assert dmr1.peer_accessible_by == set()
     assert dmr2.peer_accessible_by == set()
     assert dmr3.peer_accessible_by == set()
+
+
+# ---------------------------------------------------------------------------
+# Set-proxy interface coverage
+#
+# These tests exercise the ``PeerAccessibleBySetProxy`` surface added in
+# v1.0.0. They run against ``mempool_device_x2`` because every CI machine has
+# at most 2 GPUs, which means at most one valid peer device. The relaxed
+# ``support_multi_insert=False`` path on ``assert_mutable_set_interface``
+# threads that single insertable element through the full ``MutableSet``
+# protocol.
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def isolated_dmr_x2(mempool_device_x2):
+    """Owned-pool DMR on dev0 + the lone peer device (dev1).
+
+    The owned pool guarantees a clean, empty initial peer-access state so the
+    proxy tests are not polluted by other tests sharing a default pool.
+    """
+    dev0, dev1 = mempool_device_x2
+    dmr = DeviceMemoryResource(dev0, DeviceMemoryResourceOptions())
+    dmr.peer_accessible_by = []
+    return dmr, dev0, dev1
+
+
+def test_peer_accessible_by_mutable_set_interface(isolated_dmr_x2):
+    """Run the full MutableSet protocol against a single-peer driver-backed view.
+
+    On a 2-GPU box the proxy can only ever hold ``{dev1}``. The relaxed helper
+    uses ``dev0`` (the owner, which the proxy refuses to insert) as the
+    non-member sentinel; this exercises every ``MutableSet`` method while
+    respecting the proxy's "insert at most one" hardware reality.
+    """
+    dmr, dev0, dev1 = isolated_dmr_x2
+    assert_mutable_set_interface(
+        dmr.peer_accessible_by,
+        items=[dev1],
+        non_members=[dev0, dev0.device_id],
+        support_multi_insert=False,
+    )
+
+
+def test_peer_accessible_by_accepts_int_and_device(isolated_dmr_x2):
+    """``add``/``discard``/``__contains__`` accept ``Device`` and ``int`` interchangeably."""
+    dmr, dev0, dev1 = isolated_dmr_x2
+
+    dmr.peer_accessible_by.add(dev1.device_id)
+    assert dmr.peer_accessible_by == {dev1}
+    assert dev1 in dmr.peer_accessible_by
+    assert dev1.device_id in dmr.peer_accessible_by
+
+    dmr.peer_accessible_by.discard(dev1)
+    assert dmr.peer_accessible_by == set()
+
+    dmr.peer_accessible_by.add(dev1)
+    assert dmr.peer_accessible_by == {dev1}
+    dmr.peer_accessible_by.discard(dev1.device_id)
+    assert dmr.peer_accessible_by == set()
+
+
+def test_peer_accessible_by_silently_ignores_owner(isolated_dmr_x2):
+    """The owner device is silently filtered on every write; ``__contains__`` returns False."""
+    dmr, dev0, dev1 = isolated_dmr_x2
+
+    # add/discard on owner is a no-op (no error, no state change)
+    dmr.peer_accessible_by.add(dev0)
+    dmr.peer_accessible_by.add(dev0.device_id)
+    assert dmr.peer_accessible_by == set()
+    dmr.peer_accessible_by.discard(dev0)
+    dmr.peer_accessible_by.discard(dev0.device_id)
+    assert dmr.peer_accessible_by == set()
+
+    # __contains__ on owner is False (matches set semantics, never raises)
+    assert dev0 not in dmr.peer_accessible_by
+    assert dev0.device_id not in dmr.peer_accessible_by
+
+    # Owner mixed into bulk ops is filtered, the peer is still added/removed
+    dmr.peer_accessible_by |= {dev0, dev1}
+    assert dmr.peer_accessible_by == {dev1}
+    dmr.peer_accessible_by -= {dev0, dev1}
+    assert dmr.peer_accessible_by == set()
+
+
+def test_peer_accessible_by_rejects_invalid_inputs(isolated_dmr_x2):
+    """``add`` raises on out-of-range/unsupported inputs; lenient methods do not."""
+    dmr, dev0, dev1 = isolated_dmr_x2
+    bad_id = system.get_num_devices()  # one past the last valid device ordinal
+
+    # add: validates strictly, propagates errors from Device(bad_id)
+    with pytest.raises((ValueError, CUDAError)):
+        dmr.peer_accessible_by.add(bad_id)
+    # Non-coercible inputs surface whatever Device(value) raises (TypeError or
+    # ValueError depending on Cython's int coercion path).
+    with pytest.raises((TypeError, ValueError)):
+        dmr.peer_accessible_by.add("not-a-device")
+
+    # discard: silently ignores non-coercible values (matches set.discard)
+    dmr.peer_accessible_by.discard("not-a-device")
+    assert dmr.peer_accessible_by == set()
+
+    # __contains__: returns False on non-coercible values, never raises
+    assert "not-a-device" not in dmr.peer_accessible_by
+
+    # remove on a non-member raises KeyError (inherited from MutableSet)
+    with pytest.raises(KeyError):
+        dmr.peer_accessible_by.remove(dev1)
+
+
+def test_peer_accessible_by_no_cache_across_proxies(mempool_device_x2):
+    """Updates via one wrapper are immediately visible through any other proxy."""
+    dev0, dev1 = mempool_device_x2
+    dmr_a = DeviceMemoryResource(dev0)
+    dmr_b = DeviceMemoryResource(dev0)
+    dmr_a.peer_accessible_by = []
+
+    proxy = dmr_a.peer_accessible_by  # acquired before the change below
+    dmr_b.peer_accessible_by.add(dev1)
+    # The proxy must reflect the new driver state, not a snapshot.
+    assert dev1 in proxy
+    assert proxy == {dev1}
+
+    dmr_b.peer_accessible_by.clear()
+    assert proxy == set()
+
+
+def test_peer_accessible_by_iteration_order_is_sorted(mempool_device_x2):
+    """``__iter__`` yields peers in ascending device-ordinal order."""
+    dev0, dev1 = mempool_device_x2
+    dmr = DeviceMemoryResource(dev0, DeviceMemoryResourceOptions())
+    dmr.peer_accessible_by = [dev1]
+    devices = list(dmr.peer_accessible_by)
+    ids = [d.device_id for d in devices]
+    assert ids == sorted(ids)
+    assert all(isinstance(d, Device) for d in devices)
+
+
+def test_peer_accessible_by_repr(isolated_dmr_x2):
+    """``repr`` includes the class name and reflects the live contents."""
+    dmr, dev0, dev1 = isolated_dmr_x2
+    empty_repr = repr(dmr.peer_accessible_by)
+    assert "PeerAccessibleBySetProxy" in empty_repr
+    assert "set()" in empty_repr
+
+    dmr.peer_accessible_by.add(dev1)
+    populated_repr = repr(dmr.peer_accessible_by)
+    assert "PeerAccessibleBySetProxy" in populated_repr
+    # Don't pin the exact device repr; just confirm content changed.
+    assert populated_repr != empty_repr
+
+
+def test_peer_accessible_by_returns_proxy_type(isolated_dmr_x2):
+    """The getter returns the documented proxy type (anchors the public contract)."""
+    dmr, dev0, dev1 = isolated_dmr_x2
+    assert isinstance(dmr.peer_accessible_by, PeerAccessibleBySetProxy)
+
+
+# ---------------------------------------------------------------------------
+# Batching contract: every bulk op must issue at most one cuMemPoolSetAccess
+#
+# Spying via ``monkeypatch.setattr`` on the module-level
+# ``_apply_peer_access_diff`` works because the proxy and the property setter
+# call it by bare name, which Cython resolves through the module's globals at
+# runtime (the wrapper is a plain ``def``, not a ``cdef inline``).
+# ---------------------------------------------------------------------------
+
+
+class _DiffSpy:
+    """Counts batched driver calls and forwards each call to the real wrapper."""
+
+    def __init__(self, real):
+        self._real = real
+        self.calls = []
+
+    def __call__(self, mr, to_add, to_remove):
+        # Materialize tuples once so the recorded args reflect what the proxy
+        # passed (they may be sets / lists of mixed Device/int upstream).
+        recorded_add = tuple(to_add)
+        recorded_remove = tuple(to_remove)
+        self.calls.append((recorded_add, recorded_remove))
+        self._real(mr, to_add, to_remove)
+
+
+@pytest.fixture
+def diff_spy(monkeypatch):
+    spy = _DiffSpy(_peer_access_utils._apply_peer_access_diff)
+    monkeypatch.setattr(_peer_access_utils, "_apply_peer_access_diff", spy)
+    return spy
+
+
+def test_peer_accessible_by_setter_batches_one_call(diff_spy, isolated_dmr_x2):
+    """``mr.peer_accessible_by = [...]`` issues exactly one driver call (or zero on no-op)."""
+    dmr, dev0, dev1 = isolated_dmr_x2
+    dmr.peer_accessible_by = [dev1]
+    assert len(diff_spy.calls) == 1
+    assert dev1.device_id in diff_spy.calls[-1][0]
+
+    # Reassigning the same set is a no-op (zero driver calls).
+    diff_spy.calls.clear()
+    dmr.peer_accessible_by = [dev1]
+    assert diff_spy.calls == []
+
+    # Revoking everything is a single call (one removal).
+    dmr.peer_accessible_by = []
+    assert len(diff_spy.calls) == 1
+    assert dev1.device_id in diff_spy.calls[-1][1]
+
+
+def test_peer_accessible_by_bulk_ops_batch_one_call(diff_spy, isolated_dmr_x2):
+    """``|=``, ``&=``, ``-=``, ``^=``, ``clear``, ``update`` each issue at most one call."""
+    dmr, dev0, dev1 = isolated_dmr_x2
+
+    dmr.peer_accessible_by |= {dev1}
+    assert len(diff_spy.calls) == 1
+    diff_spy.calls.clear()
+
+    # &= keeping the lone member: no driver call (no diff).
+    dmr.peer_accessible_by &= {dev1}
+    assert diff_spy.calls == []
+
+    # &= dropping the lone member: one removal.
+    dmr.peer_accessible_by &= {dev0}
+    assert len(diff_spy.calls) == 1
+    diff_spy.calls.clear()
+
+    # ^= toggling the lone peer in then out: two ops, one call each.
+    dmr.peer_accessible_by ^= {dev1}
+    assert len(diff_spy.calls) == 1
+    dmr.peer_accessible_by ^= {dev1}
+    assert len(diff_spy.calls) == 2
+    diff_spy.calls.clear()
+
+    # update() with the peer already absent: one add.
+    dmr.peer_accessible_by.update([dev1])
+    assert len(diff_spy.calls) == 1
+    diff_spy.calls.clear()
+
+    # clear() with one member: one removal.
+    dmr.peer_accessible_by.clear()
+    assert len(diff_spy.calls) == 1
+    diff_spy.calls.clear()
+
+    # Already-empty bulk ops are no-ops (nothing to add or remove).
+    dmr.peer_accessible_by.clear()
+    dmr.peer_accessible_by.difference_update([dev1])
+    dmr.peer_accessible_by -= {dev1}
+    assert diff_spy.calls == []

From 0e4a0832df45dd8f52b4c848ace4e4209048462d Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 4 May 2026 17:04:15 -0700
Subject: [PATCH 5/8] cuda.core: filter empty-delta calls in peer-access
 batching spy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Augmented assignment on the ``peer_accessible_by`` property
(``dmr.peer_accessible_by |= {...}``) is two trips through the
proxy/setter pair, not one: Python fetches the proxy, the proxy mutates
itself in place via ``__ior__``, and Python then assigns the
(already-mutated) proxy back through the setter. That trailing setter
call computes the diff against current driver state, finds it empty,
and short-circuits inside the ``cdef inline`` before issuing any
``cuMemPoolSetAccess`` work — so the *driver-level* contract ("one
batched call per bulk op") still holds, but the wrapper is invoked
twice, which the spy was counting.

Also, the fixture's ``dmr.peer_accessible_by = []`` reset on an already
empty pool is itself an empty-delta wrapper call.

Filter the recorded calls down to those with non-empty deltas (the ones
that translate to real driver work) and switch the bulk-ops test to use
a locally bound proxy so augmented assignment goes through ``__ior__``
once with no extra setter invocation. The setter test stays on
``dmr.peer_accessible_by = ...`` because that is the public API
contract under test there.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cuda_core/tests/test_memory_peer_access.py | 51 +++++++++++++++-------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index 94517004a77..b5e0e47dffe 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -309,18 +309,26 @@ def test_peer_accessible_by_returns_proxy_type(isolated_dmr_x2):
 
 
 class _DiffSpy:
-    """Counts batched driver calls and forwards each call to the real wrapper."""
+    """Counts batched driver calls and forwards each call to the real wrapper.
+
+    Only invocations with non-empty deltas are recorded in :attr:`calls`,
+    because those are the ones that translate to actual ``cuMemPoolSetAccess``
+    work. The wrapper still gets called with empty deltas in a few places
+    (e.g. the property setter when the requested target matches current
+    driver state), but the underlying ``cdef inline`` short-circuits before
+    issuing the driver call, so those invocations do not count against the
+    "one call per bulk op" contract.
+    """
 
     def __init__(self, real):
         self._real = real
         self.calls = []
 
     def __call__(self, mr, to_add, to_remove):
-        # Materialize tuples once so the recorded args reflect what the proxy
-        # passed (they may be sets / lists of mixed Device/int upstream).
         recorded_add = tuple(to_add)
         recorded_remove = tuple(to_remove)
-        self.calls.append((recorded_add, recorded_remove))
+        if recorded_add or recorded_remove:
+            self.calls.append((recorded_add, recorded_remove))
         self._real(mr, to_add, to_remove)
 
 
@@ -350,41 +358,54 @@ def test_peer_accessible_by_setter_batches_one_call(diff_spy, isolated_dmr_x2):
 
 
 def test_peer_accessible_by_bulk_ops_batch_one_call(diff_spy, isolated_dmr_x2):
-    """``|=``, ``&=``, ``-=``, ``^=``, ``clear``, ``update`` each issue at most one call."""
+    """``|=``/``&=``/``-=``/``^=``/``update``/``clear`` each issue at most one driver call.
+
+    .. note::
+       ``proxy = dmr.peer_accessible_by; proxy |= {...}`` is exactly one driver
+       call. Augmented assignment directly on the property
+       (``dmr.peer_accessible_by |= {...}``) is two operations because Python
+       fetches the proxy, mutates it, and then assigns the (already-mutated)
+       proxy back through the setter; that final assignment is a no-op at the
+       driver level (deltas come out empty), but it is two trips through the
+       proxy/setter machinery. The spy filters out empty-delta calls so the
+       invariant under test is "actual driver work" rather than that
+       Python-level quirk.
+    """
     dmr, dev0, dev1 = isolated_dmr_x2
+    proxy = dmr.peer_accessible_by  # use the local-binding form for predictable counting
 
-    dmr.peer_accessible_by |= {dev1}
+    proxy |= {dev1}
     assert len(diff_spy.calls) == 1
     diff_spy.calls.clear()
 
     # &= keeping the lone member: no driver call (no diff).
-    dmr.peer_accessible_by &= {dev1}
+    proxy &= {dev1}
     assert diff_spy.calls == []
 
     # &= dropping the lone member: one removal.
-    dmr.peer_accessible_by &= {dev0}
+    proxy &= {dev0}
     assert len(diff_spy.calls) == 1
     diff_spy.calls.clear()
 
     # ^= toggling the lone peer in then out: two ops, one call each.
-    dmr.peer_accessible_by ^= {dev1}
+    proxy ^= {dev1}
     assert len(diff_spy.calls) == 1
-    dmr.peer_accessible_by ^= {dev1}
+    proxy ^= {dev1}
     assert len(diff_spy.calls) == 2
     diff_spy.calls.clear()
 
     # update() with the peer already absent: one add.
-    dmr.peer_accessible_by.update([dev1])
+    proxy.update([dev1])
     assert len(diff_spy.calls) == 1
     diff_spy.calls.clear()
 
     # clear() with one member: one removal.
-    dmr.peer_accessible_by.clear()
+    proxy.clear()
     assert len(diff_spy.calls) == 1
     diff_spy.calls.clear()
 
     # Already-empty bulk ops are no-ops (nothing to add or remove).
-    dmr.peer_accessible_by.clear()
-    dmr.peer_accessible_by.difference_update([dev1])
-    dmr.peer_accessible_by -= {dev1}
+    proxy.clear()
+    proxy.difference_update([dev1])
+    proxy -= {dev1}
     assert diff_spy.calls == []

From e89c19bf286381921b62b6e0d61a84353d6c3227 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 4 May 2026 17:12:21 -0700
Subject: [PATCH 6/8] cuda.core: spy on the driver call directly to assert
 peer-access batching

Move the actual ``cuMemPoolSetAccess`` invocation (descriptor-array
build + driver call) into a thin Python-visible ``def _set_pool_access``
in ``_peer_access_utils.pyx``. ``_apply_peer_access_diff`` now does only
the empty-diff short-circuit and delegates the work to
``_set_pool_access``, which Cython resolves through the module globals
at runtime so tests can intercept it via ``monkeypatch.setattr``.

Replace the previous internal-wrapper spy with a driver-call spy that
counts every real ``cuMemPoolSetAccess`` invocation. Earlier no-op
layers (e.g. the augmented-assignment-on-property pattern that writes
an already-mutated proxy back through the setter) short-circuit before
reaching ``_set_pool_access``, so the recorded count is exactly the
number of driver calls. The empty-delta filter and the local-binding
workaround in the bulk-ops test are gone; we now also assert that
``dmr.peer_accessible_by |= {...}`` directly on the property is still
exactly one driver call.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../cuda/core/_memory/_peer_access_utils.pyx  |  35 +++---
 cuda_core/tests/test_memory_peer_access.py    | 107 +++++++++---------
 2 files changed, 76 insertions(+), 66 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
index b1f92452f4b..ed26a4f0f2a 100644
--- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
+++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
@@ -122,17 +122,22 @@ cdef inline bint _peer_access_includes(DeviceMemoryResource mr, int dev_id):
     return flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
 
 
-cdef inline _apply_peer_access_diff_cython(
-    DeviceMemoryResource mr, tuple to_add, tuple to_remove
-):
-    """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas."""
+def _set_pool_access(mr, tuple to_add, tuple to_remove):
+    """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas.
+
+    The thin Python-callable layer that wraps the actual driver call: building
+    the ``CUmemAccessDesc`` array and invoking ``cuMemPoolSetAccess`` happens
+    in here. Tests monkeypatch this on the module to spy on real driver work
+    without intercepting earlier no-op paths.
+
+    Preconditions: ``len(to_add) + len(to_remove) > 0`` (the caller is
+    responsible for skipping empty diffs).
+    """
+    cdef DeviceMemoryResource mr_typed = <DeviceMemoryResource>mr
     cdef size_t count = len(to_add) + len(to_remove)
     cdef cydriver.CUmemAccessDesc* access_desc = NULL
     cdef size_t i = 0
 
-    if count == 0:
-        return
-
     access_desc = <cydriver.CUmemAccessDesc*>PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc))
     if access_desc == NULL:
         raise MemoryError("Failed to allocate memory for access descriptors")
@@ -150,21 +155,25 @@ cdef inline _apply_peer_access_diff_cython(
             i += 1
 
         with nogil:
-            HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(mr._h_pool), access_desc, count))
+            HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(mr_typed._h_pool), access_desc, count))
     finally:
         if access_desc != NULL:
             PyMem_Free(access_desc)
 
 
 def _apply_peer_access_diff(mr, to_add, to_remove):
-    """Python-visible wrapper around the cdef inline implementation.
+    """Apply a peer-access diff in at most one driver call.
 
     Every write path on :class:`PeerAccessibleBySetProxy` and the
-    ``peer_accessible_by`` setter route through this function so tests can
-    intercept it via ``monkeypatch.setattr`` on the module to assert the
-    "exactly one ``cuMemPoolSetAccess`` per bulk op" batching contract.
+    ``peer_accessible_by`` setter routes through this function. Empty diffs
+    short-circuit here so the driver-level helper :func:`_set_pool_access` is
+    only invoked when there is actual work for ``cuMemPoolSetAccess`` to do.
     """
-    _apply_peer_access_diff_cython(<DeviceMemoryResource>mr, tuple(to_add), tuple(to_remove))
+    add_tuple = tuple(to_add)
+    remove_tuple = tuple(to_remove)
+    if not add_tuple and not remove_tuple:
+        return
+    _set_pool_access(mr, add_tuple, remove_tuple)
 
 
 cpdef replace_peer_accessible_by(DeviceMemoryResource mr, devices):
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index b5e0e47dffe..4ec24eabfc7 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -308,16 +308,15 @@ def test_peer_accessible_by_returns_proxy_type(isolated_dmr_x2):
 # ---------------------------------------------------------------------------
 
 
-class _DiffSpy:
-    """Counts batched driver calls and forwards each call to the real wrapper.
-
-    Only invocations with non-empty deltas are recorded in :attr:`calls`,
-    because those are the ones that translate to actual ``cuMemPoolSetAccess``
-    work. The wrapper still gets called with empty deltas in a few places
-    (e.g. the property setter when the requested target matches current
-    driver state), but the underlying ``cdef inline`` short-circuits before
-    issuing the driver call, so those invocations do not count against the
-    "one call per bulk op" contract.
+class _DriverCallSpy:
+    """Records every actual ``cuMemPoolSetAccess`` invocation.
+
+    Spies on :func:`cuda.core._memory._peer_access_utils._set_pool_access` —
+    the thin Python-visible wrapper that builds the descriptor array and
+    issues the single driver call. Earlier no-op layers (e.g. the
+    augmented-assignment-on-property quirk that reassigns an already-mutated
+    proxy back through the setter) short-circuit before reaching here, so the
+    recorded count is exactly the number of real ``cuMemPoolSetAccess`` calls.
     """
 
     def __init__(self, real):
@@ -325,87 +324,89 @@ def __init__(self, real):
         self.calls = []
 
     def __call__(self, mr, to_add, to_remove):
-        recorded_add = tuple(to_add)
-        recorded_remove = tuple(to_remove)
-        if recorded_add or recorded_remove:
-            self.calls.append((recorded_add, recorded_remove))
+        self.calls.append((tuple(to_add), tuple(to_remove)))
         self._real(mr, to_add, to_remove)
 
 
 @pytest.fixture
-def diff_spy(monkeypatch):
-    spy = _DiffSpy(_peer_access_utils._apply_peer_access_diff)
-    monkeypatch.setattr(_peer_access_utils, "_apply_peer_access_diff", spy)
+def driver_spy(monkeypatch):
+    spy = _DriverCallSpy(_peer_access_utils._set_pool_access)
+    monkeypatch.setattr(_peer_access_utils, "_set_pool_access", spy)
     return spy
 
 
-def test_peer_accessible_by_setter_batches_one_call(diff_spy, isolated_dmr_x2):
+def test_peer_accessible_by_setter_batches_one_call(driver_spy, isolated_dmr_x2):
     """``mr.peer_accessible_by = [...]`` issues exactly one driver call (or zero on no-op)."""
     dmr, dev0, dev1 = isolated_dmr_x2
     dmr.peer_accessible_by = [dev1]
-    assert len(diff_spy.calls) == 1
-    assert dev1.device_id in diff_spy.calls[-1][0]
+    assert len(driver_spy.calls) == 1
+    assert dev1.device_id in driver_spy.calls[-1][0]
 
     # Reassigning the same set is a no-op (zero driver calls).
-    diff_spy.calls.clear()
+    driver_spy.calls.clear()
     dmr.peer_accessible_by = [dev1]
-    assert diff_spy.calls == []
+    assert driver_spy.calls == []
 
     # Revoking everything is a single call (one removal).
     dmr.peer_accessible_by = []
-    assert len(diff_spy.calls) == 1
-    assert dev1.device_id in diff_spy.calls[-1][1]
-
-
-def test_peer_accessible_by_bulk_ops_batch_one_call(diff_spy, isolated_dmr_x2):
-    """``|=``/``&=``/``-=``/``^=``/``update``/``clear`` each issue at most one driver call.
-
-    .. note::
-       ``proxy = dmr.peer_accessible_by; proxy |= {...}`` is exactly one driver
-       call. Augmented assignment directly on the property
-       (``dmr.peer_accessible_by |= {...}``) is two operations because Python
-       fetches the proxy, mutates it, and then assigns the (already-mutated)
-       proxy back through the setter; that final assignment is a no-op at the
-       driver level (deltas come out empty), but it is two trips through the
-       proxy/setter machinery. The spy filters out empty-delta calls so the
-       invariant under test is "actual driver work" rather than that
-       Python-level quirk.
+    assert len(driver_spy.calls) == 1
+    assert dev1.device_id in driver_spy.calls[-1][1]
+
+
+def test_peer_accessible_by_bulk_ops_batch_one_call(driver_spy, isolated_dmr_x2):
+    """Every bulk op issues exactly one ``cuMemPoolSetAccess`` (or zero on no-op).
+
+    Covers ``|=``, ``&=``, ``^=``, ``update``, ``difference_update``, and
+    ``clear``. Both operand styles are exercised: a locally bound proxy
+    (``proxy |= {...}``) and augmented assignment directly on the property
+    (``dmr.peer_accessible_by |= {...}``). The latter trips Python's
+    augmented-assignment-on-property pattern (fetch proxy, mutate, write
+    back through setter) but the trailing setter call discovers an empty
+    diff and short-circuits before reaching the driver, so the count is
+    still one.
     """
     dmr, dev0, dev1 = isolated_dmr_x2
-    proxy = dmr.peer_accessible_by  # use the local-binding form for predictable counting
+    proxy = dmr.peer_accessible_by
 
     proxy |= {dev1}
-    assert len(diff_spy.calls) == 1
-    diff_spy.calls.clear()
+    assert len(driver_spy.calls) == 1
+    driver_spy.calls.clear()
 
     # &= keeping the lone member: no driver call (no diff).
     proxy &= {dev1}
-    assert diff_spy.calls == []
+    assert driver_spy.calls == []
 
     # &= dropping the lone member: one removal.
     proxy &= {dev0}
-    assert len(diff_spy.calls) == 1
-    diff_spy.calls.clear()
+    assert len(driver_spy.calls) == 1
+    driver_spy.calls.clear()
 
     # ^= toggling the lone peer in then out: two ops, one call each.
     proxy ^= {dev1}
-    assert len(diff_spy.calls) == 1
+    assert len(driver_spy.calls) == 1
     proxy ^= {dev1}
-    assert len(diff_spy.calls) == 2
-    diff_spy.calls.clear()
+    assert len(driver_spy.calls) == 2
+    driver_spy.calls.clear()
 
     # update() with the peer already absent: one add.
     proxy.update([dev1])
-    assert len(diff_spy.calls) == 1
-    diff_spy.calls.clear()
+    assert len(driver_spy.calls) == 1
+    driver_spy.calls.clear()
 
     # clear() with one member: one removal.
     proxy.clear()
-    assert len(diff_spy.calls) == 1
-    diff_spy.calls.clear()
+    assert len(driver_spy.calls) == 1
+    driver_spy.calls.clear()
 
     # Already-empty bulk ops are no-ops (nothing to add or remove).
     proxy.clear()
     proxy.difference_update([dev1])
     proxy -= {dev1}
-    assert diff_spy.calls == []
+    assert driver_spy.calls == []
+
+    # Augmented assignment directly on the property is also one driver call:
+    # the proxy mutates the pool via __ior__, the setter writes back an
+    # already-up-to-date proxy, and the empty-diff short-circuit prevents a
+    # second driver call.
+    dmr.peer_accessible_by |= {dev1}
+    assert len(driver_spy.calls) == 1

From 774acf159949ce8c0182f8841d9b3144e29918bd Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Mon, 4 May 2026 17:20:31 -0700
Subject: [PATCH 7/8] cuda.core: split MutableSet conformance helpers by
 capacity

Replace the ``support_multi_insert`` flag and ``non_members`` keyword
with two purpose-built helpers:

- ``assert_mutable_set_interface(subject, items)`` keeps the original
  signature and contract: at least five distinct insertable items,
  exercised against a reference set in the standard way. The graph
  ``AdjacencySetProxy`` test continues to use this unchanged.
- ``assert_single_member_mutable_set_interface(subject, member,
  non_member)`` is a focused pass for proxies whose backing store admits
  at most one insertable element at a time (here, the peer-access view
  on a 2-GPU box). It threads a single member and one non-member
  sentinel through every ``MutableSet`` method.

The two helpers share small private utilities (empty-state checks,
``__repr__`` shape) but keep their public surfaces small and linear.
A capacity-one proxy is a meaningfully different contract from a
general mutable set; naming that explicitly in the API reads better
than a flag and avoids forcing call sites to plumb sentinels through.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../helpers/collection_interface_testers.py   | 151 ++++++++----------
 cuda_core/tests/test_memory_peer_access.py    |  19 ++-
 2 files changed, 77 insertions(+), 93 deletions(-)

diff --git a/cuda_core/tests/helpers/collection_interface_testers.py b/cuda_core/tests/helpers/collection_interface_testers.py
index d719c47aa3f..63fbed8b381 100644
--- a/cuda_core/tests/helpers/collection_interface_testers.py
+++ b/cuda_core/tests/helpers/collection_interface_testers.py
@@ -1,70 +1,63 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Reusable helpers to verify collections.abc protocol conformance."""
+"""Reusable helpers to verify collections.abc protocol conformance.
+
+Two helpers are provided for ``MutableSet``-like subjects, picked by the
+capacity of the backing store:
+
+- :func:`assert_mutable_set_interface` is the standard pass; it requires at
+  least five distinct insertable items so every method (including the
+  multi-element bulk operators) can be exercised.
+- :func:`assert_single_member_mutable_set_interface` is a focused pass for
+  proxies whose backing store admits at most one insertable element at a time
+  (for example, a peer-access view on a system with one valid peer device).
+  It runs every ``MutableSet`` method at least once using a single member and
+  one non-member sentinel.
+
+The two helpers are intentionally separate rather than one helper with a
+mode flag: a single-member proxy is a substantially different contract
+("capacity one, by hardware") and naming it explicitly in the API keeps each
+helper's signature small and its assertions linear.
+"""
 
 from collections.abc import MutableSet, Set
 
 import pytest
 
 
-def assert_mutable_set_interface(
-    subject,
-    items,
-    *,
-    non_members=None,
-    support_multi_insert=True,
-):
-    """Exercise every MutableSet method on *subject* against a reference set.
+def _assert_empty(subject):
+    """Assertions that hold for any empty MutableSet-like subject."""
+    assert isinstance(subject, Set)
+    assert isinstance(subject, MutableSet)
+    assert len(subject) == 0
+    assert subject == set()
+    assert list(subject) == []
+
+
+def _assert_repr_nonempty(subject):
+    """``__repr__`` produces a non-empty string."""
+    r = repr(subject)
+    assert isinstance(r, str)
+    assert len(r) > 0
 
-    Two modes are supported:
 
-    - ``support_multi_insert=True`` (default): the standard protocol pass that
-      inserts up to five distinct elements simultaneously. Use this whenever
-      the subject can hold an arbitrary number of items.
-    - ``support_multi_insert=False``: a reduced pass for proxies whose backing
-      store admits at most one insertable element at a time (e.g. a peer-access
-      view on a 2-GPU system). The subject only ever holds ``{}`` or ``{a}``,
-      and ``non_members`` supplies sentinel values used as the *other* side of
-      comparisons, ``isdisjoint``, subset/superset, and binary/in-place
-      operators. Every ``MutableSet`` method is still exercised at least once.
+def assert_mutable_set_interface(subject, items):
+    """Exercise every MutableSet method on *subject* against a reference set.
 
     Parameters
     ----------
     subject : MutableSet
         An **empty** mutable-set-like object to test.
     items : sequence
-        Distinct, hashable objects valid for insertion into *subject*. With
-        ``support_multi_insert=True`` at least five are required; with
-        ``support_multi_insert=False`` exactly one is needed (extras ignored).
-    non_members : sequence, optional
-        Distinct, hashable values that compare equal across set semantics but
-        are guaranteed *never* to be inserted into *subject* (typically because
-        the backing store rejects them). Required and used only when
-        ``support_multi_insert=False``; at least two are needed there.
+        At least five distinct, hashable objects valid for insertion into
+        *subject*.
     """
-    if support_multi_insert:
-        _assert_mutable_set_interface_multi(subject, items)
-    else:
-        if non_members is None:
-            raise TypeError("non_members is required when support_multi_insert=False")
-        _assert_mutable_set_interface_single(subject, items, non_members)
-
-
-def _assert_mutable_set_interface_multi(subject, items):
     assert len(items) >= 5
     a, b, c, d, e = items[:5]
     ref = set()
 
-    # -- ABC conformance --
-    assert isinstance(subject, Set)
-    assert isinstance(subject, MutableSet)
-
-    # -- empty state --
-    assert len(subject) == 0
-    assert subject == ref
-    assert subject == set()
-    assert list(subject) == []
+    _assert_empty(subject)
 
     # -- add --
     subject.add(a)
@@ -169,39 +162,37 @@ def _assert_mutable_set_interface_multi(subject, items):
     # -- __iter__ --
     assert set(subject) == ref
 
-    # -- __repr__ --
-    r = repr(subject)
-    assert isinstance(r, str)
-    assert len(r) > 0
+    _assert_repr_nonempty(subject)
 
 
-def _assert_mutable_set_interface_single(subject, items, non_members):
-    """Reduced protocol pass for subjects that admit at most one insertable element.
+def assert_single_member_mutable_set_interface(subject, member, non_member):
+    """Exercise every MutableSet method on a subject with capacity one.
 
-    Invariants:
-    - ``a`` is the lone insertable element; the subject only ever holds ``{}``
-      or ``{a}``.
-    - ``x``, ``y`` are sentinels guaranteed *not* to be inserted; they appear
-      only on the right-hand side of operators and comparisons. They must
-      compare correctly under set semantics (i.e. equality and hashing).
-    """
-    assert len(items) >= 1
-    assert len(non_members) >= 2
-    a = items[0]
-    x, y = non_members[:2]
-    assert a != x and a != y and x != y, "items[0] and non_members[:2] must be distinct"
+    Use this for proxies whose backing store admits at most one insertable
+    element at a time (typically because the underlying resource is bounded
+    by hardware, e.g. a peer-access view on a system with a single valid
+    peer device). The subject only ever holds ``set()`` or ``{member}``;
+    *non_member* supplies the right-hand side of comparisons, ``isdisjoint``,
+    subset/superset, and binary/in-place operators so every ``MutableSet``
+    method is exercised at least once.
 
+    Parameters
+    ----------
+    subject : MutableSet
+        An **empty** mutable-set-like object to test.
+    member : hashable
+        A distinct, hashable object valid for insertion into *subject*.
+    non_member : hashable
+        A distinct, hashable object that compares correctly under set
+        semantics but is guaranteed never to be inserted into *subject*
+        (typically because the backing store rejects it).
+    """
+    assert member != non_member, "member and non_member must be distinct"
+    a = member
+    x = non_member
     ref = set()
 
-    # -- ABC conformance --
-    assert isinstance(subject, Set)
-    assert isinstance(subject, MutableSet)
-
-    # -- empty state --
-    assert len(subject) == 0
-    assert subject == ref
-    assert subject == set()
-    assert list(subject) == []
+    _assert_empty(subject)
 
     # -- add --
     subject.add(a)
@@ -222,7 +213,7 @@ def _assert_mutable_set_interface_single(subject, items, non_members):
     assert subject != set()
 
     # -- isdisjoint --
-    assert subject.isdisjoint({x, y})
+    assert subject.isdisjoint({x})
     assert not subject.isdisjoint({a, x})
 
     # -- subset / superset --
@@ -287,16 +278,13 @@ def _assert_mutable_set_interface_single(subject, items, non_members):
     assert subject == ref
 
     # -- in-place intersection (&=) drops the lone member --
-    subject &= {x, y}
-    ref &= {x, y}
+    subject &= {x}
+    ref &= {x}
     assert subject == ref
 
-    # -- in-place union via bulk path again, ahead of -= and ^= --
+    # -- in-place difference (-=) on non-member is a no-op --
     subject |= {a}
     ref |= {a}
-    assert subject == ref
-
-    # -- in-place difference (-=) on non-member is a no-op --
     subject -= {x}
     ref -= {x}
     assert subject == ref
@@ -327,7 +315,4 @@ def _assert_mutable_set_interface_single(subject, items, non_members):
     ref.add(a)
     assert set(subject) == ref
 
-    # -- __repr__ --
-    r = repr(subject)
-    assert isinstance(r, str)
-    assert len(r) > 0
+    _assert_repr_nonempty(subject)
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index 4ec24eabfc7..70add843a90 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -3,7 +3,7 @@
 
 import pytest
 from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer
-from helpers.collection_interface_testers import assert_mutable_set_interface
+from helpers.collection_interface_testers import assert_single_member_mutable_set_interface
 
 from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, system
 from cuda.core._memory import _peer_access_utils
@@ -168,19 +168,18 @@ def isolated_dmr_x2(mempool_device_x2):
 
 
 def test_peer_accessible_by_mutable_set_interface(isolated_dmr_x2):
-    """Run the full MutableSet protocol against a single-peer driver-backed view.
+    """Run the MutableSet protocol against a single-peer driver-backed view.
 
-    On a 2-GPU box the proxy can only ever hold ``{dev1}``. The relaxed helper
-    uses ``dev0`` (the owner, which the proxy refuses to insert) as the
-    non-member sentinel; this exercises every ``MutableSet`` method while
-    respecting the proxy's "insert at most one" hardware reality.
+    On a 2-GPU box the proxy can only ever hold ``{dev1}`` because there is a
+    single valid peer. The capacity-one helper exercises every ``MutableSet``
+    method using ``dev1`` as the lone insertable element and ``dev0`` (the
+    owner, which the proxy refuses to insert) as the non-member sentinel.
     """
     dmr, dev0, dev1 = isolated_dmr_x2
-    assert_mutable_set_interface(
+    assert_single_member_mutable_set_interface(
         dmr.peer_accessible_by,
-        items=[dev1],
-        non_members=[dev0, dev0.device_id],
-        support_multi_insert=False,
+        member=dev1,
+        non_member=dev0,
     )
 
 

From 3233538afb57e031cc0337d1bf11ea540b34efd9 Mon Sep 17 00:00:00 2001
From: Andy Jost <ajost@nvidia.com>
Date: Wed, 6 May 2026 13:41:59 -0700
Subject: [PATCH 8/8] Address review feedback on peer_accessible_by proxy

- Optimize add/discard to check the single target device (1 driver
  call) instead of scanning all devices via _query_peer_access_ids
- Add test for out-of-range int in __contains__ (returns False)
- Fix stale comment referencing removed support_multi_insert flag
- Add #2018 link to release notes entry

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../cuda/core/_memory/_peer_access_utils.pyx   | 18 ++++++++++++++++--
 cuda_core/docs/source/release/1.0.0-notes.rst  |  1 +
 cuda_core/tests/test_memory_peer_access.py     | 10 ++++++----
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
index ed26a4f0f2a..8086aaff170 100644
--- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
+++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx
@@ -252,7 +252,16 @@ class PeerAccessibleBySetProxy(MutableSet):
 
     def add(self, value) -> None:
         """Grant peer access from ``value`` to allocations in this pool."""
-        self._apply([value], ())
+        dev_id = _resolve_peer_device_id(value)
+        cdef DeviceMemoryResource mr = <DeviceMemoryResource>self._mr
+        if dev_id == mr._dev_id:
+            return
+        if _peer_access_includes(mr, dev_id):
+            return
+        from cuda.core._device import Device
+        if not Device(mr._dev_id).can_access_peer(dev_id):
+            raise ValueError(f"Device {mr._dev_id} cannot access peer: {dev_id}")
+        _apply_peer_access_diff(mr, (dev_id,), ())
 
     def discard(self, value) -> None:
         """Revoke peer access from ``value`` to allocations in this pool."""
@@ -260,7 +269,12 @@ class PeerAccessibleBySetProxy(MutableSet):
             dev_id = _resolve_peer_device_id(value)
         except (TypeError, ValueError):
             return
-        self._apply((), [dev_id])
+        cdef DeviceMemoryResource mr = <DeviceMemoryResource>self._mr
+        if dev_id == mr._dev_id:
+            return
+        if not _peer_access_includes(mr, dev_id):
+            return
+        _apply_peer_access_diff(mr, (), (dev_id,))
 
     # --- bulk overrides: one driver call per op ---
 
diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst
index 9b7bd65ff05..747b033c117 100644
--- a/cuda_core/docs/source/release/1.0.0-notes.rst
+++ b/cuda_core/docs/source/release/1.0.0-notes.rst
@@ -123,6 +123,7 @@ Breaking changes
 - :attr:`DeviceMemoryResource.peer_accessible_by` now returns a
   :class:`collections.abc.MutableSet` of :obj:`~_device.Device` objects instead
   of a sorted ``tuple[int, ...]``. The property setter is unchanged.
+  (`#2018 <https://github.com/NVIDIA/cuda-python/pull/2018>`__)
 
 - ``stream`` is now a required keyword-only argument on APIs that schedule
   work on a stream
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index 1002d96ad0b..71beb459143 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -147,10 +147,9 @@ def test_peer_access_shared_pool_queries_driver(mempool_device_x2):
 #
 # These tests exercise the ``PeerAccessibleBySetProxy`` surface added in
 # v1.0.0. They run against ``mempool_device_x2`` because every CI machine has
-# at most 2 GPUs, which means at most one valid peer device. The relaxed
-# ``support_multi_insert=False`` path on ``assert_mutable_set_interface``
-# threads that single insertable element through the full ``MutableSet``
-# protocol.
+# at most 2 GPUs, which means at most one valid peer device. The
+# ``assert_single_member_mutable_set_interface`` helper threads that single
+# insertable element through the full ``MutableSet`` protocol.
 # ---------------------------------------------------------------------------
 
 
@@ -244,6 +243,9 @@ def test_peer_accessible_by_rejects_invalid_inputs(isolated_dmr_x2):
     # __contains__: returns False on non-coercible values, never raises
     assert "not-a-device" not in dmr.peer_accessible_by
 
+    # __contains__: out-of-range int returns False, never raises
+    assert bad_id not in dmr.peer_accessible_by
+
     # remove on a non-member raises KeyError (inherited from MutableSet)
     with pytest.raises(KeyError):
         dmr.peer_accessible_by.remove(dev1)