From abdec47f2c3de514a02d14f08fffe3fc097ed729 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 16 Mar 2026 17:37:49 -0700 Subject: [PATCH 01/68] wip --- cuda_core/cuda/core/_memory/_buffer.pxd | 1 + cuda_core/cuda/core/_memory/_buffer.pyx | 284 ++++++++++++++++++ cuda_core/docs/source/release/0.7.x-notes.rst | 5 + cuda_core/tests/test_memory.py | 127 ++++++++ 4 files changed, 417 insertions(+) diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 91c0cfe24af..04b5707e18e 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -12,6 +12,7 @@ cdef struct _MemAttrs: int device_id bint is_device_accessible bint is_host_accessible + bint is_managed cdef class Buffer: diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 83009f74aed..686585b5276 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -72,6 +72,194 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`. """ + +cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( + "device", + "host", + "host_numa", + "host_numa_current", +) + +cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { + "device": "CU_MEM_LOCATION_TYPE_DEVICE", + "host": "CU_MEM_LOCATION_TYPE_HOST", + "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", + "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", +} + +cdef dict _MANAGED_ADVICE_ALIASES = { + "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "cu_mem_advise_set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", + "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "cu_mem_advise_unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "cu_mem_advise_set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "cu_mem_advise_unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "cu_mem_advise_set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY", + "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", + "cu_mem_advise_unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY", +} + +cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset(( + "set_read_mostly", + "unset_read_mostly", + "unset_preferred_location", +)) + +cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset(( + "set_accessed_by", + "unset_accessed_by", +)) + + +cdef inline object _managed_location_enum(str location_type): + cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] + if not hasattr(driver.CUmemLocationType, attr_name): + raise RuntimeError( + f"Managed-memory location type {location_type!r} is not supported by the " + f"installed cuda.bindings package." 
+ ) + return getattr(driver.CUmemLocationType, attr_name) + + +cdef inline object _make_managed_location(str location_type, int location_id): + cdef object location = driver.CUmemLocation() + location.type = _managed_location_enum(location_type) + if location_type == "host": + location.id = int(getattr(driver, "CU_DEVICE_CPU", -1)) + elif location_type == "host_numa_current": + location.id = 0 + else: + location.id = location_id + return location + + +cdef inline tuple _normalize_managed_advice(object advice): + cdef str alias + cdef str attr_name + if isinstance(advice, str): + alias = advice.lower() + attr_name = _MANAGED_ADVICE_ALIASES.get(alias) + if attr_name is None: + raise ValueError( + "advice must be one of " + f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}" + ) + return alias, getattr(driver.CUmem_advise, attr_name) + + if isinstance(advice, driver.CUmem_advise): + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): + if alias.startswith("cu_mem_advise_"): + continue + if advice == getattr(driver.CUmem_advise, attr_name): + return alias, advice + raise ValueError(f"Unsupported advice value: {advice!r}") + + raise TypeError( + "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias" + ) + + +cdef inline object _normalize_managed_location( + object location, + object location_type, + str what, + bint allow_none=False, + bint allow_host=True, + bint allow_host_numa=True, + bint allow_host_numa_current=True, +): + cdef object loc_type + cdef int loc_id + + if isinstance(location, Device): + location = (location).device_id + + if location_type is not None and not isinstance(location_type, str): + raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") + + loc_type = None if location_type is None else (location_type).lower() + if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: + raise ValueError( + f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} " + f"or None, got {location_type!r}" + ) + + if loc_type is None: + if location is None: + if allow_none: + return _make_managed_location("host", -1) + raise ValueError(f"{what} requires a location") + if not isinstance(location, int): + raise TypeError( + f"{what} location must be a Device, int, or None, got {type(location).__name__}" + ) + loc_id = location + if loc_id == -1: + loc_type = "host" + elif loc_id >= 0: + loc_type = "device" + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" + ) + elif loc_type == "device": + if isinstance(location, int) and location >= 0: + loc_id = location + else: + raise ValueError( + f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" + ) + return _make_managed_location(loc_type, loc_id) + elif loc_type == "host": + if location not in (None, -1): + raise ValueError( + f"{what} location must be None or -1 when location_type is 'host', got {location!r}" + ) + if not allow_host: + raise ValueError(f"{what} does not support location_type='host'") + return _make_managed_location(loc_type, -1) + elif loc_type == "host_numa": + if not allow_host_numa: + raise ValueError(f"{what} does not support location_type='host_numa'") + if not isinstance(location, int) or location < 0: + raise ValueError( + f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" + ) + return _make_managed_location(loc_type, location) + else: 
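+        # Only "host_numa_current" can reach this branch; the driver resolves the
+        # calling thread's NUMA node itself, so no explicit location ID is accepted.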
+ if not allow_host_numa_current: + raise ValueError(f"{what} does not support location_type='host_numa_current'") + if location is not None: + raise ValueError( + f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" + ) + return _make_managed_location(loc_type, 0) + + if loc_type == "host" and not allow_host: + raise ValueError(f"{what} does not support host locations") + if loc_type == "host_numa" and not allow_host_numa: + raise ValueError(f"{what} does not support location_type='host_numa'") + if loc_type == "host_numa_current" and not allow_host_numa_current: + raise ValueError(f"{what} does not support location_type='host_numa_current'") + return _make_managed_location(loc_type, loc_id) + + +cdef inline void _require_managed_buffer(Buffer self, str what): + _init_mem_attrs(self) + if not self._mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory buffer") + + +cdef inline void _require_managed_discard_prefetch_support(): + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + raise RuntimeError( + "Buffer.discard_prefetch requires cuda.bindings support for " + "cuMemDiscardAndPrefetchBatchAsync" + ) + cdef class Buffer: """Represent a handle to allocated memory. @@ -293,6 +481,99 @@ cdef class Buffer: finally: PyBuffer_Release(&buf) + def advise( + self, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + location_type: str | None = None, + ): + """Apply a managed-memory advice to this buffer. + + This method is only valid for buffers backed by managed memory. + + Parameters + ---------- + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, + ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. 
+ """ + cdef str advice_name + _require_managed_buffer(self, "Buffer.advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "Buffer.advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allow_host=True, + allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, + allow_host_numa_current=advice_name == "set_preferred_location", + ) + handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) + + def prefetch( + self, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + location_type: str | None = None, + ): + """Prefetch this managed-memory buffer to a target location.""" + cdef Stream s = Stream_accept(stream) + _require_managed_buffer(self, "Buffer.prefetch") + location = _normalize_managed_location( + location, + location_type, + "Buffer.prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) + + def discard_prefetch( + self, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + location_type: str | None = None, + ): + """Discard this managed-memory buffer and prefetch it to a target location.""" + cdef Stream s = Stream_accept(stream) + _require_managed_buffer(self, "Buffer.discard_prefetch") + _require_managed_discard_prefetch_support() + location = _normalize_managed_location( + location, + location_type, + "Buffer.discard_prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [self.handle], + [self._size], + 1, + [location], + [0], + 1, + 0, + s.handle, + ) + ) + def __dlpack__( self, *, @@ -453,6 +734,7 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = False out.device_id = -1 + out.is_managed = False elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST @@ -461,10 +743,12 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id + out.is_managed = is_managed != 0 elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: out.is_host_accessible = False out.is_device_accessible = True out.device_id = device_id + out.is_managed = False else: with cython.gil: raise ValueError(f"Unsupported memory type: {memory_type}") diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst index 98551603b62..18b3bede36b 100644 --- a/cuda_core/docs/source/release/0.7.x-notes.rst +++ b/cuda_core/docs/source/release/0.7.x-notes.rst @@ -35,6 +35,11 @@ New features preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or ``("host_numa", 3)``. +- Added managed-memory controls on :class:`Buffer`: ``advise()``, + ``prefetch()``, and ``discard_prefetch()``. These methods validate that the + underlying allocation is managed memory and then forward to the corresponding + CUDA driver operations for range advice and migration. + - Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit control over host NUMA node placement. 
When ``ipc_enabled=True`` and ``numa_id`` is not set, the NUMA node is automatically derived from the diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0473d2d183d..dd146785ec8 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1134,6 +1134,133 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): ) +def _get_mem_range_attr(buffer, attribute, data_size): + return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) + + +def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("discard-prefetch requires cuda.bindings support") + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(4096) + stream = device.create_stream() + + buffer.advise("set_read_mostly") + assert _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + 4, + ) == 1 + + buffer.advise("set_preferred_location", device, location_type="device") + preferred_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE, + 4, + ) + preferred_id = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID, + 4, + ) + assert int(preferred_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) + assert preferred_id == device.device_id + + buffer.prefetch(-1, stream=stream) + stream.sync() + last_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, + 4, + ) + assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST) + + buffer.discard_prefetch(device, stream=stream) + stream.sync() + last_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, + 4, + ) + last_id = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + 4, + ) + assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) + assert last_id == device.device_id + + buffer.close() + + +def test_managed_buffer_operations_support_external_managed_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + stream = device.create_stream() + + buffer.prefetch(device, stream=stream) + stream.sync() + + last_type = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, + 4, + ) + last_id = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + 4, + ) + assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) + assert last_id == device.device_id + + buffer.close() + + +def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda): + device = Device() + device.set_current() + + buffer = DummyDeviceMemoryResource(device).allocate(4096) + stream = device.create_stream() + + with pytest.raises(ValueError, match="managed-memory buffer"): + buffer.advise("set_read_mostly") + with pytest.raises(ValueError, match="managed-memory buffer"): + buffer.prefetch(device, stream=stream) + with 
pytest.raises(ValueError, match="managed-memory buffer"): + buffer.discard_prefetch(device, stream=stream) + + buffer.close() + + +def test_managed_buffer_operation_validation(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(4096) + stream = device.create_stream() + + with pytest.raises(ValueError, match="requires a location"): + buffer.prefetch(stream=stream) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + buffer.advise("set_accessed_by", 0, location_type="host_numa") + with pytest.raises(ValueError, match="location must be None or -1"): + buffer.prefetch(0, stream=stream, location_type="host") + + buffer.close() + + def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch From c418050043ef38cc15a74e733d9038d564068c0d Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 16 Mar 2026 17:44:49 -0700 Subject: [PATCH 02/68] wip --- cuda_core/tests/test_memory.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index dd146785ec8..44d50e356c4 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1151,11 +1151,14 @@ def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): stream = device.create_stream() buffer.advise("set_read_mostly") - assert _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - 4, - ) == 1 + assert ( + _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + 4, + ) + == 1 + ) buffer.advise("set_preferred_location", device, location_type="device") preferred_type = _get_mem_range_attr( From b879fa5b13922b2a41122f31751cd11c0c1fbaee Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 16 Mar 2026 17:51:36 -0700 Subject: [PATCH 03/68] fixing ci compiler errors --- cuda_core/cuda/core/_memory/_buffer.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 686585b5276..05a1667b3fe 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -36,7 +36,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core._device import Device @@ -175,7 +175,7 @@ cdef inline object _normalize_managed_location( cdef int loc_id if isinstance(location, Device): - location = (location).device_id + location = location.device_id if location_type is not None and not isinstance(location_type, str): raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") From 04ee3de1859c91158f30a7bffd3246024d422f0e Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 09:07:10 -0700 Subject: [PATCH 04/68] skipping tests that aren't supported --- cuda_core/tests/test_memory.py | 130 ++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 44d50e356c4..95c6e6e9646 100644 --- a/cuda_core/tests/test_memory.py +++ 
b/cuda_core/tests/test_memory.py @@ -1138,18 +1138,70 @@ def _get_mem_range_attr(buffer, attribute, data_size): return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) -def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() +def _skip_if_managed_allocation_unsupported(device): + try: + if not device.properties.managed_memory: + pytest.skip("Device does not support managed memory operations") + except AttributeError: + pytest.skip("Managed-memory buffer operations require CUDA support") + +def _skip_if_managed_location_ops_unsupported(device): + _skip_if_managed_allocation_unsupported(device) + try: + if not device.properties.concurrent_managed_access: + pytest.skip("Device does not support concurrent managed memory access") + except AttributeError: + pytest.skip("Managed-memory location operations require CUDA support") + + +def _skip_if_managed_discard_prefetch_unsupported(device): + _skip_if_managed_location_ops_unsupported(device) if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): pytest.skip("discard-prefetch requires cuda.bindings support") + visible_devices = Device.get_all_devices() + if not all(dev.properties.concurrent_managed_access for dev in visible_devices): + pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") + + +def test_managed_buffer_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() buffer = mr.allocate(4096) stream = device.create_stream() + buffer.prefetch(-1, stream=stream) + stream.sync() + last_location = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + 4, + ) + assert last_location == -1 + + buffer.prefetch(device, stream=stream) + stream.sync() + last_location = _get_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + 4, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_buffer_advise_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer.advise("set_read_mostly") assert ( _get_mem_range_attr( @@ -1160,70 +1212,60 @@ def test_managed_buffer_advise_prefetch_and_discard_prefetch(init_cuda): == 1 ) - buffer.advise("set_preferred_location", device, location_type="device") - preferred_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE, - 4, - ) - preferred_id = _get_mem_range_attr( + # cuda.bindings currently exposes the combined location attributes for + # cuMemRangeGetAttribute, so use the legacy location query here. 
+ buffer.advise("set_preferred_location", location_type="host") + preferred_location = _get_mem_range_attr( buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, 4, ) - assert int(preferred_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) - assert preferred_id == device.device_id + assert preferred_location == -1 - buffer.prefetch(-1, stream=stream) - stream.sync() - last_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, - 4, - ) - assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST) + buffer.close() - buffer.discard_prefetch(device, stream=stream) + +def test_managed_buffer_prefetch_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + stream = device.create_stream() + + buffer.prefetch(device, stream=stream) stream.sync() - last_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, - 4, - ) - last_id = _get_mem_range_attr( + + last_location = _get_mem_range_attr( buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, 4, ) - assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) - assert last_id == device.device_id + assert last_location == device.device_id buffer.close() -def test_managed_buffer_operations_support_external_managed_allocations(init_cuda): +def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(init_cuda): device = Device() - skip_if_managed_memory_unsupported(device) + _skip_if_managed_discard_prefetch_unsupported(device) device.set_current() buffer = DummyUnifiedMemoryResource(device).allocate(4096) stream = device.create_stream() - buffer.prefetch(device, stream=stream) + buffer.prefetch(-1, stream=stream) stream.sync() - last_type = _get_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE, - 4, - ) - last_id = _get_mem_range_attr( + buffer.discard_prefetch(device, stream=stream) + stream.sync() + + last_location = _get_mem_range_attr( buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, 4, ) - assert int(last_type) == int(driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE) - assert last_id == device.device_id + assert last_location == device.device_id buffer.close() From 9ab3f465d1c7d072a6dd9c6b8b70a9b47a24f3d8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 09:34:29 -0700 Subject: [PATCH 05/68] cu12 support --- cuda_core/cuda/core/_memory/_buffer.pyx | 40 ++++++++++++++++++-- cuda_core/tests/test_memory.py | 50 ++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 05a1667b3fe..4460de900d4 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -36,7 +36,7 @@ else: BufferProtocol = object from cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver, handle_return +from 
cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return from cuda.core._device import Device @@ -247,6 +247,20 @@ cdef inline object _normalize_managed_location( return _make_managed_location(loc_type, loc_id) +cdef inline bint _managed_location_uses_v2_bindings(): + # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. + return get_binding_version() >= (13, 0) + + +cdef inline int _managed_location_to_legacy_device(object location, str what): + cdef object loc_type = location.type + if loc_type == _managed_location_enum("device") or loc_type == _managed_location_enum("host"): + return location.id + raise RuntimeError( + f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" + ) + + cdef inline void _require_managed_buffer(Buffer self, str what): _init_mem_attrs(self) if not self._mem_attrs.is_managed: @@ -518,7 +532,17 @@ cdef class Buffer: allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, allow_host_numa_current=advice_name == "set_preferred_location", ) - handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + self.handle, + self._size, + advice, + _managed_location_to_legacy_device(location, "Buffer.advise"), + ) + ) def prefetch( self, @@ -539,7 +563,17 @@ cdef class Buffer: allow_host_numa=True, allow_host_numa_current=True, ) - handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) + else: + handle_return( + driver.cuMemPrefetchAsync( + self.handle, + self._size, + _managed_location_to_legacy_device(location, "Buffer.prefetch"), + s.handle, + ) + ) def discard_prefetch( self, diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 95c6e6e9646..380b581e7b2 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -43,7 +43,7 @@ system as ccx_system, ) from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor +from cuda.core._memory import IPCBufferDescriptor, _buffer from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView @@ -1270,6 +1270,54 @@ def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(i buffer.close() +def test_managed_buffer_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + calls = [] + + def fake_cuMemAdvise(ptr, size, advice, location): + calls.append((ptr, size, advice, location)) + return (driver.CUresult.CUDA_SUCCESS,) + + monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) + + buffer.advise("set_read_mostly") + + assert len(calls) == 1 + assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", -1)) + + buffer.close() + + +def test_managed_buffer_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(4096) + stream = device.create_stream() + calls = [] + 
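+    # Pre-13.x bindings expose cuMemPrefetchAsync(ptr, size, dstDevice, hStream),
+    # taking a device ordinal instead of a CUmemLocation and no flags argument.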
+ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): + calls.append((ptr, size, location, hstream)) + return (driver.CUresult.CUDA_SUCCESS,) + + monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) + + buffer.prefetch(device, stream=stream) + + assert len(calls) == 1 + assert calls[0][2] == device.device_id + assert int(calls[0][3]) == int(stream.handle) + + buffer.close() + + def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda): device = Device() device.set_current() From a948066ab2fc6fda3dfb74516538091e96e68746 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 16:45:51 -0700 Subject: [PATCH 06/68] Moving to function from Buffer class methods to free standing functions in the cuda.core.managed_memory namespace --- cuda_core/cuda/core/__init__.py | 2 +- cuda_core/cuda/core/_memory/_buffer.pyx | 322 +++++++++++------- cuda_core/cuda/core/experimental/__init__.py | 3 +- cuda_core/cuda/core/managed_memory.py | 9 + cuda_core/docs/source/api.rst | 13 + cuda_core/docs/source/release/0.7.x-notes.rst | 10 +- cuda_core/pixi.lock | 18 +- .../test_experimental_backward_compat.py | 7 + cuda_core/tests/test_memory.py | 137 +++++--- 9 files changed, 335 insertions(+), 186 deletions(-) create mode 100644 cuda_core/cuda/core/managed_memory.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 139078e86e4..c55c0786ed7 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,7 @@ finally: del bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core import system, utils +from cuda.core import managed_memory, system, utils from cuda.core._device import Device from cuda.core._event import Event, EventOptions from cuda.core._graph import ( diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 8ae6d22ee55..4663302b347 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -113,6 +113,13 @@ cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset(( "unset_accessed_by", )) +cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 +cdef int _HOST_NUMA_CURRENT_ID = 0 +cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 +cdef size_t _SINGLE_RANGE_COUNT = 1 +cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 +cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 + cdef inline object _managed_location_enum(str location_type): cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] @@ -130,7 +137,7 @@ cdef inline object _make_managed_location(str location_type, int location_id): if location_type == "host": location.id = int(getattr(driver, "CU_DEVICE_CPU", -1)) elif location_type == "host_numa_current": - location.id = 0 + location.id = _HOST_NUMA_CURRENT_ID else: location.id = location_id return location @@ -236,7 +243,7 @@ cdef inline object _normalize_managed_location( raise ValueError( f"{what} location must be None when location_type is 'host_numa_current', got {location!r}" ) - return _make_managed_location(loc_type, 0) + return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) if loc_type == "host" and not allow_host: raise ValueError(f"{what} does not support host locations") @@ -264,16 +271,206 @@ cdef inline int _managed_location_to_legacy_device(object location, str what): cdef inline void _require_managed_buffer(Buffer self, str what): _init_mem_attrs(self) if not self._mem_attrs.is_managed: - 
raise ValueError(f"{what} requires a managed-memory buffer") + raise ValueError(f"{what} requires a managed-memory allocation") -cdef inline void _require_managed_discard_prefetch_support(): +cdef inline void _require_managed_discard_prefetch_support(str what): if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): raise RuntimeError( - "Buffer.discard_prefetch requires cuda.bindings support for " - "cuMemDiscardAndPrefetchBatchAsync" + f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" ) + +cdef inline tuple _managed_range_from_buffer( + Buffer buffer, + int size, + str what, +): + if size != _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} does not accept size= when target is a Buffer") + _require_managed_buffer(buffer, what) + return buffer.handle, buffer._size + + +cdef inline uintptr_t _coerce_raw_pointer(object target, str what) except? 0: + cdef object ptr_obj + try: + ptr_obj = int(target) + except Exception as exc: + raise TypeError( + f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" + ) from exc + if ptr_obj < 0: + raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") + return ptr_obj + + +cdef inline int _require_managed_pointer(uintptr_t ptr, str what) except -1: + cdef _MemAttrs mem_attrs + with nogil: + _query_memory_attrs(mem_attrs, ptr) + if not mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + return 0 + + +cdef inline tuple _normalize_managed_target_range( + object target, + int size, + str what, +): + cdef uintptr_t ptr + + if isinstance(target, Buffer): + return _managed_range_from_buffer(target, size, what) + + if size == _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} requires size= when target is a raw pointer") + ptr = _coerce_raw_pointer(target, what) + _require_managed_pointer(ptr, what) + return ptr, size + + +def advise( + target, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Apply managed-memory advice to an allocation range. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. 
+ """ + cdef str advice_name + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allow_host=True, + allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, + allow_host_numa_current=advice_name == "set_preferred_location", + ) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + ptr, + nbytes, + advice, + _managed_location_to_legacy_device(location, "advise"), + ) + ) + + +def prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Prefetch a managed-memory allocation range to a target location.""" + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") + location = _normalize_managed_location( + location, + location_type, + "prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + if _managed_location_uses_v2_bindings(): + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + location, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + else: + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + _managed_location_to_legacy_device(location, "prefetch"), + s.handle, + ) + ) + + +def discard_prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Discard a managed-memory allocation range and prefetch it to a target location.""" + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef object batch_ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + batch_ptr = driver.CUdeviceptr(int(ptr)) + _require_managed_discard_prefetch_support("discard_prefetch") + location = _normalize_managed_location( + location, + location_type, + "discard_prefetch", + allow_none=False, + allow_host=True, + allow_host_numa=True, + allow_host_numa_current=True, + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [batch_ptr], + [nbytes], + _SINGLE_RANGE_COUNT, + [location], + [_FIRST_PREFETCH_LOCATION_INDEX], + _SINGLE_PREFETCH_LOCATION_COUNT, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + cdef class Buffer: """Represent a handle to allocated memory. @@ -502,119 +699,6 @@ cdef class Buffer: finally: PyBuffer_Release(&buf) - def advise( - self, - advice: driver.CUmem_advise | str, - location: Device | int | None = None, - *, - location_type: str | None = None, - ): - """Apply a managed-memory advice to this buffer. - - This method is only valid for buffers backed by managed memory. - - Parameters - ---------- - advice : :obj:`~driver.CUmem_advise` | str - Managed-memory advice to apply. String aliases such as - ``"set_read_mostly"``, ``"set_preferred_location"``, and - ``"set_accessed_by"`` are accepted. - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None`` for - advice values that ignore location. 
- location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, - ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. - """ - cdef str advice_name - _require_managed_buffer(self, "Buffer.advise") - advice_name, advice = _normalize_managed_advice(advice) - location = _normalize_managed_location( - location, - location_type, - "Buffer.advise", - allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, - allow_host=True, - allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY, - allow_host_numa_current=advice_name == "set_preferred_location", - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemAdvise(self.handle, self._size, advice, location)) - else: - handle_return( - driver.cuMemAdvise( - self.handle, - self._size, - advice, - _managed_location_to_legacy_device(location, "Buffer.advise"), - ) - ) - - def prefetch( - self, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - location_type: str | None = None, - ): - """Prefetch this managed-memory buffer to a target location.""" - cdef Stream s = Stream_accept(stream) - _require_managed_buffer(self, "Buffer.prefetch") - location = _normalize_managed_location( - location, - location_type, - "Buffer.prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemPrefetchAsync(self.handle, self._size, location, 0, s.handle)) - else: - handle_return( - driver.cuMemPrefetchAsync( - self.handle, - self._size, - _managed_location_to_legacy_device(location, "Buffer.prefetch"), - s.handle, - ) - ) - - def discard_prefetch( - self, - location: Device | int | None = None, - *, - stream: Stream | GraphBuilder, - location_type: str | None = None, - ): - """Discard this managed-memory buffer and prefetch it to a target location.""" - cdef Stream s = Stream_accept(stream) - _require_managed_buffer(self, "Buffer.discard_prefetch") - _require_managed_discard_prefetch_support() - location = _normalize_managed_location( - location, - location_type, - "Buffer.discard_prefetch", - allow_none=False, - allow_host=True, - allow_host_numa=True, - allow_host_numa_current=True, - ) - handle_return( - driver.cuMemDiscardAndPrefetchBatchAsync( - [self.handle], - [self._size], - 1, - [location], - [0], - 1, - 0, - s.handle, - ) - ) - def __dlpack__( self, *, diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index e7989f0f263..83fb1c75817 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -38,9 +38,10 @@ def _warn_deprecated(): _warn_deprecated() -from cuda.core import system, utils +from cuda.core import managed_memory, system, utils # Make utils accessible as a submodule for backward compatibility +__import__("sys").modules[__spec__.name + ".managed_memory"] = managed_memory __import__("sys").modules[__spec__.name + ".utils"] = utils diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py new file mode 100644 index 00000000000..f11aabcd194 --- /dev/null +++ b/cuda_core/cuda/core/managed_memory.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +"""Managed-memory range operations.""" + +from cuda.core._memory._buffer import advise, discard_prefetch, prefetch + +__all__ = ["advise", "prefetch", "discard_prefetch"] diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index fa7ce48eb5a..4d63bbcf88c 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -62,6 +62,19 @@ CUDA runtime on other non-blocking streams. +.. module:: cuda.core.managed_memory + +Managed memory +-------------- + +.. autosummary:: + :toctree: generated/ + + advise + prefetch + discard_prefetch + + CUDA compilation toolchain -------------------------- diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst index 18b3bede36b..186e3181f12 100644 --- a/cuda_core/docs/source/release/0.7.x-notes.rst +++ b/cuda_core/docs/source/release/0.7.x-notes.rst @@ -35,10 +35,12 @@ New features preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or ``("host_numa", 3)``. -- Added managed-memory controls on :class:`Buffer`: ``advise()``, - ``prefetch()``, and ``discard_prefetch()``. These methods validate that the - underlying allocation is managed memory and then forward to the corresponding - CUDA driver operations for range advice and migration. +- Added managed-memory range operations under :mod:`cuda.core.managed_memory`: + ``advise()``, ``prefetch()``, and ``discard_prefetch()``. These free + functions accept either a managed :class:`Buffer` or a raw pointer plus + ``size=``, validate that the target allocation is managed memory, and then + forward to the corresponding CUDA driver operations for range advice and + migration. - Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit control over host NUMA node placement. 
When ``ipc_enabled=True`` and diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock index 78da9addb58..e2f8b7b0c23 100644 --- a/cuda_core/pixi.lock +++ b/cuda_core/pixi.lock @@ -2598,7 +2598,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2625,7 +2625,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2653,7 +2653,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2794,7 +2794,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2817,7 +2817,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 12.* + cuda_version: 12.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2840,7 +2840,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2862,7 +2862,7 @@ packages: build: py314ha6d028f_0 subdir: linux-64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-64 depends: @@ -2884,7 +2884,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2906,7 +2906,7 @@ packages: build: py314he8946ed_0 subdir: linux-aarch64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-aarch64 depends: diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py index c3215b056ac..82e2cdd5bea 100644 --- a/cuda_core/tests/test_experimental_backward_compat.py +++ b/cuda_core/tests/test_experimental_backward_compat.py @@ -38,6 +38,7 @@ def test_experimental_backward_compatibility(): assert hasattr(cuda.core.experimental, "Device") assert hasattr(cuda.core.experimental, "Stream") assert hasattr(cuda.core.experimental, "Buffer") + assert hasattr(cuda.core.experimental, "managed_memory") assert hasattr(cuda.core.experimental, "system") # Test 2: Direct imports - should emit deprecation warning @@ -73,6 +74,7 @@ def test_experimental_backward_compatibility(): assert cuda.core.experimental.Linker is cuda.core.Linker # Compare singletons + assert cuda.core.experimental.managed_memory is cuda.core.managed_memory assert cuda.core.experimental.system is cuda.core.system # Test 4: Utils module works @@ -88,6 +90,11 @@ def test_experimental_backward_compatibility(): assert StridedMemoryView is not None assert args_viewable_as_strided_memory is not None + from cuda.core.experimental.managed_memory import advise, discard_prefetch, prefetch + + assert advise is not None + assert prefetch is not None + assert discard_prefetch is not None # Test 5: Options classes are accessible assert hasattr(cuda.core.experimental, "EventOptions") diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 380b581e7b2..927014826a8 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -38,6 +38,7 @@ PinnedMemoryResourceOptions, VirtualMemoryResource, VirtualMemoryResourceOptions, + managed_memory, ) from cuda.core import ( system as 
ccx_system, @@ -48,6 +49,12 @@ from cuda.core.utils import StridedMemoryView POOL_SIZE = 2097152 # 2MB size +_MANAGED_TEST_ALLOCATION_SIZE = 4096 +_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4 +_READ_MOSTLY_ENABLED = 1 +_HOST_LOCATION_ID = -1 +_INVALID_HOST_DEVICE_ORDINAL = 0 +_LEGACY_BINDINGS_VERSION = (12, 9) class DummyDeviceMemoryResource(MemoryResource): @@ -1138,6 +1145,10 @@ def _get_mem_range_attr(buffer, attribute, data_size): return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) +def _get_int_mem_range_attr(buffer, attribute): + return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE) + + def _skip_if_managed_allocation_unsupported(device): try: if not device.properties.managed_memory: @@ -1165,140 +1176,134 @@ def _skip_if_managed_discard_prefetch_unsupported(device): pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") -def test_managed_buffer_prefetch_supports_managed_pool_allocations(init_cuda): +def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): device = Device() skip_if_managed_memory_unsupported(device) device.set_current() mr = create_managed_memory_resource_or_skip() - buffer = mr.allocate(4096) + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - buffer.prefetch(-1, stream=stream) + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) - assert last_location == -1 + assert last_location == _HOST_LOCATION_ID - buffer.prefetch(device, stream=stream) + managed_memory.prefetch(buffer, device, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) assert last_location == device.device_id buffer.close() -def test_managed_buffer_advise_supports_external_managed_allocations(init_cuda): +def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_allocation_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - buffer.advise("set_read_mostly") + managed_memory.advise(buffer, "set_read_mostly") assert ( - _get_mem_range_attr( + _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - 4, ) - == 1 + == _READ_MOSTLY_ENABLED ) # cuda.bindings currently exposes the combined location attributes for # cuMemRangeGetAttribute, so use the legacy location query here. 
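+    # The legacy attribute reports CU_DEVICE_CPU (-1) when the preferred location is host.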
- buffer.advise("set_preferred_location", location_type="host") - preferred_location = _get_mem_range_attr( + managed_memory.advise(buffer, "set_preferred_location", location_type="host") + preferred_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, - 4, ) - assert preferred_location == -1 + assert preferred_location == _HOST_LOCATION_ID buffer.close() -def test_managed_buffer_prefetch_supports_external_managed_allocations(init_cuda): +def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - buffer.prefetch(device, stream=stream) + managed_memory.prefetch(buffer, device, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) assert last_location == device.device_id buffer.close() -def test_managed_buffer_discard_prefetch_supports_external_managed_allocations(init_cuda): +def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda): device = Device() _skip_if_managed_discard_prefetch_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - buffer.prefetch(-1, stream=stream) + managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - buffer.discard_prefetch(device, stream=stream) + managed_memory.discard_prefetch(buffer, device, stream=stream) stream.sync() - last_location = _get_mem_range_attr( + last_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - 4, ) assert last_location == device.device_id buffer.close() -def test_managed_buffer_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): +def test_managed_memory_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): device = Device() _skip_if_managed_allocation_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) calls = [] def fake_cuMemAdvise(ptr, size, advice, location): calls.append((ptr, size, advice, location)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) - buffer.advise("set_read_mostly") + managed_memory.advise(buffer, "set_read_mostly") assert len(calls) == 1 - assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", -1)) + assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", _HOST_LOCATION_ID)) buffer.close() -def test_managed_buffer_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): +def test_managed_memory_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(4096) + buffer = 
DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() calls = [] @@ -1306,10 +1311,10 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): calls.append((ptr, size, location, hstream)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: (12, 9)) + monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) - buffer.prefetch(device, stream=stream) + managed_memory.prefetch(buffer, device, stream=stream) assert len(calls) == 1 assert calls[0][2] == device.device_id @@ -1318,38 +1323,66 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): buffer.close() -def test_managed_buffer_operations_reject_non_managed_buffers(init_cuda): +def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): device = Device() device.set_current() - buffer = DummyDeviceMemoryResource(device).allocate(4096) + buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(ValueError, match="managed-memory buffer"): - buffer.advise("set_read_mostly") - with pytest.raises(ValueError, match="managed-memory buffer"): - buffer.prefetch(device, stream=stream) - with pytest.raises(ValueError, match="managed-memory buffer"): - buffer.discard_prefetch(device, stream=stream) + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.advise(buffer, "set_read_mostly") + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.prefetch(buffer, device, stream=stream) + with pytest.raises(ValueError, match="managed-memory allocation"): + managed_memory.discard_prefetch(buffer, device, stream=stream) buffer.close() -def test_managed_buffer_operation_validation(init_cuda): +def test_managed_memory_operation_validation(init_cuda): device = Device() skip_if_managed_memory_unsupported(device) device.set_current() mr = create_managed_memory_resource_or_skip() - buffer = mr.allocate(4096) + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() with pytest.raises(ValueError, match="requires a location"): - buffer.prefetch(stream=stream) + managed_memory.prefetch(buffer, stream=stream) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - buffer.advise("set_accessed_by", 0, location_type="host_numa") + managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa") with pytest.raises(ValueError, match="location must be None or -1"): - buffer.prefetch(0, stream=stream, location_type="host") + managed_memory.prefetch(buffer, _INVALID_HOST_DEVICE_ORDINAL, stream=stream, location_type="host") + + buffer.close() + + +def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + managed_memory.advise(buffer.handle, "set_read_mostly", size=buffer.size) + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + managed_memory.prefetch(buffer.handle, device, size=buffer.size, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + 
driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id buffer.close() From 14575991d65ca85973a4f1dc61f068efc4fc3293 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 16:46:20 -0700 Subject: [PATCH 07/68] precommit format --- cuda_core/cuda/core/managed_memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index f11aabcd194..f5bb09c13d7 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -6,4 +6,4 @@ from cuda.core._memory._buffer import advise, discard_prefetch, prefetch -__all__ = ["advise", "prefetch", "discard_prefetch"] +__all__ = ["advise", "discard_prefetch", "prefetch"] From acb402478cac58689f069e0836819b2e91010c09 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 17 Mar 2026 17:30:41 -0700 Subject: [PATCH 08/68] iterating on implementation --- cuda_bindings/pixi.lock | 86 ++++++++++++------------- cuda_core/cuda/core/_memory/_buffer.pyx | 63 ++++++++++++++---- cuda_core/tests/test_memory.py | 85 ++++++++++++++++++++++++ 3 files changed, 178 insertions(+), 56 deletions(-) diff --git a/cuda_bindings/pixi.lock b/cuda_bindings/pixi.lock index b01d6eec69d..237a1695801 100644 --- a/cuda_bindings/pixi.lock +++ b/cuda_bindings/pixi.lock @@ -1081,21 +1081,21 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/conda-gcc-specs-15.2.0-h53410ce_16.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-13.2.27-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-13.2.20-h7938cbb_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_linux-64-12.9.27-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-12.9.86-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-12.9.79-h5888daf_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-dev-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-12.9.79-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-12.9.79-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-12.9.86-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-12.9.86-h69a702a_6.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-12.9.86-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-12.9.86-h4bc722e_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-12.9.86-h4bc722e_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-profiler-api-12.9.79-h7938cbb_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cython-3.2.3-py314h1807b08_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dav1d-1.2.1-hd590300_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h24cb091_1.conda @@ -1134,7 +1134,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.0.44-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.14.1.1-hbc026e6_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -1160,8 +1160,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-13.2.51-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvfatbin-12.9.82-hecca717_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-12.9.86-hecca717_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenvino-2025.2.0-hb617929_1.conda @@ -1264,7 +1264,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - conda: . 
- build: py314hb727236_0 + build: py314ha6d028f_0 - conda: ../cuda_pathfinder linux-aarch64: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2 @@ -1460,21 +1460,21 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h5782bbf_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/conda-gcc-specs-15.2.0-hd546029_16.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-12.9.27-h57928b3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-12.9.86-h57928b3_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-12.9.79-he0c23c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-12.9.86-hac47afa_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-12.9.86-h719f0c7_6.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-12.9.86-h57928b3_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-12.9.86-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-12.9.86-h2466b09_2.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-12.9.79-h57928b3_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cccl_win-64-13.2.27-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_win-64-13.2.51-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-dev-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-dev_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.2.51-h719f0c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.2.51-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.2.51-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-tools-13.2.51-h2466b09_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-profiler-api-13.2.20-h57928b3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cython-3.2.3-py314h344ed54_0.conda - 
conda: https://conda.anaconda.org/conda-forge/win-64/dav1d-1.2.1-hcfcfb64_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda @@ -1520,8 +1520,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/liblapack-3.11.0-5_hf9ab0e9_mkl.conda - conda: https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_2.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libmpdec-4.0.0-h2466b09_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-12.9.82-hac47afa_1.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-12.9.86-hac47afa_2.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvfatbin-13.2.51-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/libnvjitlink-13.2.51-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libogg-1.3.5-h2466b09_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libopus-1.6-h6a83c73_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.53-h7351971_0.conda @@ -1583,7 +1583,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-h534d264_6.conda - conda: . - build: py314h5e6f764_0 + build: py314h356c398_0 - conda: ../cuda_pathfinder packages: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -2154,7 +2154,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 13.2.* + cuda_version: 13.2.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2182,7 +2182,7 @@ packages: subdir: win-64 variants: c_compiler: vs2022 - cuda-version: 12.* + cuda_version: 12.* cxx_compiler: vs2022 python: 3.14.* target_platform: win-64 @@ -2209,7 +2209,7 @@ packages: build: py314h9a28ecd_0 subdir: linux-aarch64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-aarch64 depends: @@ -2237,7 +2237,7 @@ packages: build: py314ha6d028f_0 subdir: linux-64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-64 depends: @@ -2265,7 +2265,7 @@ packages: build: py314hb727236_0 subdir: linux-64 variants: - cuda-version: 13.2.* + cuda_version: 13.2.* python: 3.14.* target_platform: linux-64 depends: @@ -2293,7 +2293,7 @@ packages: build: py314he8946ed_0 subdir: linux-aarch64 variants: - cuda-version: 12.* + cuda_version: 12.* python: 3.14.* target_platform: linux-aarch64 depends: diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 4663302b347..829e05b3ad7 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -205,9 +205,11 @@ cdef inline object _normalize_managed_location( ) loc_id = location if loc_id == -1: - loc_type = "host" + if not allow_host: + raise ValueError(f"{what} does not support host locations") + return _make_managed_location("host", -1) elif loc_id >= 0: - loc_type = "device" + return _make_managed_location("device", loc_id) else: raise ValueError( f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" @@ -245,23 +247,22 @@ cdef inline object _normalize_managed_location( ) return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) - if loc_type == "host" and not allow_host: - raise ValueError(f"{what} does not support host locations") - if loc_type == "host_numa" and 
not allow_host_numa:
-        raise ValueError(f"{what} does not support location_type='host_numa'")
-    if loc_type == "host_numa_current" and not allow_host_numa_current:
-        raise ValueError(f"{what} does not support location_type='host_numa_current'")
-    return _make_managed_location(loc_type, loc_id)
-

 cdef inline bint _managed_location_uses_v2_bindings():
     # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
     return get_binding_version() >= (13, 0)


+cdef object _LEGACY_LOC_DEVICE = None
+cdef object _LEGACY_LOC_HOST = None
+
 cdef inline int _managed_location_to_legacy_device(object location, str what):
+    global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST
+    if _LEGACY_LOC_DEVICE is None:
+        _LEGACY_LOC_DEVICE = _managed_location_enum("device")
+        _LEGACY_LOC_HOST = _managed_location_enum("host")
     cdef object loc_type = location.type
-    if loc_type == _managed_location_enum("device") or loc_type == _managed_location_enum("host"):
+    if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST:
         return location.id
     raise RuntimeError(
         f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}"
     )
@@ -396,7 +397,25 @@ def prefetch(
     int size=_MANAGED_SIZE_NOT_PROVIDED,
     location_type: str | None = None,
 ):
-    """Prefetch a managed-memory allocation range to a target location."""
+    """Prefetch a managed-memory allocation range to a target location.
+
+    Parameters
+    ----------
+    target : :class:`Buffer` | int | object
+        Managed allocation to operate on. This may be a :class:`Buffer` or a
+        raw pointer (requires ``size=``).
+    location : :obj:`~_device.Device` | int | None, optional
+        Target location. When ``location_type`` is ``None``, values are
+        interpreted as a device ordinal, ``-1`` for host, or ``None``.
+        A location is required for prefetch.
+    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
+        Keyword argument specifying the stream for the asynchronous prefetch.
+    size : int, optional
+        Allocation size in bytes. Required when ``target`` is a raw pointer.
+    location_type : str | None, optional
+        Explicit location kind. Supported values are ``"device"``, ``"host"``,
+        ``"host_numa"``, and ``"host_numa_current"``.
+    """
     cdef Stream s = Stream_accept(stream)
     cdef object ptr
     cdef size_t nbytes
@@ -440,7 +459,25 @@ def discard_prefetch(
     int size=_MANAGED_SIZE_NOT_PROVIDED,
     location_type: str | None = None,
 ):
-    """Discard a managed-memory allocation range and prefetch it to a target location."""
+    """Discard a managed-memory allocation range and prefetch it to a target location.
+
+    Parameters
+    ----------
+    target : :class:`Buffer` | int | object
+        Managed allocation to operate on. This may be a :class:`Buffer` or a
+        raw pointer (requires ``size=``).
+    location : :obj:`~_device.Device` | int | None, optional
+        Target location. When ``location_type`` is ``None``, values are
+        interpreted as a device ordinal, ``-1`` for host, or ``None``.
+        A location is required for discard_prefetch.
+    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
+        Keyword argument specifying the stream for the asynchronous operation.
+    size : int, optional
+        Allocation size in bytes. Required when ``target`` is a raw pointer.
+    location_type : str | None, optional
+        Explicit location kind. Supported values are ``"device"``, ``"host"``,
+        ``"host_numa"``, and ``"host_numa_current"``.
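+
+    A minimal usage sketch, mirroring the tests in this series (assumes a
+    managed-memory :class:`Buffer` and a stream from ``device.create_stream()``)::
+
+        discard_prefetch(buffer, device, stream=stream)
+        stream.sync()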
+ """ cdef Stream s = Stream_accept(stream) cdef object ptr cdef object batch_ptr diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 927014826a8..ea827818ac1 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1359,6 +1359,91 @@ def test_managed_memory_operation_validation(init_cuda): buffer.close() +def test_managed_memory_advise_location_validation(init_cuda): + """Verify doc-specified location constraints for each advice kind.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + # set_read_mostly works without a location (location is ignored) + managed_memory.advise(buffer, "set_read_mostly") + + # set_preferred_location requires a location; device ordinal works + managed_memory.advise(buffer, "set_preferred_location", device.device_id) + + # set_preferred_location with host location_type + managed_memory.advise(buffer, "set_preferred_location", location_type="host") + + # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + managed_memory.advise(buffer, "set_accessed_by", 0, location_type="host_numa") + + # set_accessed_by with host_numa_current also raises ValueError + with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): + managed_memory.advise(buffer, "set_accessed_by", location_type="host_numa_current") + + # Inferred location from int: -1 maps to host, 0 maps to device + managed_memory.advise(buffer, "set_preferred_location", -1) + managed_memory.advise(buffer, "set_preferred_location", 0) + + buffer.close() + + +def test_managed_memory_advise_accepts_enum_value(init_cuda): + """advise() accepts CUmem_advise enum values directly, not just string aliases.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY + managed_memory.advise(buffer, advice_enum) + + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + buffer.close() + + +def test_managed_memory_advise_size_rejected_for_buffer(init_cuda): + """advise() raises TypeError when size= is given with a Buffer target.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(TypeError, match="does not accept size="): + managed_memory.advise(buffer, "set_read_mostly", size=1024) + + buffer.close() + + +def test_managed_memory_advise_invalid_advice_values(init_cuda): + """advise() rejects invalid advice strings and wrong types.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(ValueError, match="advice must be one of"): + managed_memory.advise(buffer, "not_a_real_advice") + + with pytest.raises(TypeError, match="advice must be"): + managed_memory.advise(buffer, 42) + + buffer.close() + + def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): device = Device() 
     _skip_if_managed_location_ops_unsupported(device)

From d10ab07e2f402628b83b08e07d95da39c4f2b634 Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Tue, 17 Mar 2026 18:13:36 -0700
Subject: [PATCH 09/68] Simplify managed-memory helpers: remove long-form aliases, cache lookups, fix docs

- Remove duplicate long-form "cu_mem_advise_*" string aliases from
  _MANAGED_ADVICE_ALIASES; users pass short strings or the enum directly
- Replace 4 boolean allow_* params in _normalize_managed_location with a
  single allowed_loctypes frozenset driven by _MANAGED_ADVICE_ALLOWED_LOCTYPES
- Cache immutable runtime checks: CU_DEVICE_CPU, v2 bindings flag,
  discard_prefetch support, and advice enum-to-alias reverse map
- Collapse hasattr+getattr to single getattr in _managed_location_enum
- Move _require_managed_discard_prefetch_support to top of discard_prefetch
  for fail-fast behavior
- Fix docs build: reset Sphinx module scope after managed_memory section in
  api.rst so subsequent sections resolve under cuda.core
- Add discard_prefetch pool-allocation test and comment on _get_mem_range_attr

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 cuda_core/cuda/core/_memory/_buffer.pyx | 94 ++++++++++++++-----------
 cuda_core/docs/source/api.rst           |  2 +
 cuda_core/tests/test_memory.py          | 26 +++++++
 3 files changed, 79 insertions(+), 43 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 829e05b3ad7..d280b4ea2b5 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -89,17 +89,11 @@ cdef dict _MANAGED_LOCATION_TYPE_ATTRS = {

 cdef dict _MANAGED_ADVICE_ALIASES = {
     "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
-    "cu_mem_advise_set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
     "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
-    "cu_mem_advise_unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
     "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
-    "cu_mem_advise_set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
     "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
-    "cu_mem_advise_unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
     "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
-    "cu_mem_advise_set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
     "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
-    "cu_mem_advise_unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
 }

 cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((
@@ -108,10 +102,18 @@ cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((
     "unset_preferred_location",
 ))

-cdef frozenset _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY = frozenset((
-    "set_accessed_by",
-    "unset_accessed_by",
-))
+cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current"))
+cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa"))
+cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host"))
+
+cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = {
+    "set_read_mostly": _DEVICE_HOST_NUMA,
+    "unset_read_mostly": _DEVICE_HOST_NUMA,
+    "set_preferred_location": _ALL_LOCATION_TYPES,
+    "unset_preferred_location": _DEVICE_HOST_NUMA,
+    "set_accessed_by": _DEVICE_HOST_ONLY,
+    "unset_accessed_by": _DEVICE_HOST_ONLY,
+}

 cdef int _MANAGED_SIZE_NOT_PROVIDED = -1
 cdef int _HOST_NUMA_CURRENT_ID = 0
@@ -120,22 +122,32 @@ cdef size_t _SINGLE_RANGE_COUNT = 1
 cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1
 cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0

+# Lazily cached values for immutable runtime properties.
+cdef object _CU_DEVICE_CPU = None
+cdef dict _ADVICE_ENUM_TO_ALIAS = None
+cdef int _V2_BINDINGS = -1
+cdef int _DISCARD_PREFETCH_SUPPORTED = -1
+

 cdef inline object _managed_location_enum(str location_type):
     cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type]
-    if not hasattr(driver.CUmemLocationType, attr_name):
+    cdef object result = getattr(driver.CUmemLocationType, attr_name, None)
+    if result is None:
         raise RuntimeError(
             f"Managed-memory location type {location_type!r} is not supported by the "
             f"installed cuda.bindings package."
         )
-    return getattr(driver.CUmemLocationType, attr_name)
+    return result


 cdef inline object _make_managed_location(str location_type, int location_id):
+    global _CU_DEVICE_CPU
     cdef object location = driver.CUmemLocation()
     location.type = _managed_location_enum(location_type)
     if location_type == "host":
-        location.id = int(getattr(driver, "CU_DEVICE_CPU", -1))
+        if _CU_DEVICE_CPU is None:
+            _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1))
+        location.id = _CU_DEVICE_CPU
     elif location_type == "host_numa_current":
         location.id = _HOST_NUMA_CURRENT_ID
     else:
         location.id = location_id
@@ -157,12 +169,17 @@ cdef inline tuple _normalize_managed_advice(object advice):
         return alias, getattr(driver.CUmem_advise, attr_name)

     if isinstance(advice, driver.CUmem_advise):
-        for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
-            if alias.startswith("cu_mem_advise_"):
-                continue
-            if advice == getattr(driver.CUmem_advise, attr_name):
-                return alias, advice
-        raise ValueError(f"Unsupported advice value: {advice!r}")
+        global _ADVICE_ENUM_TO_ALIAS
+        if _ADVICE_ENUM_TO_ALIAS is None:
+            _ADVICE_ENUM_TO_ALIAS = {}
+            for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
+                enum_val = getattr(driver.CUmem_advise, attr_name, None)
+                if enum_val is not None:
+                    _ADVICE_ENUM_TO_ALIAS[enum_val] = alias
+        alias = _ADVICE_ENUM_TO_ALIAS.get(advice)
+        if alias is None:
+            raise ValueError(f"Unsupported advice value: {advice!r}")
+        return alias, advice

     raise TypeError(
         "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias"
     )
@@ -174,9 +191,7 @@ cdef inline object _normalize_managed_location(
     object location_type,
     str what,
     bint allow_none=False,
-    bint allow_host=True,
-    bint allow_host_numa=True,
-    bint allow_host_numa_current=True,
+    frozenset allowed_loctypes=_ALL_LOCATION_TYPES,
 ):
     cdef object loc_type
     cdef int loc_id
@@ -194,6 +209,9 @@ cdef inline object _normalize_managed_location(
             f"or None, got {location_type!r}"
         )

+    if loc_type is not None and loc_type not in allowed_loctypes:
+        raise ValueError(f"{what} does not support location_type='{loc_type}'")
+
     if loc_type is None:
         if location is None:
             if allow_none:
@@ -205,7 +223,7 @@ cdef inline object _normalize_managed_location(
             )
         loc_id = location
         if loc_id == -1:
-            if not allow_host:
+            if "host" not in allowed_loctypes:
                 raise ValueError(f"{what} does not support host locations")
             return _make_managed_location("host", -1)
         elif loc_id >= 0:
@@ -227,20 +245,14 @@ cdef inline object _normalize_managed_location(
             raise ValueError(
                 f"{what} location must be None or -1 when location_type is 'host', got {location!r}"
             )
-        if not allow_host:
-            raise ValueError(f"{what} does not support location_type='host'")
         return _make_managed_location(loc_type, -1)
     elif loc_type == "host_numa":
-        if not allow_host_numa:
-            raise ValueError(f"{what} does not support location_type='host_numa'")
         if not isinstance(location, int) or location < 0:
             raise ValueError(
                 f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}"
             )
         return _make_managed_location(loc_type, location)
     else:
-        if not allow_host_numa_current:
-            raise ValueError(f"{what} does not support location_type='host_numa_current'")
         if location is not None:
             raise ValueError(
                 f"{what} location must be None when location_type is 'host_numa_current', got {location!r}"
@@ -250,7 +262,10 @@ cdef inline bint _managed_location_uses_v2_bindings():
     # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
-    return get_binding_version() >= (13, 0)
+    global _V2_BINDINGS
+    if _V2_BINDINGS < 0:
+        _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0
+    return _V2_BINDINGS != 0


 cdef object _LEGACY_LOC_DEVICE = None
@@ -276,7 +291,10 @@ cdef inline void _require_managed_buffer(Buffer self, str what):


 cdef inline void _require_managed_discard_prefetch_support(str what):
-    if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"):
+    global _DISCARD_PREFETCH_SUPPORTED
+    if _DISCARD_PREFETCH_SUPPORTED < 0:
+        _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0
+    if not _DISCARD_PREFETCH_SUPPORTED:
         raise RuntimeError(
             f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync"
         )
@@ -372,9 +390,7 @@ def advise(
         location_type,
         "advise",
         allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION,
-        allow_host=True,
-        allow_host_numa=advice_name not in _MANAGED_ADVICE_HOST_OR_DEVICE_ONLY,
-        allow_host_numa_current=advice_name == "set_preferred_location",
+        allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name],
     )
     if _managed_location_uses_v2_bindings():
         handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location))
@@ -425,10 +441,6 @@ def prefetch(
         location,
         location_type,
         "prefetch",
-        allow_none=False,
-        allow_host=True,
-        allow_host_numa=True,
-        allow_host_numa_current=True,
     )
     if _managed_location_uses_v2_bindings():
         handle_return(
@@ -478,6 +490,7 @@ def discard_prefetch(
         Explicit location kind. Supported values are ``"device"``, ``"host"``,
         ``"host_numa"``, and ``"host_numa_current"``.
     """
+    _require_managed_discard_prefetch_support("discard_prefetch")
     cdef Stream s = Stream_accept(stream)
     cdef object ptr
     cdef object batch_ptr
@@ -485,15 +498,10 @@ def discard_prefetch(

     ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch")
     batch_ptr = driver.CUdeviceptr(int(ptr))
-    _require_managed_discard_prefetch_support("discard_prefetch")
     location = _normalize_managed_location(
         location,
         location_type,
         "discard_prefetch",
-        allow_none=False,
-        allow_host=True,
-        allow_host_numa=True,
-        allow_host_numa_current=True,
     )
     handle_return(
         driver.cuMemDiscardAndPrefetchBatchAsync(
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 4d63bbcf88c..7bf59ae4956 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -74,6 +74,8 @@ Managed memory
    prefetch
    discard_prefetch

+.. module:: cuda.core
+   :no-index:

 CUDA compilation toolchain
 --------------------------
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index ea827818ac1..5296ea344af 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1142,6 +1142,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda):


 def _get_mem_range_attr(buffer, attribute, data_size):
+    # cuMemRangeGetAttribute returns a raw integer when data_size <= 4.
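+    # The attributes queried by these tests (READ_MOSTLY, PREFERRED_LOCATION,
+    # LAST_PREFETCH_LOCATION) are all 4-byte integer attributes, so callers
+    # pass data_size=4.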
     return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size))
@@ -1252,6 +1253,31 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda
     buffer.close()


+def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda):
+    device = Device()
+    skip_if_managed_memory_unsupported(device)
+    _skip_if_managed_discard_prefetch_unsupported(device)
+    device.set_current()
+
+    mr = create_managed_memory_resource_or_skip()
+    buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE)
+    stream = device.create_stream()
+
+    managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream)
+    stream.sync()
+
+    managed_memory.discard_prefetch(buffer, device, stream=stream)
+    stream.sync()
+
+    last_location = _get_int_mem_range_attr(
+        buffer,
+        driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+    )
+    assert last_location == device.device_id
+
+    buffer.close()
+
+
 def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda):
     device = Device()
     _skip_if_managed_discard_prefetch_unsupported(device)

From c250c92e47393fa6cb0e6611245c5a4dd0c3b6cf Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Wed, 18 Mar 2026 09:21:11 -0700
Subject: [PATCH 10/68] fix(test): reset _V2_BINDINGS cache so legacy-signature tests take the legacy path

The _V2_BINDINGS cache in _buffer.pyx persists across tests, so
monkeypatching get_binding_version alone is insufficient when earlier tests
have already populated the cache with the v2 value. Promote _V2_BINDINGS
from cdef int to a Python-level variable so tests can monkeypatch it
directly via monkeypatch.setattr, and reset it to -1 in both
legacy-signature tests.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 cuda_core/cuda/core/_memory/_buffer.pyx | 2 +-
 cuda_core/tests/test_memory.py          | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 6f5809e06cb..d109de2ac44 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -124,7 +124,7 @@ cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0

 # Lazily cached values for immutable runtime properties.
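+# NOTE: _V2_BINDINGS is a Python-level attribute (not a cdef int) so tests can
+# reset the cache, e.g. monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1).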
 cdef object _CU_DEVICE_CPU = None
 cdef dict _ADVICE_ENUM_TO_ALIAS = None
-cdef int _V2_BINDINGS = -1
+_V2_BINDINGS = -1
 cdef int _DISCARD_PREFETCH_SUPPORTED = -1
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 9cd3209d8d3..411a3c6cb5a 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1314,6 +1314,7 @@ def fake_cuMemAdvise(ptr, size, advice, location):
         return (driver.CUresult.CUDA_SUCCESS,)

     monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
+    monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1)
     monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise)

     managed_memory.advise(buffer, "set_read_mostly")
@@ -1338,6 +1339,7 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream):
         return (driver.CUresult.CUDA_SUCCESS,)

     monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION)
+    monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1)
     monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync)

     managed_memory.prefetch(buffer, device, stream=stream)

From 89329d9c6eff581445b4806fe0217e598a2313fa Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Wed, 18 Mar 2026 10:18:41 -0700
Subject: [PATCH 11/68] fix(test): require concurrent_managed_access for advise tests that hit real hardware

These three tests call cuMemAdvise on real CUDA devices and verify memory
range attributes. On devices without concurrent_managed_access (e.g.
Windows/WDDM), set_read_mostly silently no-ops and set_preferred_location
fails with CUDA_ERROR_INVALID_DEVICE. Use the stricter
_skip_if_managed_location_ops_unsupported guard, matching the pattern
already used by test_managed_memory_functions_accept_raw_pointer_ranges.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 cuda_core/tests/test_memory.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 411a3c6cb5a..56c505fbe6b 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1207,7 +1207,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda):

 def test_managed_memory_advise_supports_external_managed_allocations(init_cuda):
     device = Device()
-    _skip_if_managed_allocation_unsupported(device)
+    _skip_if_managed_location_ops_unsupported(device)
     device.set_current()

     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
@@ -1390,7 +1390,7 @@ def test_managed_memory_advise_location_validation(init_cuda):
     """Verify doc-specified location constraints for each advice kind."""
     device = Device()
-    _skip_if_managed_allocation_unsupported(device)
+    _skip_if_managed_location_ops_unsupported(device)
     device.set_current()

     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)
@@ -1422,7 +1422,7 @@ def test_managed_memory_advise_accepts_enum_value(init_cuda):
     """advise() accepts CUmem_advise enum values directly, not just string aliases."""
     device = Device()
-    _skip_if_managed_allocation_unsupported(device)
+    _skip_if_managed_location_ops_unsupported(device)
     device.set_current()

     buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE)

From 8a75d1bf1f1172e4681bb232a22f00ff9567d5d8 Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Wed, 18 Mar 2026 11:23:53 -0700
Subject: [PATCH 12/68] fix: validate managed buffer before checking discard_prefetch bindings support

Reorder checks in discard_prefetch so _normalize_managed_target_range runs
before _require_managed_discard_prefetch_support. This ensures non-managed
buffers raise ValueError before the RuntimeError for missing
cuMemDiscardAndPrefetchBatchAsync support.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 cuda_core/cuda/core/_memory/_buffer.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index d109de2ac44..ffd82facb55 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -489,13 +489,13 @@ def discard_prefetch(
         Explicit location kind. Supported values are ``"device"``, ``"host"``,
         ``"host_numa"``, and ``"host_numa_current"``.
     """
-    _require_managed_discard_prefetch_support("discard_prefetch")
-    cdef Stream s = Stream_accept(stream)
     cdef object ptr
     cdef object batch_ptr
     cdef size_t nbytes

     ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch")
+    _require_managed_discard_prefetch_support("discard_prefetch")
+    cdef Stream s = Stream_accept(stream)
     batch_ptr = driver.CUdeviceptr(int(ptr))
     location = _normalize_managed_location(
         location,

From 9e9b1e0914d30f855389a349cf8d41d134b1c4dc Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Wed, 18 Mar 2026 14:08:24 -0700
Subject: [PATCH 13/68] refactor: extract managed memory ops into dedicated _managed_memory_ops module

Move advise, prefetch, and discard_prefetch functions and their helpers out
of _buffer.pyx into a new _managed_memory_ops Cython module to improve
separation of concerns. Expose _init_mem_attrs and _query_memory_attrs as
non-inline cdef functions in _buffer.pxd so the new module can reuse them.
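
For reference, the public call pattern is unchanged by this move; a short
sketch of the intended usage (argument names mirror the tests in this
series, and the import path is assumed from cuda/core/managed_memory.py):

    from cuda.core import managed_memory

    managed_memory.advise(buffer, "set_read_mostly")
    managed_memory.prefetch(buffer, device, stream=stream)
    stream.sync()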
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 cuda_core/cuda/core/_memory/_buffer.pxd       |   8 +
 cuda_core/cuda/core/_memory/_buffer.pyx       | 449 +----------------
 .../cuda/core/_memory/_managed_memory_ops.pxd |   6 +
 .../cuda/core/_memory/_managed_memory_ops.pyx | 458 ++++++++++++++++++
 cuda_core/cuda/core/managed_memory.py         |   2 +-
 cuda_core/tests/test_memory.py                |  14 +-
 6 files changed, 483 insertions(+), 454 deletions(-)
 create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pxd
 create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pyx

diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd
index 04b5707e18e..9065da77eb8 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pxd
+++ b/cuda_core/cuda/core/_memory/_buffer.pxd
@@ -4,6 +4,7 @@

 from libc.stdint cimport uintptr_t

+from cuda.bindings cimport cydriver
 from cuda.core._resource_handles cimport DevicePtrHandle
 from cuda.core._stream cimport Stream

@@ -38,3 +39,10 @@ cdef Buffer Buffer_from_deviceptr_handle(
     MemoryResource mr,
     object ipc_descriptor = *
 )
+
+# Memory attribute query helpers (used by _managed_memory_ops)
+cdef void _init_mem_attrs(Buffer self)
+cdef int _query_memory_attrs(
+    _MemAttrs& out,
+    cydriver.CUdeviceptr ptr,
+) except -1 nogil
diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index ffd82facb55..104252a62bc 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -35,7 +35,7 @@ else:
     BufferProtocol = object

 from cuda.core._dlpack import DLDeviceType, make_py_capsule
-from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return
+from cuda.core._utils.cuda_utils import driver, handle_return

 from cuda.core._device import Device

@@ -72,449 +72,6 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting
 """
-
-
-cdef tuple _VALID_MANAGED_LOCATION_TYPES = (
-    "device",
-    "host",
-    "host_numa",
-    "host_numa_current",
-)
-
-cdef dict _MANAGED_LOCATION_TYPE_ATTRS = {
-    "device": "CU_MEM_LOCATION_TYPE_DEVICE",
-    "host": "CU_MEM_LOCATION_TYPE_HOST",
-    "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA",
-    "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT",
-}
-
-cdef dict _MANAGED_ADVICE_ALIASES = {
-    "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
-    "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
-    "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
-    "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
-    "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
-    "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
-}
-
-cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((
-    "set_read_mostly",
-    "unset_read_mostly",
-    "unset_preferred_location",
-))
-
-cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current"))
-cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa"))
-cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host"))
-
-cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = {
-    "set_read_mostly": _DEVICE_HOST_NUMA,
-    "unset_read_mostly": _DEVICE_HOST_NUMA,
-    "set_preferred_location": _ALL_LOCATION_TYPES,
-    "unset_preferred_location": _DEVICE_HOST_NUMA,
-    "set_accessed_by": _DEVICE_HOST_ONLY,
-    "unset_accessed_by": _DEVICE_HOST_ONLY,
-}
-
-cdef int _MANAGED_SIZE_NOT_PROVIDED = -1
-cdef int _HOST_NUMA_CURRENT_ID = 0
-cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0
-cdef size_t _SINGLE_RANGE_COUNT = 1
-cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1
-cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0
-
-# Lazily cached values for immutable runtime properties.
-cdef object _CU_DEVICE_CPU = None
-cdef dict _ADVICE_ENUM_TO_ALIAS = None
-_V2_BINDINGS = -1
-cdef int _DISCARD_PREFETCH_SUPPORTED = -1
-
-
-cdef inline object _managed_location_enum(str location_type):
-    cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type]
-    cdef object result = getattr(driver.CUmemLocationType, attr_name, None)
-    if result is None:
-        raise RuntimeError(
-            f"Managed-memory location type {location_type!r} is not supported by the "
-            f"installed cuda.bindings package."
-        )
-    return result
-
-
-cdef inline object _make_managed_location(str location_type, int location_id):
-    global _CU_DEVICE_CPU
-    cdef object location = driver.CUmemLocation()
-    location.type = _managed_location_enum(location_type)
-    if location_type == "host":
-        if _CU_DEVICE_CPU is None:
-            _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1))
-        location.id = _CU_DEVICE_CPU
-    elif location_type == "host_numa_current":
-        location.id = _HOST_NUMA_CURRENT_ID
-    else:
-        location.id = location_id
-    return location
-
-
-cdef inline tuple _normalize_managed_advice(object advice):
-    cdef str alias
-    cdef str attr_name
-    if isinstance(advice, str):
-        alias = advice.lower()
-        attr_name = _MANAGED_ADVICE_ALIASES.get(alias)
-        if attr_name is None:
-            raise ValueError(
-                "advice must be one of "
-                f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}"
-            )
-        return alias, getattr(driver.CUmem_advise, attr_name)
-
-    if isinstance(advice, driver.CUmem_advise):
-        global _ADVICE_ENUM_TO_ALIAS
-        if _ADVICE_ENUM_TO_ALIAS is None:
-            _ADVICE_ENUM_TO_ALIAS = {}
-            for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
-                enum_val = getattr(driver.CUmem_advise, attr_name, None)
-                if enum_val is not None:
-                    _ADVICE_ENUM_TO_ALIAS[enum_val] = alias
-        alias = _ADVICE_ENUM_TO_ALIAS.get(advice)
-        if alias is None:
-            raise ValueError(f"Unsupported advice value: {advice!r}")
-        return alias, advice
-
-    raise TypeError(
-        "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias"
-    )
-
-
-cdef inline object _normalize_managed_location(
-    object location,
-    object location_type,
-    str what,
-    bint allow_none=False,
-    frozenset allowed_loctypes=_ALL_LOCATION_TYPES,
-):
-    cdef object loc_type
-    cdef int loc_id
-
-    if isinstance(location, Device):
-        location = (location).device_id
-
-    if location_type is not None and not isinstance(location_type, str):
-        raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}")
-
-    loc_type = None if location_type is None else (location_type).lower()
-    if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES:
-        raise ValueError(
-            f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} "
-            f"or None, got {location_type!r}"
-        )
-
-    if loc_type is not None and loc_type not in allowed_loctypes:
-        raise ValueError(f"{what} does not support location_type='{loc_type}'")
-
-    if loc_type is None:
-        if location is None:
-            if allow_none:
-                return _make_managed_location("host", -1)
-            raise ValueError(f"{what} requires a location")
-        if not isinstance(location, int):
-            raise TypeError(
-                f"{what} location must be a Device, int, or None, got {type(location).__name__}"
-            )
-        loc_id = location
-        if loc_id == -1:
-            if "host" not in allowed_loctypes:
-                raise ValueError(f"{what} does not support host locations")
-            return _make_managed_location("host", -1)
-        elif loc_id >= 0:
-            return _make_managed_location("device", loc_id)
-        else:
-            raise ValueError(
-                f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}"
-            )
-    elif loc_type == "device":
-        if isinstance(location, int) and location >= 0:
-            loc_id = location
-        else:
-            raise ValueError(
-                f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}"
-            )
-        return _make_managed_location(loc_type, loc_id)
-    elif loc_type == "host":
-        if location not in (None, -1):
-            raise ValueError(
-                f"{what} location must be None or -1 when location_type is 'host', got {location!r}"
-            )
-        return _make_managed_location(loc_type, -1)
-    elif loc_type == "host_numa":
-        if not isinstance(location, int) or location < 0:
-            raise ValueError(
-                f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}"
-            )
-        return _make_managed_location(loc_type, location)
-    else:
-        if location is not None:
-            raise ValueError(
-                f"{what} location must be None when location_type is 'host_numa_current', got {location!r}"
-            )
-        return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID)
-
-
-cdef inline bint _managed_location_uses_v2_bindings():
-    # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
-    global _V2_BINDINGS
-    if _V2_BINDINGS < 0:
-        _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0
-    return _V2_BINDINGS != 0
-
-
-cdef object _LEGACY_LOC_DEVICE = None
-cdef object _LEGACY_LOC_HOST = None
-
-cdef inline int _managed_location_to_legacy_device(object location, str what):
-    global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST
-    if _LEGACY_LOC_DEVICE is None:
-        _LEGACY_LOC_DEVICE = _managed_location_enum("device")
-        _LEGACY_LOC_HOST = _managed_location_enum("host")
-    cdef object loc_type = location.type
-    if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST:
-        return location.id
-    raise RuntimeError(
-        f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}"
-    )
-
-
-cdef inline void _require_managed_buffer(Buffer self, str what):
-    _init_mem_attrs(self)
-    if not self._mem_attrs.is_managed:
-        raise ValueError(f"{what} requires a managed-memory allocation")
-
-
-cdef inline void _require_managed_discard_prefetch_support(str what):
-    global _DISCARD_PREFETCH_SUPPORTED
-    if _DISCARD_PREFETCH_SUPPORTED < 0:
-        _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0
-    if not _DISCARD_PREFETCH_SUPPORTED:
-        raise RuntimeError(
-            f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync"
-        )
-
-
-cdef inline tuple _managed_range_from_buffer(
-    Buffer buffer,
-    int size,
-    str what,
-):
-    if size != _MANAGED_SIZE_NOT_PROVIDED:
-        raise TypeError(f"{what} does not accept size= when target is a Buffer")
-    _require_managed_buffer(buffer, what)
-    return buffer.handle, buffer._size
-
-
-cdef inline uintptr_t _coerce_raw_pointer(object target, str what) except? 0:
-    cdef object ptr_obj
-    try:
-        ptr_obj = int(target)
-    except Exception as exc:
-        raise TypeError(
-            f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}"
-        ) from exc
-    if ptr_obj < 0:
-        raise ValueError(f"{what} target pointer must be >= 0, got {target!r}")
-    return ptr_obj
-
-
-cdef inline int _require_managed_pointer(uintptr_t ptr, str what) except -1:
-    cdef _MemAttrs mem_attrs
-    with nogil:
-        _query_memory_attrs(mem_attrs, ptr)
-    if not mem_attrs.is_managed:
-        raise ValueError(f"{what} requires a managed-memory allocation")
-    return 0
-
-
-cdef inline tuple _normalize_managed_target_range(
-    object target,
-    int size,
-    str what,
-):
-    cdef uintptr_t ptr
-
-    if isinstance(target, Buffer):
-        return _managed_range_from_buffer(target, size, what)
-
-    if size == _MANAGED_SIZE_NOT_PROVIDED:
-        raise TypeError(f"{what} requires size= when target is a raw pointer")
-    ptr = _coerce_raw_pointer(target, what)
-    _require_managed_pointer(ptr, what)
-    return ptr, size
-
-
-def advise(
-    target,
-    advice: driver.CUmem_advise | str,
-    location: Device | int | None = None,
-    *,
-    int size=_MANAGED_SIZE_NOT_PROVIDED,
-    location_type: str | None = None,
-):
-    """Apply managed-memory advice to an allocation range.
-
-    Parameters
-    ----------
-    target : :class:`Buffer` | int | object
-        Managed allocation to operate on. This may be a :class:`Buffer` or a
-        raw pointer (requires ``size=``).
-    advice : :obj:`~driver.CUmem_advise` | str
-        Managed-memory advice to apply. String aliases such as
-        ``"set_read_mostly"``, ``"set_preferred_location"``, and
-        ``"set_accessed_by"`` are accepted.
-    location : :obj:`~_device.Device` | int | None, optional
-        Target location. When ``location_type`` is ``None``, values are
-        interpreted as a device ordinal, ``-1`` for host, or ``None`` for
-        advice values that ignore location.
-    size : int, optional
-        Allocation size in bytes. Required when ``target`` is a raw pointer.
-    location_type : str | None, optional
-        Explicit location kind. Supported values are ``"device"``, ``"host"``,
-        ``"host_numa"``, and ``"host_numa_current"``.
-    """
-    cdef str advice_name
-    cdef object ptr
-    cdef size_t nbytes
-
-    ptr, nbytes = _normalize_managed_target_range(target, size, "advise")
-    advice_name, advice = _normalize_managed_advice(advice)
-    location = _normalize_managed_location(
-        location,
-        location_type,
-        "advise",
-        allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION,
-        allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name],
-    )
-    if _managed_location_uses_v2_bindings():
-        handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location))
-    else:
-        handle_return(
-            driver.cuMemAdvise(
-                ptr,
-                nbytes,
-                advice,
-                _managed_location_to_legacy_device(location, "advise"),
-            )
-        )
-
-
-def prefetch(
-    target,
-    location: Device | int | None = None,
-    *,
-    stream: Stream | GraphBuilder,
-    int size=_MANAGED_SIZE_NOT_PROVIDED,
-    location_type: str | None = None,
-):
-    """Prefetch a managed-memory allocation range to a target location.
-
-    Parameters
-    ----------
-    target : :class:`Buffer` | int | object
-        Managed allocation to operate on. This may be a :class:`Buffer` or a
-        raw pointer (requires ``size=``).
-    location : :obj:`~_device.Device` | int | None, optional
-        Target location. When ``location_type`` is ``None``, values are
-        interpreted as a device ordinal, ``-1`` for host, or ``None``.
-        A location is required for prefetch.
-    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
-        Keyword argument specifying the stream for the asynchronous prefetch.
-    size : int, optional
-        Allocation size in bytes. Required when ``target`` is a raw pointer.
-    location_type : str | None, optional
-        Explicit location kind. Supported values are ``"device"``, ``"host"``,
-        ``"host_numa"``, and ``"host_numa_current"``.
-    """
-    cdef Stream s = Stream_accept(stream)
-    cdef object ptr
-    cdef size_t nbytes
-
-    ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch")
-    location = _normalize_managed_location(
-        location,
-        location_type,
-        "prefetch",
-    )
-    if _managed_location_uses_v2_bindings():
-        handle_return(
-            driver.cuMemPrefetchAsync(
-                ptr,
-                nbytes,
-                location,
-                _MANAGED_OPERATION_FLAGS,
-                s.handle,
-            )
-        )
-    else:
-        handle_return(
-            driver.cuMemPrefetchAsync(
-                ptr,
-                nbytes,
-                _managed_location_to_legacy_device(location, "prefetch"),
-                s.handle,
-            )
-        )
-
-
-def discard_prefetch(
-    target,
-    location: Device | int | None = None,
-    *,
-    stream: Stream | GraphBuilder,
-    int size=_MANAGED_SIZE_NOT_PROVIDED,
-    location_type: str | None = None,
-):
-    """Discard a managed-memory allocation range and prefetch it to a target location.
-
-    Parameters
-    ----------
-    target : :class:`Buffer` | int | object
-        Managed allocation to operate on. This may be a :class:`Buffer` or a
-        raw pointer (requires ``size=``).
-    location : :obj:`~_device.Device` | int | None, optional
-        Target location. When ``location_type`` is ``None``, values are
-        interpreted as a device ordinal, ``-1`` for host, or ``None``.
-        A location is required for discard_prefetch.
-    stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
-        Keyword argument specifying the stream for the asynchronous operation.
-    size : int, optional
-        Allocation size in bytes. Required when ``target`` is a raw pointer.
-    location_type : str | None, optional
-        Explicit location kind. Supported values are ``"device"``, ``"host"``,
-        ``"host_numa"``, and ``"host_numa_current"``.
-    """
-    cdef object ptr
-    cdef object batch_ptr
-    cdef size_t nbytes
-
-    ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch")
-    _require_managed_discard_prefetch_support("discard_prefetch")
-    cdef Stream s = Stream_accept(stream)
-    batch_ptr = driver.CUdeviceptr(int(ptr))
-    location = _normalize_managed_location(
-        location,
-        location_type,
-        "discard_prefetch",
-    )
-    handle_return(
-        driver.cuMemDiscardAndPrefetchBatchAsync(
-            [batch_ptr],
-            [nbytes],
-            _SINGLE_RANGE_COUNT,
-            [location],
-            [_FIRST_PREFETCH_LOCATION_INDEX],
-            _SINGLE_PREFETCH_LOCATION_COUNT,
-            _MANAGED_OPERATION_FLAGS,
-            s.handle,
-        )
-    )
-

 cdef class Buffer:
     """Represent a handle to allocated memory.
@@ -864,14 +421,14 @@ cdef class Buffer:
 # Memory Attribute Query Helpers
 # ------------------------------

-cdef inline void _init_mem_attrs(Buffer self):
+cdef void _init_mem_attrs(Buffer self):
     """Initialize memory attributes by querying the pointer."""
     if not self._mem_attrs_inited:
         _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr))
         self._mem_attrs_inited = True


-cdef inline int _query_memory_attrs(
+cdef int _query_memory_attrs(
     _MemAttrs& out,
     cydriver.CUdeviceptr ptr
 ) except -1 nogil:
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd
new file mode 100644
index 00000000000..a7019c784d0
--- /dev/null
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Managed-memory operation helpers (advise, prefetch, discard_prefetch).
+# The public API is exposed via def functions; no cdef declarations needed.
diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
new file mode 100644
index 00000000000..649c2cbe72d
--- /dev/null
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -0,0 +1,458 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport uintptr_t
+
+from cuda.bindings cimport cydriver
+from cuda.core._memory._buffer cimport Buffer, _MemAttrs, _init_mem_attrs, _query_memory_attrs
+from cuda.core._stream cimport Stream, Stream_accept
+
+from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return
+from cuda.core._device import Device
+
+
+cdef tuple _VALID_MANAGED_LOCATION_TYPES = (
+    "device",
+    "host",
+    "host_numa",
+    "host_numa_current",
+)
+
+cdef dict _MANAGED_LOCATION_TYPE_ATTRS = {
+    "device": "CU_MEM_LOCATION_TYPE_DEVICE",
+    "host": "CU_MEM_LOCATION_TYPE_HOST",
+    "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA",
+    "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT",
+}
+
+cdef dict _MANAGED_ADVICE_ALIASES = {
+    "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY",
+    "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY",
+    "set_preferred_location": "CU_MEM_ADVISE_SET_PREFERRED_LOCATION",
+    "unset_preferred_location": "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION",
+    "set_accessed_by": "CU_MEM_ADVISE_SET_ACCESSED_BY",
+    "unset_accessed_by": "CU_MEM_ADVISE_UNSET_ACCESSED_BY",
+}
+
+cdef frozenset _MANAGED_ADVICE_IGNORE_LOCATION = frozenset((
+    "set_read_mostly",
+    "unset_read_mostly",
+    "unset_preferred_location",
+))
+
+cdef frozenset _ALL_LOCATION_TYPES = frozenset(("device", "host", "host_numa", "host_numa_current"))
+cdef frozenset _DEVICE_HOST_NUMA = frozenset(("device", "host", "host_numa"))
+cdef frozenset _DEVICE_HOST_ONLY = frozenset(("device", "host"))
+
+cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = {
+    "set_read_mostly": _DEVICE_HOST_NUMA,
+    "unset_read_mostly": _DEVICE_HOST_NUMA,
+    "set_preferred_location": _ALL_LOCATION_TYPES,
+    "unset_preferred_location": _DEVICE_HOST_NUMA,
+    "set_accessed_by": _DEVICE_HOST_ONLY,
+    "unset_accessed_by": _DEVICE_HOST_ONLY,
+}
+
+cdef int _MANAGED_SIZE_NOT_PROVIDED = -1
+cdef int _HOST_NUMA_CURRENT_ID = 0
+cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0
+cdef size_t _SINGLE_RANGE_COUNT = 1
+cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1
+cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0
+
+# Lazily cached values for immutable runtime properties.
+cdef object _CU_DEVICE_CPU = None
+cdef dict _ADVICE_ENUM_TO_ALIAS = None
+_V2_BINDINGS = -1
+cdef int _DISCARD_PREFETCH_SUPPORTED = -1
+
+
+cdef object _managed_location_enum(str location_type):
+    cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type]
+    cdef object result = getattr(driver.CUmemLocationType, attr_name, None)
+    if result is None:
+        raise RuntimeError(
+            f"Managed-memory location type {location_type!r} is not supported by the "
+            f"installed cuda.bindings package."
+        )
+    return result
+
+
+cdef object _make_managed_location(str location_type, int location_id):
+    global _CU_DEVICE_CPU
+    cdef object location = driver.CUmemLocation()
+    location.type = _managed_location_enum(location_type)
+    if location_type == "host":
+        if _CU_DEVICE_CPU is None:
+            _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1))
+        location.id = _CU_DEVICE_CPU
+    elif location_type == "host_numa_current":
+        location.id = _HOST_NUMA_CURRENT_ID
+    else:
+        location.id = location_id
+    return location
+
+
+cdef tuple _normalize_managed_advice(object advice):
+    cdef str alias
+    cdef str attr_name
+    if isinstance(advice, str):
+        alias = advice.lower()
+        attr_name = _MANAGED_ADVICE_ALIASES.get(alias)
+        if attr_name is None:
+            raise ValueError(
+                "advice must be one of "
+                f"{tuple(sorted(_MANAGED_ADVICE_ALIASES))!r}, got {advice!r}"
+            )
+        return alias, getattr(driver.CUmem_advise, attr_name)
+
+    if isinstance(advice, driver.CUmem_advise):
+        global _ADVICE_ENUM_TO_ALIAS
+        if _ADVICE_ENUM_TO_ALIAS is None:
+            _ADVICE_ENUM_TO_ALIAS = {}
+            for alias, attr_name in _MANAGED_ADVICE_ALIASES.items():
+                enum_val = getattr(driver.CUmem_advise, attr_name, None)
+                if enum_val is not None:
+                    _ADVICE_ENUM_TO_ALIAS[enum_val] = alias
+        alias = _ADVICE_ENUM_TO_ALIAS.get(advice)
+        if alias is None:
+            raise ValueError(f"Unsupported advice value: {advice!r}")
+        return alias, advice
+
+    raise TypeError(
+        "advice must be a cuda.bindings.driver.CUmem_advise value or a supported string alias"
+    )
+
+
+cdef object _normalize_managed_location(
+    object location,
+    object location_type,
+    str what,
+    bint allow_none=False,
+    frozenset allowed_loctypes=_ALL_LOCATION_TYPES,
+):
+    cdef object loc_type
+    cdef int loc_id
+
+    if isinstance(location, Device):
+        location = location.device_id
+
+    if location_type is not None and not isinstance(location_type, str):
+        raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}")
+
+    loc_type = None if location_type is None else (location_type).lower()
+    if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES:
+        raise ValueError(
+            f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} "
+            f"or None, got {location_type!r}"
+        )
+
+    if loc_type is not None and loc_type not in allowed_loctypes:
+        raise ValueError(f"{what} does not support location_type='{loc_type}'")
+
+    if loc_type is None:
+        if location is None:
+            if allow_none:
+                return _make_managed_location("host", -1)
+            raise ValueError(f"{what} requires a location")
+        if not isinstance(location, int):
+            raise TypeError(
+                f"{what} location must be a Device, int, or None, got {type(location).__name__}"
+            )
+        loc_id = location
+        if loc_id == -1:
+            if "host" not in allowed_loctypes:
+                raise ValueError(f"{what} does not support host locations")
+            return _make_managed_location("host", -1)
+        elif loc_id >= 0:
+            return _make_managed_location("device", loc_id)
+        else:
+            raise ValueError(
+                f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}"
+            )
+    elif loc_type == "device":
+        if isinstance(location, int) and location >= 0:
+            loc_id = location
+        else:
+            raise ValueError(
+                f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}"
+            )
+        return _make_managed_location(loc_type, loc_id)
+    elif loc_type == "host":
+        if location not in (None, -1):
+            raise ValueError(
+                f"{what} location must be None or -1 when location_type is 'host', got {location!r}"
+            )
+        return _make_managed_location(loc_type, -1)
+    elif loc_type == "host_numa":
+        if not isinstance(location, int) or location < 0:
+            raise ValueError(
+                f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}"
+            )
+        return _make_managed_location(loc_type, location)
+    else:
+        if location is not None:
+            raise ValueError(
+                f"{what} location must be None when location_type is 'host_numa_current', got {location!r}"
+            )
+        return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID)
+
+
+cdef bint _managed_location_uses_v2_bindings():
+    # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers.
+    global _V2_BINDINGS
+    if _V2_BINDINGS < 0:
+        _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0
+    return _V2_BINDINGS != 0
+
+
+cdef object _LEGACY_LOC_DEVICE = None
+cdef object _LEGACY_LOC_HOST = None
+
+cdef int _managed_location_to_legacy_device(object location, str what):
+    global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST
+    if _LEGACY_LOC_DEVICE is None:
+        _LEGACY_LOC_DEVICE = _managed_location_enum("device")
+        _LEGACY_LOC_HOST = _managed_location_enum("host")
+    cdef object loc_type = location.type
+    if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST:
+        return location.id
+    raise RuntimeError(
+        f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}"
+    )
+
+
+cdef void _require_managed_buffer(Buffer self, str what):
+    _init_mem_attrs(self)
+    if not self._mem_attrs.is_managed:
+        raise ValueError(f"{what} requires a managed-memory allocation")
+
+
+cdef void _require_managed_discard_prefetch_support(str what):
+    global _DISCARD_PREFETCH_SUPPORTED
+    if _DISCARD_PREFETCH_SUPPORTED < 0:
+        _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0
+    if not _DISCARD_PREFETCH_SUPPORTED:
+        raise RuntimeError(
+            f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync"
+        )
+
+
+cdef tuple _managed_range_from_buffer(
+    Buffer buffer,
+    int size,
+    str what,
+):
+    if size != _MANAGED_SIZE_NOT_PROVIDED:
+        raise TypeError(f"{what} does not accept size= when target is a Buffer")
+    _require_managed_buffer(buffer, what)
+    return buffer.handle, buffer._size
+
+
+cdef uintptr_t _coerce_raw_pointer(object target, str what) except?
0: + cdef object ptr_obj + try: + ptr_obj = int(target) + except Exception as exc: + raise TypeError( + f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" + ) from exc + if ptr_obj < 0: + raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") + return ptr_obj + + +cdef int _require_managed_pointer(uintptr_t ptr, str what) except -1: + cdef _MemAttrs mem_attrs + with nogil: + _query_memory_attrs(mem_attrs, ptr) + if not mem_attrs.is_managed: + raise ValueError(f"{what} requires a managed-memory allocation") + return 0 + + +cdef tuple _normalize_managed_target_range( + object target, + int size, + str what, +): + cdef uintptr_t ptr + + if isinstance(target, Buffer): + return _managed_range_from_buffer(target, size, what) + + if size == _MANAGED_SIZE_NOT_PROVIDED: + raise TypeError(f"{what} requires size= when target is a raw pointer") + ptr = _coerce_raw_pointer(target, what) + _require_managed_pointer(ptr, what) + return ptr, size + + +def advise( + target, + advice: driver.CUmem_advise | str, + location: Device | int | None = None, + *, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Apply managed-memory advice to an allocation range. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + advice : :obj:`~driver.CUmem_advise` | str + Managed-memory advice to apply. String aliases such as + ``"set_read_mostly"``, ``"set_preferred_location"``, and + ``"set_accessed_by"`` are accepted. + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None`` for + advice values that ignore location. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef str advice_name + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "advise") + advice_name, advice = _normalize_managed_advice(advice) + location = _normalize_managed_location( + location, + location_type, + "advise", + allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, + allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], + ) + if _managed_location_uses_v2_bindings(): + handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) + else: + handle_return( + driver.cuMemAdvise( + ptr, + nbytes, + advice, + _managed_location_to_legacy_device(location, "advise"), + ) + ) + + +def prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Prefetch a managed-memory allocation range to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for prefetch. 
+ stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous prefetch. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. + """ + cdef Stream s = Stream_accept(stream) + cdef object ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") + location = _normalize_managed_location( + location, + location_type, + "prefetch", + ) + if _managed_location_uses_v2_bindings(): + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + location, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) + else: + handle_return( + driver.cuMemPrefetchAsync( + ptr, + nbytes, + _managed_location_to_legacy_device(location, "prefetch"), + s.handle, + ) + ) + + +def discard_prefetch( + target, + location: Device | int | None = None, + *, + stream: Stream | GraphBuilder, + int size=_MANAGED_SIZE_NOT_PROVIDED, + location_type: str | None = None, +): + """Discard a managed-memory allocation range and prefetch it to a target location. + + Parameters + ---------- + target : :class:`Buffer` | int | object + Managed allocation to operate on. This may be a :class:`Buffer` or a + raw pointer (requires ``size=``). + location : :obj:`~_device.Device` | int | None, optional + Target location. When ``location_type`` is ``None``, values are + interpreted as a device ordinal, ``-1`` for host, or ``None``. + A location is required for discard_prefetch. + stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` + Keyword argument specifying the stream for the asynchronous operation. + size : int, optional + Allocation size in bytes. Required when ``target`` is a raw pointer. + location_type : str | None, optional + Explicit location kind. Supported values are ``"device"``, ``"host"``, + ``"host_numa"``, and ``"host_numa_current"``. 
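+
+ A minimal usage sketch (``dev``, ``stream``, and ``buf`` here are an
+ illustrative ``Device``, ``Stream``, and managed :class:`Buffer`, not
+ part of this API)::
+
+     discard_prefetch(buf, dev, stream=stream)
+     discard_prefetch(buf.handle, dev, size=buf.size, stream=stream)
+     stream.sync()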
+ """ + cdef object ptr + cdef object batch_ptr + cdef size_t nbytes + + ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + _require_managed_discard_prefetch_support("discard_prefetch") + cdef Stream s = Stream_accept(stream) + batch_ptr = driver.CUdeviceptr(int(ptr)) + location = _normalize_managed_location( + location, + location_type, + "discard_prefetch", + ) + handle_return( + driver.cuMemDiscardAndPrefetchBatchAsync( + [batch_ptr], + [nbytes], + _SINGLE_RANGE_COUNT, + [location], + [_FIRST_PREFETCH_LOCATION_INDEX], + _SINGLE_PREFETCH_LOCATION_COUNT, + _MANAGED_OPERATION_FLAGS, + s.handle, + ) + ) diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index f5bb09c13d7..005c9ec3cf0 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -4,6 +4,6 @@ """Managed-memory range operations.""" -from cuda.core._memory._buffer import advise, discard_prefetch, prefetch +from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch __all__ = ["advise", "discard_prefetch", "prefetch"] diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 56c505fbe6b..544b7afc032 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -44,7 +44,7 @@ system as ccx_system, ) from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor, _buffer +from cuda.core._memory import IPCBufferDescriptor, _managed_memory_ops from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView @@ -1313,9 +1313,9 @@ def fake_cuMemAdvise(ptr, size, advice, location): calls.append((ptr, size, advice, location)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) - monkeypatch.setattr(_buffer.driver, "cuMemAdvise", fake_cuMemAdvise) + monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) + monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise) managed_memory.advise(buffer, "set_read_mostly") @@ -1338,9 +1338,9 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): calls.append((ptr, size, location, hstream)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_buffer, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_buffer, "_V2_BINDINGS", -1) - monkeypatch.setattr(_buffer.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) + monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) + monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) managed_memory.prefetch(buffer, device, stream=stream) From 90f07117615a25b45baf9722c3c1f0835c85d1c5 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 18 Mar 2026 14:16:38 -0700 Subject: [PATCH 14/68] pre-commit fix --- cuda_core/cuda/core/_memory/_buffer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 104252a62bc..e47f3f4926e 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -35,7 +35,7 @@ else: BufferProtocol = object from 
cuda.core._dlpack import DLDeviceType, make_py_capsule -from cuda.core._utils.cuda_utils import driver, handle_return +from cuda.core._utils.cuda_utils import driver from cuda.core._device import Device From b4d252cdb5a8899d775db185d0cc9ec92c9cd474 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 19 Mar 2026 11:07:46 -0700 Subject: [PATCH 15/68] Removing blank file --- cuda_core/cuda/core/_memory/_managed_memory_ops.pxd | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 cuda_core/cuda/core/_memory/_managed_memory_ops.pxd diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd b/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd deleted file mode 100644 index a7019c784d0..00000000000 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pxd +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -# Managed-memory operation helpers (advise, prefetch, discard_prefetch). -# The public API is exposed via def functions; no cdef declarations needed. From faaa1d881363eb4ea5d3d13cf0a21b433cdcd61f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 19 Mar 2026 13:15:08 -0700 Subject: [PATCH 16/68] wip --- .../cuda/core/_memory/_managed_memory_ops.pyx | 117 +++++------------- cuda_core/tests/test_memory.py | 42 ------- 2 files changed, 29 insertions(+), 130 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 649c2cbe72d..04dc33ed755 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -4,10 +4,7 @@ from __future__ import annotations -from libc.stdint cimport uintptr_t - -from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer, _MemAttrs, _init_mem_attrs, _query_memory_attrs +from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return @@ -56,7 +53,6 @@ cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { "unset_accessed_by": _DEVICE_HOST_ONLY, } -cdef int _MANAGED_SIZE_NOT_PROVIDED = -1 cdef int _HOST_NUMA_CURRENT_ID = 0 cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 cdef size_t _SINGLE_RANGE_COUNT = 1 @@ -241,71 +237,19 @@ cdef void _require_managed_discard_prefetch_support(str what): ) -cdef tuple _managed_range_from_buffer( - Buffer buffer, - int size, - str what, -): - if size != _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} does not accept size= when target is a Buffer") - _require_managed_buffer(buffer, what) - return buffer.handle, buffer._size - - -cdef uintptr_t _coerce_raw_pointer(object target, str what) except? 
0: - cdef object ptr_obj - try: - ptr_obj = int(target) - except Exception as exc: - raise TypeError( - f"{what} target must be a Buffer or a raw pointer, got {type(target).__name__}" - ) from exc - if ptr_obj < 0: - raise ValueError(f"{what} target pointer must be >= 0, got {target!r}") - return ptr_obj - - -cdef int _require_managed_pointer(uintptr_t ptr, str what) except -1: - cdef _MemAttrs mem_attrs - with nogil: - _query_memory_attrs(mem_attrs, ptr) - if not mem_attrs.is_managed: - raise ValueError(f"{what} requires a managed-memory allocation") - return 0 - - -cdef tuple _normalize_managed_target_range( - object target, - int size, - str what, -): - cdef uintptr_t ptr - - if isinstance(target, Buffer): - return _managed_range_from_buffer(target, size, what) - - if size == _MANAGED_SIZE_NOT_PROVIDED: - raise TypeError(f"{what} requires size= when target is a raw pointer") - ptr = _coerce_raw_pointer(target, what) - _require_managed_pointer(ptr, what) - return ptr, size - - def advise( - target, + target: Buffer, advice: driver.CUmem_advise | str, location: Device | int | None = None, *, - int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): """Apply managed-memory advice to an allocation range. Parameters ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). + target : :class:`Buffer` + Managed allocation to operate on. advice : :obj:`~driver.CUmem_advise` | str Managed-memory advice to apply. String aliases such as ``"set_read_mostly"``, ``"set_preferred_location"``, and @@ -314,17 +258,18 @@ def advise( Target location. When ``location_type`` is ``None``, values are interpreted as a device ordinal, ``-1`` for host, or ``None`` for advice values that ignore location. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. location_type : str | None, optional Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ + if not isinstance(target, Buffer): + raise TypeError(f"advise target must be a Buffer, got {type(target).__name__}") + cdef Buffer buf = target + _require_managed_buffer(buf, "advise") cdef str advice_name - cdef object ptr - cdef size_t nbytes + cdef object ptr = buf.handle + cdef size_t nbytes = buf._size - ptr, nbytes = _normalize_managed_target_range(target, size, "advise") advice_name, advice = _normalize_managed_advice(advice) location = _normalize_managed_location( location, @@ -347,37 +292,36 @@ def advise( def prefetch( - target, + target: Buffer, location: Device | int | None = None, *, stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): """Prefetch a managed-memory allocation range to a target location. Parameters ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). + target : :class:`Buffer` + Managed allocation to operate on. location : :obj:`~_device.Device` | int | None, optional Target location. When ``location_type`` is ``None``, values are interpreted as a device ordinal, ``-1`` for host, or ``None``. A location is required for prefetch. stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` Keyword argument specifying the stream for the asynchronous prefetch. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. 
location_type : str | None, optional Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. """ + if not isinstance(target, Buffer): + raise TypeError(f"prefetch target must be a Buffer, got {type(target).__name__}") + cdef Buffer buf = target + _require_managed_buffer(buf, "prefetch") cdef Stream s = Stream_accept(stream) - cdef object ptr - cdef size_t nbytes + cdef object ptr = buf.handle + cdef size_t nbytes = buf._size - ptr, nbytes = _normalize_managed_target_range(target, size, "prefetch") location = _normalize_managed_location( location, location_type, @@ -405,40 +349,37 @@ def prefetch( def discard_prefetch( - target, + target: Buffer, location: Device | int | None = None, *, stream: Stream | GraphBuilder, - int size=_MANAGED_SIZE_NOT_PROVIDED, location_type: str | None = None, ): """Discard a managed-memory allocation range and prefetch it to a target location. Parameters ---------- - target : :class:`Buffer` | int | object - Managed allocation to operate on. This may be a :class:`Buffer` or a - raw pointer (requires ``size=``). + target : :class:`Buffer` + Managed allocation to operate on. location : :obj:`~_device.Device` | int | None, optional Target location. When ``location_type`` is ``None``, values are interpreted as a device ordinal, ``-1`` for host, or ``None``. A location is required for discard_prefetch. stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` Keyword argument specifying the stream for the asynchronous operation. - size : int, optional - Allocation size in bytes. Required when ``target`` is a raw pointer. location_type : str | None, optional Explicit location kind. Supported values are ``"device"``, ``"host"``, ``"host_numa"``, and ``"host_numa_current"``. 
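+
+ A minimal usage sketch (``dev`` and ``stream`` here are an illustrative
+ ``Device`` and ``Stream``; ``buf`` is a managed :class:`Buffer`)::
+
+     discard_prefetch(buf, dev, stream=stream)
+     stream.sync()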
""" - cdef object ptr - cdef object batch_ptr - cdef size_t nbytes - - ptr, nbytes = _normalize_managed_target_range(target, size, "discard_prefetch") + if not isinstance(target, Buffer): + raise TypeError(f"discard_prefetch target must be a Buffer, got {type(target).__name__}") + cdef Buffer buf = target + _require_managed_buffer(buf, "discard_prefetch") _require_managed_discard_prefetch_support("discard_prefetch") cdef Stream s = Stream_accept(stream) - batch_ptr = driver.CUdeviceptr(int(ptr)) + cdef object ptr = buf.handle + cdef size_t nbytes = buf._size + cdef object batch_ptr = driver.CUdeviceptr(int(ptr)) location = _normalize_managed_location( location, location_type, diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 544b7afc032..dbb5ac6d8c5 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1441,20 +1441,6 @@ def test_managed_memory_advise_accepts_enum_value(init_cuda): buffer.close() -def test_managed_memory_advise_size_rejected_for_buffer(init_cuda): - """advise() raises TypeError when size= is given with a Buffer target.""" - device = Device() - _skip_if_managed_allocation_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - - with pytest.raises(TypeError, match="does not accept size="): - managed_memory.advise(buffer, "set_read_mostly", size=1024) - - buffer.close() - - def test_managed_memory_advise_invalid_advice_values(init_cuda): """advise() rejects invalid advice strings and wrong types.""" device = Device() @@ -1472,34 +1458,6 @@ def test_managed_memory_advise_invalid_advice_values(init_cuda): buffer.close() -def test_managed_memory_functions_accept_raw_pointer_ranges(init_cuda): - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - managed_memory.advise(buffer.handle, "set_read_mostly", size=buffer.size) - assert ( - _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - ) - == _READ_MOSTLY_ENABLED - ) - - managed_memory.prefetch(buffer.handle, device, size=buffer.size, stream=stream) - stream.sync() - last_location = _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last_location == device.device_id - - buffer.close() - - def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch From cf2f20d1be323b8cd31f76125dffad959cf0b947 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 16:46:30 -0700 Subject: [PATCH 17/68] fix(cuda.core): update binding_version import after upstream merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream renamed get_binding_version → binding_version and moved it from cuda.core._utils.cuda_utils to cuda.core._utils.version. Update the managed-memory ops module to match. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 04dc33ed755..81ff5582a62 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -7,7 +7,8 @@ from __future__ import annotations from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs from cuda.core._stream cimport Stream, Stream_accept -from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.core._utils.cuda_utils import driver, handle_return +from cuda.core._utils.version import binding_version from cuda.core._device import Device @@ -201,7 +202,7 @@ cdef bint _managed_location_uses_v2_bindings(): # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. global _V2_BINDINGS if _V2_BINDINGS < 0: - _V2_BINDINGS = 1 if get_binding_version() >= (13, 0) else 0 + _V2_BINDINGS = 1 if binding_version() >= (13, 0) else 0 return _V2_BINDINGS != 0 From db3bac2e042ff07b6ab37f510f2fe06bc1cbc598 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 16:46:36 -0700 Subject: [PATCH 18/68] revert: drop managed_memory shim in cuda.core.experimental The cuda.core.experimental namespace is being deprecated and should not gain new submodules. Per review feedback, the managed_memory module should only be reachable via cuda.core.managed_memory, not via the experimental compatibility shim. Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/experimental/__init__.py | 3 +-- cuda_core/tests/test_experimental_backward_compat.py | 7 ------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 34b442173b2..f65e7852a9a 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -38,10 +38,9 @@ def _warn_deprecated(): _warn_deprecated() -from cuda.core import managed_memory, system, utils +from cuda.core import system, utils # Make utils accessible as a submodule for backward compatibility -__import__("sys").modules[__spec__.name + ".managed_memory"] = managed_memory __import__("sys").modules[__spec__.name + ".utils"] = utils diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py index 82e2cdd5bea..c3215b056ac 100644 --- a/cuda_core/tests/test_experimental_backward_compat.py +++ b/cuda_core/tests/test_experimental_backward_compat.py @@ -38,7 +38,6 @@ def test_experimental_backward_compatibility(): assert hasattr(cuda.core.experimental, "Device") assert hasattr(cuda.core.experimental, "Stream") assert hasattr(cuda.core.experimental, "Buffer") - assert hasattr(cuda.core.experimental, "managed_memory") assert hasattr(cuda.core.experimental, "system") # Test 2: Direct imports - should emit deprecation warning @@ -74,7 +73,6 @@ def test_experimental_backward_compatibility(): assert cuda.core.experimental.Linker is cuda.core.Linker # Compare singletons - assert cuda.core.experimental.managed_memory is cuda.core.managed_memory assert cuda.core.experimental.system is cuda.core.system # Test 4: Utils module works @@ -90,11 +88,6 @@ def test_experimental_backward_compatibility(): assert StridedMemoryView is not None assert args_viewable_as_strided_memory is not None - from 
cuda.core.experimental.managed_memory import advise, discard_prefetch, prefetch - - assert advise is not None - assert prefetch is not None - assert discard_prefetch is not None # Test 5: Options classes are accessible assert hasattr(cuda.core.experimental, "EventOptions") From 20d036ebe1ae148222b4ad9e0fdca20502ed24de Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 16:59:58 -0700 Subject: [PATCH 19/68] feat(cuda.core): add Location dataclass for managed memory Frozen dataclass with classmethod constructors for the four CUmemLocationType kinds (device, host, host_numa, host_numa_current). Validates id constraints in __post_init__. Re-exported from cuda.core.managed_memory. This will replace the location=/location_type= kwargs in the upcoming unified 1..N managed-memory ops API. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_location.py | 51 +++++++++++++++++++ cuda_core/cuda/core/managed_memory.py | 3 +- cuda_core/tests/test_memory.py | 43 ++++++++++++++++ 3 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 cuda_core/cuda/core/_memory/_managed_location.py diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py new file mode 100644 index 00000000000..7e2515f573e --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal + +_VALID_KINDS = ("device", "host", "host_numa", "host_numa_current") +LocationKind = Literal["device", "host", "host_numa", "host_numa_current"] + + +@dataclass(frozen=True) +class Location: + """Typed managed-memory location. + + Use the classmethod constructors (``device``, ``host``, ``host_numa``, + ``host_numa_current``) rather than constructing directly. 
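+
+ A minimal usage sketch::
+
+     Location.device(0)            # GPU ordinal 0
+     Location.host()               # CPU; id stays None
+     Location.host_numa(1)         # host NUMA node 1
+     Location.host_numa_current()  # current thread's NUMA node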
+ """ + + kind: LocationKind + id: int | None = None + + def __post_init__(self) -> None: + if self.kind not in _VALID_KINDS: + raise ValueError(f"kind must be one of {_VALID_KINDS!r}, got {self.kind!r}") + if self.kind == "device": + if not isinstance(self.id, int) or self.id < 0: + raise ValueError("device id must be >= 0") + elif self.kind == "host_numa": + if not isinstance(self.id, int) or self.id < 0: + raise ValueError("host_numa id must be >= 0") + elif self.kind in ("host", "host_numa_current"): + if self.id is not None: + raise ValueError(f"{self.kind} location must have id=None") + + @classmethod + def device(cls, device_id: int) -> "Location": + return cls(kind="device", id=device_id) + + @classmethod + def host(cls) -> "Location": + return cls(kind="host", id=None) + + @classmethod + def host_numa(cls, numa_id: int) -> "Location": + return cls(kind="host_numa", id=numa_id) + + @classmethod + def host_numa_current(cls) -> "Location": + return cls(kind="host_numa_current", id=None) diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index 005c9ec3cf0..25191fe0381 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -4,6 +4,7 @@ """Managed-memory range operations.""" +from cuda.core._memory._managed_location import Location from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch -__all__ = ["advise", "discard_prefetch", "prefetch"] +__all__ = ["Location", "advise", "discard_prefetch", "prefetch"] diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 7ff15047e8b..8b3db88b8d8 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1918,3 +1918,46 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): assert buffer.handle >= 0 assert buffer.size == 0 assert buffer.device_id == mr.device_id + + +class TestLocation: + def test_device_constructor(self): + from cuda.core.managed_memory import Location + loc = Location.device(0) + assert loc.kind == "device" + assert loc.id == 0 + + def test_host_constructor(self): + from cuda.core.managed_memory import Location + loc = Location.host() + assert loc.kind == "host" + assert loc.id is None + + def test_host_numa_constructor(self): + from cuda.core.managed_memory import Location + loc = Location.host_numa(3) + assert loc.kind == "host_numa" + assert loc.id == 3 + + def test_host_numa_current_constructor(self): + from cuda.core.managed_memory import Location + loc = Location.host_numa_current() + assert loc.kind == "host_numa_current" + assert loc.id is None + + def test_frozen(self): + import dataclasses + from cuda.core.managed_memory import Location + loc = Location.device(0) + with pytest.raises(dataclasses.FrozenInstanceError): + loc.id = 1 + + def test_invalid_device_id(self): + from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="device id must be >= 0"): + Location.device(-1) + + def test_invalid_kind(self): + from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="kind must be one of"): + Location(kind="not_a_kind", id=None) From c2dae533f073fab65d81f6524be78d9c2e129d1e Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:02:51 -0700 Subject: [PATCH 20/68] feat(cuda.core): add _coerce_location helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Centralizes back-compat coercion for managed-memory Location inputs: - 
Location → passthrough - Device → Location.device(device_id) - int >= 0 → Location.device(int) - int == -1 → Location.host() - None → None when allow_none=True, else ValueError Will be used by the unified 1..N managed-memory ops API. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_location.py | 29 ++++++++++++ cuda_core/tests/test_memory.py | 44 +++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index 7e2515f573e..e081a8da32b 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -49,3 +49,32 @@ def host_numa(cls, numa_id: int) -> "Location": @classmethod def host_numa_current(cls) -> "Location": return cls(kind="host_numa_current", id=None) + + +def _coerce_location(value, *, allow_none: bool = False) -> Location | None: + """Coerce user input to a Location instance. + + Accepts: Location (passthrough), Device (uses device_id), int (>=0 → device, + -1 → host), None (only if allow_none=True). + """ + from cuda.core._device import Device # avoid import cycle at module load + + if isinstance(value, Location): + return value + if isinstance(value, Device): + return Location.device(value.device_id) + if value is None: + if allow_none: + return None + raise ValueError("location is required") + if isinstance(value, int): + if value == -1: + return Location.host() + if value >= 0: + return Location.device(value) + raise ValueError( + f"device ordinal must be >= 0 (or -1 for host), got {value}" + ) + raise TypeError( + f"location must be a Location, Device, int, or None; got {type(value).__name__}" + ) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8b3db88b8d8..bccc0fa67be 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1961,3 +1961,47 @@ def test_invalid_kind(self): from cuda.core.managed_memory import Location with pytest.raises(ValueError, match="kind must be one of"): Location(kind="not_a_kind", id=None) + + +class TestLocationCoerce: + def test_passthrough(self): + from cuda.core._memory._managed_location import _coerce_location + from cuda.core.managed_memory import Location + loc = Location.device(0) + assert _coerce_location(loc) is loc + + def test_int_device(self): + from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(0).kind == "device" + assert _coerce_location(0).id == 0 + + def test_int_minus_one_is_host(self): + from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(-1).kind == "host" + + def test_device_object(self, init_cuda): + from cuda.core import Device + from cuda.core._memory._managed_location import _coerce_location + dev = Device() + loc = _coerce_location(dev) + assert loc.kind == "device" + assert loc.id == dev.device_id + + def test_none_when_disallowed(self): + from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(ValueError, match="location is required"): + _coerce_location(None, allow_none=False) + + def test_none_when_allowed(self): + from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(None, allow_none=True) is None + + def test_bad_int(self): + from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(ValueError, match="device ordinal"): + _coerce_location(-2) + + def test_bad_type(self): + from 
cuda.core._memory._managed_location import _coerce_location + with pytest.raises(TypeError, match="Location, Device, int, or None"): + _coerce_location("device") From 935c8ba7b34a8c7e3afc391318d480baee23a551 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:18:37 -0700 Subject: [PATCH 21/68] test(cuda.core): update monkeypatch target after binding_version rename The legacy-bindings monkeypatch tests still referenced get_binding_version, which was renamed to binding_version in cf2f20d1be. Update both occurrences. Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/tests/test_memory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index bccc0fa67be..2304c370fd3 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1371,7 +1371,7 @@ def fake_cuMemAdvise(ptr, size, advice, location): calls.append((ptr, size, advice, location)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise) @@ -1396,7 +1396,7 @@ def fake_cuMemPrefetchAsync(ptr, size, location, hstream): calls.append((ptr, size, location, hstream)) return (driver.CUresult.CUDA_SUCCESS,) - monkeypatch.setattr(_managed_memory_ops, "get_binding_version", lambda: _LEGACY_BINDINGS_VERSION) + monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION) monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) From dc4653513bc04d1ce1fe1214630fdf628f13ef8a Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:19:59 -0700 Subject: [PATCH 22/68] refactor(cuda.core): tighten memory-attr query Address review feedback on _buffer.pyx: - Restore `inline` on `_init_mem_attrs` and `_query_memory_attrs`. - Set `out.is_managed = (is_managed != 0)` once outside the if/elif, rather than per-branch (driver leaves the attribute zero for non-managed pointers, so all three branches converged on the same value anyway). - Add a TODO noting that HMM/ATS-enabled sysmem should also report `is_managed=True`; the CU_POINTER_ATTRIBUTE_IS_MANAGED query does not capture that yet. The Cython modernization of _managed_memory_ops.pyx (cimport cydriver, IF/ELSE for the 12/13 ABI split) is folded into Tasks 5-8 where the public API is being rewritten anyway; doing it here would mean rewriting the same call sites twice. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 6c7f8ffd141..4ca8650e8db 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -422,14 +422,14 @@ cdef class Buffer: # Memory Attribute Query Helpers # ------------------------------ -cdef void _init_mem_attrs(Buffer self): +cdef inline void _init_mem_attrs(Buffer self): """Initialize memory attributes by querying the pointer.""" if not self._mem_attrs_inited: _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr)) self._mem_attrs_inited = True -cdef int _query_memory_attrs( +cdef inline int _query_memory_attrs( _MemAttrs& out, cydriver.CUdeviceptr ptr ) except -1 nogil: @@ -456,12 +456,15 @@ cdef int _query_memory_attrs( ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) HANDLE_RETURN(ret) + # TODO: HMM/ATS-enabled sysmem should also report is_managed=True; the + # CU_POINTER_ATTRIBUTE_IS_MANAGED query does not capture that yet. + out.is_managed = is_managed != 0 + if memory_type == 0: # unregistered host pointer out.is_host_accessible = True out.is_device_accessible = False out.device_id = -1 - out.is_managed = False elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST @@ -470,12 +473,10 @@ cdef int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id - out.is_managed = is_managed != 0 elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: out.is_host_accessible = False out.is_device_accessible = True out.device_id = device_id - out.is_managed = False else: with cython.gil: raise ValueError(f"Unsupported memory type: {memory_type}") From 818f5d25d8416245b5f781d3d06b5c751337eaa6 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:53:12 -0700 Subject: [PATCH 23/68] feat(cuda.core): unified 1..N managed_memory.prefetch with cydriver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite prefetch() with the unified single-or-batched signature targeted by issue #1333: - prefetch(targets, location, *, options=None, stream) - targets accepts a single Buffer or a sequence of Buffers - location accepts a Location dataclass, Device, int (-1 = host), or a sequence broadcasting to per-buffer locations - length mismatch raises ValueError; empty targets raises ValueError - options is reserved for future per-call flags and must be None - stream moved to the end, kept keyword-only Internals: switch from Python-level driver.cuMemPrefetchAsync to Cython-level cydriver.cuMemPrefetchAsync via cimport cydriver, with HANDLE_RETURN. Replace the runtime _V2_BINDINGS check with compile-time IF CUDA_CORE_BUILD_MAJOR >= 13 / ELSE per the codebase precedent in _managed_memory_resource.pyx, _memory_pool.pyx, _tensor_map.pyx. N>1 dispatches to cydriver.cuMemPrefetchBatchAsync (CUDA 13 only); on CUDA 12 builds, batched prefetch raises NotImplementedError. Single-range prefetch continues to work on both CUDA 12 and 13 builds. The location_type= keyword is removed; callers express location kind via the Location dataclass added in 20d036ebe1. The advise() and discard_prefetch() functions still use the legacy _normalize_managed_location helper and Python-level driver calls; they will be migrated in their own tasks. 
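For illustration, the new call shape looks like this (buffer and stream
names are hypothetical):

    from cuda.core.managed_memory import Location, prefetch

    prefetch(buf, Location.host(), stream=stream)          # single range
    prefetch([a, b], Location.device(0), stream=stream)    # batched, broadcast
    prefetch([a, b], [Location.host(), Location.device(0)],
             stream=stream)                                # per-buffer locations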
Also drops test_managed_memory_prefetch_uses_legacy_bindings_signature, which monkeypatched the Python-level driver.cuMemPrefetchAsync — no longer applicable since the prefetch path uses cydriver. The corresponding advise legacy-bindings test stays for now (advise still uses Python driver). Closes Andy-Jost's review comment that the existing API is "non-Pythonic" by making it Pythonic in a different direction (typed Location dataclass) while preserving the free-function shape pending Leo's tie-break on ManagedBuffer subclass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 210 ++++++++++++++---- cuda_core/tests/test_memory.py | 147 +++++++++--- 2 files changed, 284 insertions(+), 73 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 81ff5582a62..b608b532ab2 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -4,12 +4,19 @@ from __future__ import annotations +from cpython.mem cimport PyMem_Free, PyMem_Malloc +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs +from cuda.core._resource_handles cimport as_cu from cuda.core._stream cimport Stream, Stream_accept +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core._utils.version import binding_version from cuda.core._device import Device +from cuda.core._memory._managed_location import Location, _coerce_location cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( @@ -228,6 +235,74 @@ cdef void _require_managed_buffer(Buffer self, str what): raise ValueError(f"{what} requires a managed-memory allocation") +# Coerce ``targets`` (single Buffer or sequence) to a tuple[Buffer, ...]. +cdef tuple _coerce_buffer_targets(object targets, str what): + cdef list out + if isinstance(targets, Buffer): + return (targets,) + if isinstance(targets, (list, tuple)): + if not targets: + raise ValueError(f"{what}: empty targets sequence") + out = [] + for t in targets: + if not isinstance(t, Buffer): + raise TypeError( + f"{what}: each target must be a Buffer, got {type(t).__name__}" + ) + out.append(t) + return tuple(out) + raise TypeError( + f"{what}: targets must be a Buffer or sequence of Buffer, " + f"got {type(targets).__name__}" + ) + + +# Broadcast a single location across ``n`` targets, or coerce a length-N +# sequence elementwise. +cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what): + cdef object coerced + if isinstance(location, (list, tuple)): + if len(location) != n: + raise ValueError( + f"{what}: location length {len(location)} does not match " + f"targets length {n}" + ) + return tuple(_coerce_location(loc, allow_none=allow_none) for loc in location) + coerced = _coerce_location(location, allow_none=allow_none) + return tuple([coerced] * n) + + +IF CUDA_CORE_BUILD_MAJOR >= 13: + # Convert a Location dataclass to a cydriver.CUmemLocation struct. 
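+ # The struct's ``id`` carries a device ordinal or host NUMA node ID; for
+ # the "host" and "host_numa_current" kinds no caller-supplied ID applies,
+ # so 0 is passed through to the driver.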
+ cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc): + cdef cydriver.CUmemLocation out + cdef str kind = loc.kind + if kind == "device": + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + out.id = loc.id + elif kind == "host": + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + out.id = 0 + elif kind == "host_numa": + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA + out.id = loc.id + else: # host_numa_current + out.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT + out.id = 0 + return out +ELSE: + # CUDA 12 cuMemPrefetchAsync takes a device ordinal (-1 = host). + cdef inline int _to_legacy_device(object loc) except? -2: + cdef str kind = loc.kind + if kind == "device": + return loc.id + if kind == "host": + return -1 + raise RuntimeError( + f"location_type={kind!r} requires a CUDA 13 build of cuda.core" + ) + + cdef void _require_managed_discard_prefetch_support(str what): global _DISCARD_PREFETCH_SUPPORTED if _DISCARD_PREFETCH_SUPPORTED < 0: @@ -293,59 +368,106 @@ def advise( def prefetch( - target: Buffer, - location: Device | int | None = None, + targets, + location=None, *, - stream: Stream | GraphBuilder, - location_type: str | None = None, + options=None, + stream, ): - """Prefetch a managed-memory allocation range to a target location. + """Prefetch one or more managed-memory ranges to a target location. Parameters ---------- - target : :class:`Buffer` - Managed allocation to operate on. - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None``. - A location is required for prefetch. - stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` - Keyword argument specifying the stream for the asynchronous prefetch. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to operate on. + location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + Target location(s). A single location applies to all targets; a + sequence must match ``len(targets)``. ``Device`` and ``int`` values + are coerced to :class:`Location` (``-1`` maps to host). + options : None + Reserved for future per-call flags. Must be ``None``. + stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` + Stream for the asynchronous prefetch (keyword-only). + + Raises + ------ + NotImplementedError + If ``len(targets) > 1`` on a CUDA 12 build of ``cuda.core``. 
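+
+ A minimal usage sketch (``dev``, ``stream``, ``a``, and ``b`` here are an
+ illustrative ``Device``, ``Stream``, and two managed buffers)::
+
+     prefetch(a, Location.device(dev.device_id), stream=stream)
+     prefetch([a, b], Location.host(), stream=stream)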
""" - if not isinstance(target, Buffer): - raise TypeError(f"prefetch target must be a Buffer, got {type(target).__name__}") - cdef Buffer buf = target - _require_managed_buffer(buf, "prefetch") + if options is not None: + raise TypeError( + f"prefetch options must be None (reserved); got {type(options).__name__}" + ) + cdef tuple bufs = _coerce_buffer_targets(targets, "prefetch") + cdef Py_ssize_t n = len(bufs) + cdef tuple locs = _broadcast_locations(location, n, False, "prefetch") cdef Stream s = Stream_accept(stream) - cdef object ptr = buf.handle - cdef size_t nbytes = buf._size - location = _normalize_managed_location( - location, - location_type, - "prefetch", - ) - if _managed_location_uses_v2_bindings(): - handle_return( - driver.cuMemPrefetchAsync( - ptr, - nbytes, - location, - _MANAGED_OPERATION_FLAGS, - s.handle, - ) - ) + cdef Buffer buf + for buf in bufs: + _require_managed_buffer(buf, "prefetch") + + if n == 1: + _do_single_prefetch(bufs[0], locs[0], s) else: - handle_return( - driver.cuMemPrefetchAsync( - ptr, - nbytes, - _managed_location_to_legacy_device(location, "prefetch"), - s.handle, - ) + _do_batch_prefetch(bufs, locs, s) + + +cdef void _do_single_prefetch(Buffer buf, object loc, Stream s): + cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr) + cdef size_t nbytes = buf._size + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef cydriver.CUmemLocation cu_loc = _to_cumemlocation(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, cu_loc, 0, hstream)) + ELSE: + cdef int dev_int = _to_legacy_device(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, dev_int, hstream)) + + +cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef Py_ssize_t n = len(bufs) + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + cdef cydriver.CUdeviceptr* ptrs = PyMem_Malloc( + n * sizeof(cydriver.CUdeviceptr) + ) + cdef size_t* sizes = PyMem_Malloc(n * sizeof(size_t)) + cdef cydriver.CUmemLocation* loc_arr = PyMem_Malloc( + n * sizeof(cydriver.CUmemLocation) + ) + cdef size_t* loc_indices = PyMem_Malloc(n * sizeof(size_t)) + if not (ptrs and sizes and loc_arr and loc_indices): + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + raise MemoryError() + cdef Buffer buf + cdef Py_ssize_t i + try: + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + loc_arr[i] = _to_cumemlocation(locs[i]) + loc_indices[i] = i + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchBatchAsync( + ptrs, sizes, n, + loc_arr, loc_indices, n, + 0, hstream, + )) + finally: + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + ELSE: + raise NotImplementedError( + "batched prefetch requires a CUDA 13 build of cuda.core" ) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 2304c370fd3..89c8fda1c04 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1383,32 +1383,6 @@ def fake_cuMemAdvise(ptr, size, advice, location): buffer.close() -def test_managed_memory_prefetch_uses_legacy_bindings_signature(monkeypatch, init_cuda): - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - calls = [] - - def fake_cuMemPrefetchAsync(ptr, size, location, hstream): - 
calls.append((ptr, size, location, hstream)) - return (driver.CUresult.CUDA_SUCCESS,) - - monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) - monkeypatch.setattr(_managed_memory_ops.driver, "cuMemPrefetchAsync", fake_cuMemPrefetchAsync) - - managed_memory.prefetch(buffer, device, stream=stream) - - assert len(calls) == 1 - assert calls[0][2] == device.device_id - assert int(calls[0][3]) == int(stream.handle) - - buffer.close() - - def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): device = Device() device.set_current() @@ -1435,12 +1409,10 @@ def test_managed_memory_operation_validation(init_cuda): buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(ValueError, match="requires a location"): + with pytest.raises(ValueError, match="location is required"): managed_memory.prefetch(buffer, stream=stream) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa") - with pytest.raises(ValueError, match="location must be None or -1"): - managed_memory.prefetch(buffer, _INVALID_HOST_DEVICE_ORDINAL, stream=stream, location_type="host") buffer.close() @@ -2005,3 +1977,120 @@ def test_bad_type(self): from cuda.core._memory._managed_location import _coerce_location with pytest.raises(TypeError, match="Location, Device, int, or None"): _coerce_location("device") + + +class TestPrefetch: + def test_single_with_location_host(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + prefetch(buf, Location.host(), stream=stream) + stream.sync() + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == _HOST_LOCATION_ID + buf.close() + + def test_batched_same_location(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemPrefetchBatchAsync"): + pytest.skip("cuMemPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] + stream = device.create_stream() + + prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + + for buf in bufs: + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_batched_per_buffer_location(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemPrefetchBatchAsync"): + pytest.skip("cuMemPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + + prefetch(bufs, [Location.host(), Location.device(device.device_id)], stream=stream) + stream.sync() + + last0 = _get_int_mem_range_attr( + bufs[0], + 
driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + last1 = _get_int_mem_range_attr( + bufs[1], + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last0 == _HOST_LOCATION_ID + assert last1 == device.device_id + for buf in bufs: + buf.close() + + def test_length_mismatch(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + + with pytest.raises(ValueError, match="length"): + prefetch(bufs, [Location.host()], stream=stream) + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + prefetch(buf, Location.host(), stream=stream) + buf.close() + + def test_location_required(self, init_cuda): + from cuda.core.managed_memory import prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="location is required"): + prefetch(buf, None, stream=stream) + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.managed_memory import Location, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(TypeError, match="must be None"): + prefetch(buf, Location.host(), options={}, stream=stream) + buf.close() From e296e72986b124dcbb07027e17160a5e0290b8b0 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 17:59:24 -0700 Subject: [PATCH 24/68] feat(cuda.core): add managed_memory.discard Adds a new discard(targets, *, options=None, stream) free function that wraps cuMemDiscardBatchAsync. Accepts a single Buffer or a sequence; N>=1 dispatches to the batched driver entry point. Requires a CUDA 13 build of cuda.core (NotImplementedError on CUDA 12 builds). Closes the second of three batched managed-memory operations from #1333: P1: cudaMemDiscardBatchAsync <- this commit P1: cudaMemPrefetchBatchAsync <- 818f5d25d8 P1: cudaMemDiscardAndPrefetchBatchAsync <- next commit Re-exported from cuda.core.managed_memory. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 71 +++++++++++++++++++ cuda_core/cuda/core/managed_memory.py | 4 +- cuda_core/tests/test_memory.py | 57 +++++++++++++++ 3 files changed, 130 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index b608b532ab2..031b56a8af0 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -313,6 +313,77 @@ cdef void _require_managed_discard_prefetch_support(str what): ) +def discard( + targets, + *, + options=None, + stream, +): + """Discard one or more managed-memory ranges. 
+ + Parameters + ---------- + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to discard. Their resident pages + are released without prefetching new contents; subsequent access + is satisfied by lazy migration. + options : None + Reserved for future per-call flags. Must be ``None``. + stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` + Stream for the asynchronous discard (keyword-only). + + Raises + ------ + NotImplementedError + On a CUDA 12 build of ``cuda.core``. Discard requires CUDA 13+. + """ + if options is not None: + raise TypeError( + f"discard options must be None (reserved); got {type(options).__name__}" + ) + cdef tuple bufs = _coerce_buffer_targets(targets, "discard") + cdef Py_ssize_t n = len(bufs) + cdef Stream s = Stream_accept(stream) + + cdef Buffer buf + for buf in bufs: + _require_managed_buffer(buf, "discard") + + _do_batch_discard(bufs, s) + + +cdef void _do_batch_discard(tuple bufs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef Py_ssize_t n = len(bufs) + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + cdef cydriver.CUdeviceptr* ptrs = PyMem_Malloc( + n * sizeof(cydriver.CUdeviceptr) + ) + cdef size_t* sizes = PyMem_Malloc(n * sizeof(size_t)) + if not (ptrs and sizes): + PyMem_Free(ptrs) + PyMem_Free(sizes) + raise MemoryError() + cdef Buffer buf + cdef Py_ssize_t i + try: + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + with nogil: + HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync( + ptrs, sizes, n, 0, hstream, + )) + finally: + PyMem_Free(ptrs) + PyMem_Free(sizes) + ELSE: + raise NotImplementedError( + "discard requires a CUDA 13 build of cuda.core" + ) + + def advise( target: Buffer, advice: driver.CUmem_advise | str, diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py index 25191fe0381..509e874ccc0 100644 --- a/cuda_core/cuda/core/managed_memory.py +++ b/cuda_core/cuda/core/managed_memory.py @@ -5,6 +5,6 @@ """Managed-memory range operations.""" from cuda.core._memory._managed_location import Location -from cuda.core._memory._managed_memory_ops import advise, discard_prefetch, prefetch +from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch -__all__ = ["Location", "advise", "discard_prefetch", "prefetch"] +__all__ = ["Location", "advise", "discard", "discard_prefetch", "prefetch"] diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 89c8fda1c04..c18fa725198 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -2094,3 +2094,60 @@ def test_options_must_be_none(self, init_cuda): with pytest.raises(TypeError, match="must be None"): prefetch(buf, Location.host(), options={}, stream=stream) buf.close() + + +class TestDiscard: + def test_single_buffer(self, init_cuda): + from cuda.core.managed_memory import Location, discard, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardBatchAsync"): + pytest.skip("cuMemDiscardBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + prefetch(buf, Location.device(device.device_id), stream=stream) + stream.sync() + discard(buf, stream=stream) + stream.sync() + buf.close() + + def test_batched(self, init_cuda): + from cuda.core.managed_memory import Location, discard, prefetch + device = Device() + 
skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardBatchAsync"): + pytest.skip("cuMemDiscardBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] + stream = device.create_stream() + prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + discard(bufs, stream=stream) + stream.sync() + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.managed_memory import discard + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + discard(buf, stream=stream) + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.managed_memory import discard + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(TypeError, match="must be None"): + discard(buf, options={}, stream=stream) + buf.close() From e697131defa9c65cce468b8f946e0f16f442744a Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:01:33 -0700 Subject: [PATCH 25/68] feat(cuda.core): unified 1..N managed_memory.discard_prefetch with cydriver Rewrite discard_prefetch() with the unified single-or-batched signature: discard_prefetch(targets, location, *, options=None, stream) - targets accepts a single Buffer or a sequence of Buffers - location accepts a Location, Device, int, or per-buffer sequence - length mismatch / empty targets raise ValueError - options must be None (reserved) - stream moved to end, kept keyword-only Internals: switch from Python-level driver.cuMemDiscardAndPrefetchBatchAsync to Cython-level cydriver.cuMemDiscardAndPrefetchBatchAsync. The runtime discard-prefetch availability check is replaced by compile-time IF CUDA_CORE_BUILD_MAJOR >= 13 / ELSE; on CUDA 12 builds the call raises NotImplementedError. The location_type= keyword is removed; use Location dataclass instead. Closes the third managed-memory batched op from #1333. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 117 ++++++++++++------ cuda_core/tests/test_memory.py | 70 +++++++++++ 2 files changed, 147 insertions(+), 40 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 031b56a8af0..21926883207 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -543,51 +543,88 @@ cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): def discard_prefetch( - target: Buffer, - location: Device | int | None = None, + targets, + location=None, *, - stream: Stream | GraphBuilder, - location_type: str | None = None, + options=None, + stream, ): - """Discard a managed-memory allocation range and prefetch it to a target location. + """Discard one or more managed-memory ranges and prefetch them to a target location. Parameters ---------- - target : :class:`Buffer` - Managed allocation to operate on. - location : :obj:`~_device.Device` | int | None, optional - Target location. 
When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None``. - A location is required for discard_prefetch. - stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder` - Keyword argument specifying the stream for the asynchronous operation. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to discard and re-prefetch. + location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + Target location(s). A single location applies to all targets; + a sequence must match ``len(targets)``. + options : None + Reserved for future per-call flags. Must be ``None``. + stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` + Stream for the asynchronous operation (keyword-only). + + Raises + ------ + NotImplementedError + On a CUDA 12 build of ``cuda.core``. Discard-and-prefetch + requires CUDA 13+. """ - if not isinstance(target, Buffer): - raise TypeError(f"discard_prefetch target must be a Buffer, got {type(target).__name__}") - cdef Buffer buf = target - _require_managed_buffer(buf, "discard_prefetch") - _require_managed_discard_prefetch_support("discard_prefetch") + if options is not None: + raise TypeError( + f"discard_prefetch options must be None (reserved); " + f"got {type(options).__name__}" + ) + cdef tuple bufs = _coerce_buffer_targets(targets, "discard_prefetch") + cdef Py_ssize_t n = len(bufs) + cdef tuple locs = _broadcast_locations(location, n, False, "discard_prefetch") cdef Stream s = Stream_accept(stream) - cdef object ptr = buf.handle - cdef size_t nbytes = buf._size - cdef object batch_ptr = driver.CUdeviceptr(int(ptr)) - location = _normalize_managed_location( - location, - location_type, - "discard_prefetch", - ) - handle_return( - driver.cuMemDiscardAndPrefetchBatchAsync( - [batch_ptr], - [nbytes], - _SINGLE_RANGE_COUNT, - [location], - [_FIRST_PREFETCH_LOCATION_INDEX], - _SINGLE_PREFETCH_LOCATION_COUNT, - _MANAGED_OPERATION_FLAGS, - s.handle, + + cdef Buffer buf + for buf in bufs: + _require_managed_buffer(buf, "discard_prefetch") + + _do_batch_discard_prefetch(bufs, locs, s) + + +cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef Py_ssize_t n = len(bufs) + cdef cydriver.CUstream hstream = as_cu(s._h_stream) + cdef cydriver.CUdeviceptr* ptrs = PyMem_Malloc( + n * sizeof(cydriver.CUdeviceptr) + ) + cdef size_t* sizes = PyMem_Malloc(n * sizeof(size_t)) + cdef cydriver.CUmemLocation* loc_arr = PyMem_Malloc( + n * sizeof(cydriver.CUmemLocation) + ) + cdef size_t* loc_indices = PyMem_Malloc(n * sizeof(size_t)) + if not (ptrs and sizes and loc_arr and loc_indices): + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + raise MemoryError() + cdef Buffer buf + cdef Py_ssize_t i + try: + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + loc_arr[i] = _to_cumemlocation(locs[i]) + loc_indices[i] = i + with nogil: + HANDLE_RETURN(cydriver.cuMemDiscardAndPrefetchBatchAsync( + ptrs, sizes, n, + loc_arr, loc_indices, n, + 0, hstream, + )) + finally: + PyMem_Free(ptrs) + PyMem_Free(sizes) + PyMem_Free(loc_arr) + PyMem_Free(loc_indices) + ELSE: + raise NotImplementedError( + "discard_prefetch requires a CUDA 13 build of cuda.core" ) - ) diff --git a/cuda_core/tests/test_memory.py 
b/cuda_core/tests/test_memory.py index c18fa725198..627a60bb3f9 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -2151,3 +2151,73 @@ def test_options_must_be_none(self, init_cuda): with pytest.raises(TypeError, match="must be None"): discard(buf, options={}, stream=stream) buf.close() + + +class TestDiscardPrefetch: + def test_single_buffer(self, init_cuda): + from cuda.core.managed_memory import Location, discard_prefetch, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + prefetch(buf, Location.host(), stream=stream) + stream.sync() + discard_prefetch(buf, Location.device(device.device_id), stream=stream) + stream.sync() + + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_batched_same_location(self, init_cuda): + from cuda.core.managed_memory import Location, discard_prefetch, prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + prefetch(bufs, Location.host(), stream=stream) + stream.sync() + discard_prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + for buf in bufs: + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_length_mismatch(self, init_cuda): + from cuda.core.managed_memory import Location, discard_prefetch + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + with pytest.raises(ValueError, match="length"): + discard_prefetch(bufs, [Location.host()], stream=stream) + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.managed_memory import Location, discard_prefetch + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + discard_prefetch(buf, Location.host(), stream=stream) + buf.close() From 3bc10219dc3086d5449aa811e2f6086b73d915fb Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:08:40 -0700 Subject: [PATCH 26/68] feat(cuda.core): unified 1..N managed_memory.advise + drop legacy apparatus Rewrite advise() with the unified single-or-batched signature: advise(targets, advice, location=None, *, options=None) - targets accepts a single Buffer or a sequence - advice still accepts string aliases or driver.CUmem_advise enum values - location accepts Location dataclass, Device, int, None, or per-buffer sequence (None permitted only for set_read_mostly, unset_read_mostly, unset_preferred_location) - Per-advice allowed-kind 
validation ported to operate on Location.kind (matches CUDA driver constraints from existing tables) - options reserved for future per-call flags - For N>1, loops cydriver.cuMemAdvise per buffer (no batched advise API exists in CUDA) Internals: switch to cydriver.cuMemAdvise (Cython-level); use compile-time IF CUDA_CORE_BUILD_MAJOR >= 13 / ELSE for the 12/13 ABI split. Drop the legacy apparatus that all four functions previously shared: - _normalize_managed_location (returned Python driver.CUmemLocation) - _make_managed_location, _managed_location_enum - _managed_location_uses_v2_bindings + _V2_BINDINGS lazy cache - _managed_location_to_legacy_device + _LEGACY_LOC_DEVICE/HOST cache - _require_managed_discard_prefetch_support - Unused module-level constants (_HOST_NUMA_CURRENT_ID, _SINGLE_RANGE_COUNT, _MANAGED_OPERATION_FLAGS, etc.) Also drop test_managed_memory_advise_uses_legacy_bindings_signature and the _LEGACY_BINDINGS_VERSION constant; the runtime version switch is gone, replaced by compile-time IF/ELSE that the test could not exercise. The CUDA 12 vs CUDA 13 paths are now covered by the build-matrix CI job. Closes Task 8 (advise) and Task 9 (legacy-bindings test cleanup) from docs/superpowers/plans/2026-04-27-managed-memory-ops-batched.md. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 270 +++++------------- cuda_core/tests/test_memory.py | 91 ++++-- 2 files changed, 127 insertions(+), 234 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 21926883207..11236a1ecfb 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -13,26 +13,10 @@ from cuda.core._resource_handles cimport as_cu from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from cuda.core._utils.cuda_utils import driver, handle_return -from cuda.core._utils.version import binding_version -from cuda.core._device import Device +from cuda.core._utils.cuda_utils import driver from cuda.core._memory._managed_location import Location, _coerce_location -cdef tuple _VALID_MANAGED_LOCATION_TYPES = ( - "device", - "host", - "host_numa", - "host_numa_current", -) - -cdef dict _MANAGED_LOCATION_TYPE_ATTRS = { - "device": "CU_MEM_LOCATION_TYPE_DEVICE", - "host": "CU_MEM_LOCATION_TYPE_HOST", - "host_numa": "CU_MEM_LOCATION_TYPE_HOST_NUMA", - "host_numa_current": "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT", -} - cdef dict _MANAGED_ADVICE_ALIASES = { "set_read_mostly": "CU_MEM_ADVISE_SET_READ_MOSTLY", "unset_read_mostly": "CU_MEM_ADVISE_UNSET_READ_MOSTLY", @@ -61,43 +45,8 @@ cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { "unset_accessed_by": _DEVICE_HOST_ONLY, } -cdef int _HOST_NUMA_CURRENT_ID = 0 -cdef int _FIRST_PREFETCH_LOCATION_INDEX = 0 -cdef size_t _SINGLE_RANGE_COUNT = 1 -cdef size_t _SINGLE_PREFETCH_LOCATION_COUNT = 1 -cdef unsigned long long _MANAGED_OPERATION_FLAGS = 0 - -# Lazily cached values for immutable runtime properties. -cdef object _CU_DEVICE_CPU = None +# Lazily cached: maps driver.CUmem_advise enum value → string alias. 
cdef dict _ADVICE_ENUM_TO_ALIAS = None -_V2_BINDINGS = -1 -cdef int _DISCARD_PREFETCH_SUPPORTED = -1 - - -cdef object _managed_location_enum(str location_type): - cdef str attr_name = _MANAGED_LOCATION_TYPE_ATTRS[location_type] - cdef object result = getattr(driver.CUmemLocationType, attr_name, None) - if result is None: - raise RuntimeError( - f"Managed-memory location type {location_type!r} is not supported by the " - f"installed cuda.bindings package." - ) - return result - - -cdef object _make_managed_location(str location_type, int location_id): - global _CU_DEVICE_CPU - cdef object location = driver.CUmemLocation() - location.type = _managed_location_enum(location_type) - if location_type == "host": - if _CU_DEVICE_CPU is None: - _CU_DEVICE_CPU = int(getattr(driver, "CU_DEVICE_CPU", -1)) - location.id = _CU_DEVICE_CPU - elif location_type == "host_numa_current": - location.id = _HOST_NUMA_CURRENT_ID - else: - location.id = location_id - return location cdef tuple _normalize_managed_advice(object advice): @@ -131,104 +80,6 @@ cdef tuple _normalize_managed_advice(object advice): ) -cdef object _normalize_managed_location( - object location, - object location_type, - str what, - bint allow_none=False, - frozenset allowed_loctypes=_ALL_LOCATION_TYPES, -): - cdef object loc_type - cdef int loc_id - - if isinstance(location, Device): - location = location.device_id - - if location_type is not None and not isinstance(location_type, str): - raise TypeError(f"{what} location_type must be a string or None, got {type(location_type).__name__}") - - loc_type = None if location_type is None else (location_type).lower() - if loc_type is not None and loc_type not in _VALID_MANAGED_LOCATION_TYPES: - raise ValueError( - f"{what} location_type must be one of {_VALID_MANAGED_LOCATION_TYPES!r} " - f"or None, got {location_type!r}" - ) - - if loc_type is not None and loc_type not in allowed_loctypes: - raise ValueError(f"{what} does not support location_type='{loc_type}'") - - if loc_type is None: - if location is None: - if allow_none: - return _make_managed_location("host", -1) - raise ValueError(f"{what} requires a location") - if not isinstance(location, int): - raise TypeError( - f"{what} location must be a Device, int, or None, got {type(location).__name__}" - ) - loc_id = location - if loc_id == -1: - if "host" not in allowed_loctypes: - raise ValueError(f"{what} does not support host locations") - return _make_managed_location("host", -1) - elif loc_id >= 0: - return _make_managed_location("device", loc_id) - else: - raise ValueError( - f"{what} location must be a device ordinal (>= 0), -1 for host, or None; got {location!r}" - ) - elif loc_type == "device": - if isinstance(location, int) and location >= 0: - loc_id = location - else: - raise ValueError( - f"{what} location must be a device ordinal (>= 0) when location_type is 'device', got {location!r}" - ) - return _make_managed_location(loc_type, loc_id) - elif loc_type == "host": - if location not in (None, -1): - raise ValueError( - f"{what} location must be None or -1 when location_type is 'host', got {location!r}" - ) - return _make_managed_location(loc_type, -1) - elif loc_type == "host_numa": - if not isinstance(location, int) or location < 0: - raise ValueError( - f"{what} location must be a NUMA node ID (>= 0) when location_type is 'host_numa', got {location!r}" - ) - return _make_managed_location(loc_type, location) - else: - if location is not None: - raise ValueError( - f"{what} location must be None when location_type is 
'host_numa_current', got {location!r}" - ) - return _make_managed_location(loc_type, _HOST_NUMA_CURRENT_ID) - - -cdef bint _managed_location_uses_v2_bindings(): - # cuda.bindings 13.x switches these APIs to CUmemLocation-based wrappers. - global _V2_BINDINGS - if _V2_BINDINGS < 0: - _V2_BINDINGS = 1 if binding_version() >= (13, 0) else 0 - return _V2_BINDINGS != 0 - - -cdef object _LEGACY_LOC_DEVICE = None -cdef object _LEGACY_LOC_HOST = None - -cdef int _managed_location_to_legacy_device(object location, str what): - global _LEGACY_LOC_DEVICE, _LEGACY_LOC_HOST - if _LEGACY_LOC_DEVICE is None: - _LEGACY_LOC_DEVICE = _managed_location_enum("device") - _LEGACY_LOC_HOST = _managed_location_enum("host") - cdef object loc_type = location.type - if loc_type == _LEGACY_LOC_DEVICE or loc_type == _LEGACY_LOC_HOST: - return location.id - raise RuntimeError( - f"{what} requires cuda.bindings 13.x for location_type={loc_type!r}" - ) - - cdef void _require_managed_buffer(Buffer self, str what): _init_mem_attrs(self) if not self._mem_attrs.is_managed: @@ -303,16 +154,6 @@ ELSE: ) -cdef void _require_managed_discard_prefetch_support(str what): - global _DISCARD_PREFETCH_SUPPORTED - if _DISCARD_PREFETCH_SUPPORTED < 0: - _DISCARD_PREFETCH_SUPPORTED = 1 if hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync") else 0 - if not _DISCARD_PREFETCH_SUPPORTED: - raise RuntimeError( - f"{what} requires cuda.bindings support for cuMemDiscardAndPrefetchBatchAsync" - ) - - def discard( targets, *, @@ -385,57 +226,80 @@ cdef void _do_batch_discard(tuple bufs, Stream s): def advise( - target: Buffer, - advice: driver.CUmem_advise | str, - location: Device | int | None = None, + targets, + advice, + location=None, *, - location_type: str | None = None, + options=None, ): - """Apply managed-memory advice to an allocation range. + """Apply managed-memory advice to one or more allocation ranges. Parameters ---------- - target : :class:`Buffer` - Managed allocation to operate on. - advice : :obj:`~driver.CUmem_advise` | str - Managed-memory advice to apply. String aliases such as - ``"set_read_mostly"``, ``"set_preferred_location"``, and - ``"set_accessed_by"`` are accepted. - location : :obj:`~_device.Device` | int | None, optional - Target location. When ``location_type`` is ``None``, values are - interpreted as a device ordinal, ``-1`` for host, or ``None`` for - advice values that ignore location. - location_type : str | None, optional - Explicit location kind. Supported values are ``"device"``, ``"host"``, - ``"host_numa"``, and ``"host_numa_current"``. + targets : :class:`Buffer` | Sequence[:class:`Buffer`] + One or more managed allocations to advise. + advice : str | :obj:`~driver.CUmem_advise` + Managed-memory advice. String aliases (``"set_read_mostly"``, + ``"unset_read_mostly"``, ``"set_preferred_location"``, + ``"unset_preferred_location"``, ``"set_accessed_by"``, + ``"unset_accessed_by"``) and ``CUmem_advise`` enum values are accepted. + location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + Target location(s). Required for advice values that consult a + location; ignored (may be ``None``) for ``set_read_mostly``, + ``unset_read_mostly``, and ``unset_preferred_location``. A sequence + must match ``len(targets)``. + options : None + Reserved for future per-call flags. Must be ``None``. 
""" - if not isinstance(target, Buffer): - raise TypeError(f"advise target must be a Buffer, got {type(target).__name__}") - cdef Buffer buf = target - _require_managed_buffer(buf, "advise") + if options is not None: + raise TypeError( + f"advise options must be None (reserved); got {type(options).__name__}" + ) cdef str advice_name - cdef object ptr = buf.handle - cdef size_t nbytes = buf._size + cdef object advice_value + advice_name, advice_value = _normalize_managed_advice(advice) + cdef bint allow_none = advice_name in _MANAGED_ADVICE_IGNORE_LOCATION + cdef frozenset allowed_kinds = _MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name] - advice_name, advice = _normalize_managed_advice(advice) - location = _normalize_managed_location( - location, - location_type, - "advise", - allow_none=advice_name in _MANAGED_ADVICE_IGNORE_LOCATION, - allowed_loctypes=_MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name], - ) - if _managed_location_uses_v2_bindings(): - handle_return(driver.cuMemAdvise(ptr, nbytes, advice, location)) - else: - handle_return( - driver.cuMemAdvise( - ptr, - nbytes, - advice, - _managed_location_to_legacy_device(location, "advise"), + cdef tuple bufs = _coerce_buffer_targets(targets, "advise") + cdef Py_ssize_t n = len(bufs) + cdef tuple locs = _broadcast_locations(location, n, allow_none, "advise") + + cdef Buffer buf + cdef object loc + for buf in bufs: + _require_managed_buffer(buf, "advise") + for loc in locs: + if loc is not None and loc.kind not in allowed_kinds: + raise ValueError( + f"advise '{advice_name}' does not support location_type='{loc.kind}'" ) - ) + + cdef Py_ssize_t i + for i in range(n): + _do_single_advise(bufs[i], advice_value, locs[i], allow_none) + + +cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none): + cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr) + cdef size_t nbytes = buf._size + cdef cydriver.CUmem_advise advice_enum = (int(advice_value)) + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef cydriver.CUmemLocation cu_loc + if loc is None: + # Driver ignores location for read_mostly / unset_preferred_location + # advice values but still validates the CUmemLocation; pass a + # host placeholder. + cu_loc.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + cu_loc.id = 0 + else: + cu_loc = _to_cumemlocation(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, cu_loc)) + ELSE: + cdef int dev_int = -1 if loc is None else _to_legacy_device(loc) + with nogil: + HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, dev_int)) def prefetch( diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 627a60bb3f9..a469c63a10d 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -54,7 +54,6 @@ _READ_MOSTLY_ENABLED = 1 _HOST_LOCATION_ID = -1 _INVALID_HOST_DEVICE_ORDINAL = 0 -_LEGACY_BINDINGS_VERSION = (12, 9) class DummyDeviceMemoryResource(MemoryResource): @@ -1264,6 +1263,8 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): + from cuda.core.managed_memory import Location + device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() @@ -1281,7 +1282,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): # cuda.bindings currently exposes the combined location attributes for # cuMemRangeGetAttribute, so use the legacy location query here. 
- managed_memory.advise(buffer, "set_preferred_location", location_type="host") + managed_memory.advise(buffer, "set_preferred_location", Location.host()) preferred_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, @@ -1359,30 +1360,6 @@ def test_managed_memory_discard_prefetch_supports_external_managed_allocations(i buffer.close() -def test_managed_memory_advise_uses_legacy_bindings_signature(monkeypatch, init_cuda): - device = Device() - _skip_if_managed_allocation_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - calls = [] - - def fake_cuMemAdvise(ptr, size, advice, location): - calls.append((ptr, size, advice, location)) - return (driver.CUresult.CUDA_SUCCESS,) - - monkeypatch.setattr(_managed_memory_ops, "binding_version", lambda: _LEGACY_BINDINGS_VERSION) - monkeypatch.setattr(_managed_memory_ops, "_V2_BINDINGS", -1) - monkeypatch.setattr(_managed_memory_ops.driver, "cuMemAdvise", fake_cuMemAdvise) - - managed_memory.advise(buffer, "set_read_mostly") - - assert len(calls) == 1 - assert calls[0][3] == int(getattr(driver, "CU_DEVICE_CPU", _HOST_LOCATION_ID)) - - buffer.close() - - def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): device = Device() device.set_current() @@ -1411,14 +1388,17 @@ def test_managed_memory_operation_validation(init_cuda): with pytest.raises(ValueError, match="location is required"): managed_memory.prefetch(buffer, stream=stream) + from cuda.core.managed_memory import Location with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - managed_memory.advise(buffer, "set_accessed_by", _INVALID_HOST_DEVICE_ORDINAL, location_type="host_numa") + managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) buffer.close() def test_managed_memory_advise_location_validation(init_cuda): """Verify doc-specified location constraints for each advice kind.""" + from cuda.core.managed_memory import Location + device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() @@ -1431,16 +1411,16 @@ def test_managed_memory_advise_location_validation(init_cuda): # set_preferred_location requires a location; device ordinal works managed_memory.advise(buffer, "set_preferred_location", device.device_id) - # set_preferred_location with host location_type - managed_memory.advise(buffer, "set_preferred_location", location_type="host") + # set_preferred_location with host location + managed_memory.advise(buffer, "set_preferred_location", Location.host()) # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - managed_memory.advise(buffer, "set_accessed_by", 0, location_type="host_numa") + managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(0)) # set_accessed_by with host_numa_current also raises ValueError with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): - managed_memory.advise(buffer, "set_accessed_by", location_type="host_numa_current") + managed_memory.advise(buffer, "set_accessed_by", Location.host_numa_current()) # Inferred location from int: -1 maps to host, 0 maps to device managed_memory.advise(buffer, "set_preferred_location", -1) @@ -2221,3 +2201,52 @@ def test_rejects_non_managed(self, init_cuda): with pytest.raises(ValueError, match="managed-memory"): 
discard_prefetch(buf, Location.host(), stream=stream) buf.close() + + +class TestAdvise: + def test_batched_same_advice(self, init_cuda): + from cuda.core.managed_memory import advise, Location + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + bufs = [ + DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + for _ in range(2) + ] + advise(bufs, "set_read_mostly") + for buf in bufs: + assert ( + _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + buf.close() + + def test_batched_per_buffer_location(self, init_cuda): + from cuda.core.managed_memory import advise, Location + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + bufs = [ + DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + for _ in range(2) + ] + advise( + bufs, + "set_preferred_location", + [Location.host(), Location.device(device.device_id)], + ) + for buf in bufs: + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.managed_memory import advise + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + with pytest.raises(TypeError, match="must be None"): + advise(buf, "set_read_mostly", options={}) + buf.close() From fa238696802fc762b0008a20c091e998ab7e7b2b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:10:21 -0700 Subject: [PATCH 27/68] refactor(cuda.core): use Buffer.is_managed property in managed_memory ops _require_managed_buffer was poking at Buffer._mem_attrs.is_managed directly via _init_mem_attrs(). PR #1924 added the public Buffer.is_managed property which falls back to MemoryResource.is_managed when the pointer attribute query does not advertise managed memory (the case for pool- allocated managed memory). Switch _require_managed_buffer to the public property. This also fixes a latent bug where pool-allocated managed buffers were being rejected by the managed_memory ops despite Buffer.is_managed correctly reporting True. Drops the no-longer-needed cimport of _init_mem_attrs. Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 11236a1ecfb..f4e13ef16e5 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -8,7 +8,7 @@ from cpython.mem cimport PyMem_Free, PyMem_Malloc from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer, _init_mem_attrs +from cuda.core._memory._buffer cimport Buffer from cuda.core._resource_handles cimport as_cu from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -81,8 +81,10 @@ cdef tuple _normalize_managed_advice(object advice): cdef void _require_managed_buffer(Buffer self, str what): - _init_mem_attrs(self) - if not self._mem_attrs.is_managed: + # Buffer.is_managed handles both pointer-attribute and memory-resource + # paths (e.g. pool-allocated managed memory whose pointer attribute + # does not advertise CU_POINTER_ATTRIBUTE_IS_MANAGED). 
+    if not self.is_managed:
         raise ValueError(f"{what} requires a managed-memory allocation")
 

From 68bdd14357598b53dc7c0d7a2654b014d876f58f Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Mon, 27 Apr 2026 18:10:56 -0700
Subject: [PATCH 28/68] docs(cuda.core): document Location, discard, and 1..N managed_memory ops

api.rst: add Location and discard to the managed_memory autosummary.

1.0.0-notes.rst: replace the placeholder bullet with a description of
the unified 1..N API, the Location dataclass, and the dispatch to
batched driver entry points on CUDA 13 builds of cuda.core.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 cuda_core/docs/source/api.rst                 |  2 ++
 cuda_core/docs/source/release/1.0.0-notes.rst | 15 ++++++++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index b7df6d7b962..fd0e01dedfb 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -137,8 +137,10 @@ Managed memory
 .. autosummary::
    :toctree: generated/
 
+   Location
    advise
    prefetch
+   discard
    discard_prefetch
 
 .. module:: cuda.core
diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst
index 4008c86f5d6..25e90667611 100644
--- a/cuda_core/docs/source/release/1.0.0-notes.rst
+++ b/cuda_core/docs/source/release/1.0.0-notes.rst
@@ -17,11 +17,16 @@ New features
 ------------
 
 - Added managed-memory range operations under :mod:`cuda.core.managed_memory`:
-  ``advise()``, ``prefetch()``, and ``discard_prefetch()``. These free
-  functions accept either a managed :class:`Buffer` or a raw pointer plus
-  ``size=``, validate that the target allocation is managed memory, and then
-  forward to the corresponding CUDA driver operations for range advice and
-  migration.
+  :class:`~managed_memory.Location`, :func:`~managed_memory.advise`,
+  :func:`~managed_memory.prefetch`, :func:`~managed_memory.discard`, and
+  :func:`~managed_memory.discard_prefetch`. Each operation accepts either a
+  single managed :class:`Buffer` or a sequence; on CUDA 13 builds the
+  N>1 case dispatches to the corresponding ``cuMem*BatchAsync`` driver
+  entry point, addressing the managed-memory portion of #1333. Locations
+  are expressed via the typed :class:`~managed_memory.Location` dataclass
+  (with classmethod constructors ``device``, ``host``, ``host_numa``, and
+  ``host_numa_current``); ``Device`` and ``int`` values are still accepted
+  for ergonomic compatibility.
 
 
 Fixes and enhancements

From b4d9cbfa7270e7da9e260d457a1678f38bd2833d Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Mon, 27 Apr 2026 18:20:19 -0700
Subject: [PATCH 29/68] chore(cuda.core): drop narrative comments and tighten _coerce_location docstring

Per /simplify review, remove WHAT-only comments that just restate the
function signature in front of _coerce_buffer_targets and
_broadcast_locations.

Tighten the _coerce_location docstring to lead with the conversion
intent rather than restate the type annotation.
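
For reference, the coercion behavior the tightened docstring leads
with, as pinned down by the TestLocationCoerce cases (a sketch;
equality comes for free from the frozen Location dataclass):

    from cuda.core._memory._managed_location import Location, _coerce_location

    loc = Location.device(0)
    assert _coerce_location(loc) is loc               # Location passes through
    assert _coerce_location(0) == Location.device(0)  # >= 0 -> device ordinal
    assert _coerce_location(-1) == Location.host()    # -1 -> host
    assert _coerce_location(None, allow_none=True) is None
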
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_managed_location.py | 5 ++--- cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index e081a8da32b..8d1605153f4 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -52,10 +52,9 @@ def host_numa_current(cls) -> "Location": def _coerce_location(value, *, allow_none: bool = False) -> Location | None: - """Coerce user input to a Location instance. + """Coerce ``Location`` / ``Device`` / int / ``None`` to ``Location``. - Accepts: Location (passthrough), Device (uses device_id), int (>=0 → device, - -1 → host), None (only if allow_none=True). + Maps int ``-1`` to host and other non-negative ints to that device ordinal. """ from cuda.core._device import Device # avoid import cycle at module load diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index f4e13ef16e5..90e5611a2d3 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -88,7 +88,6 @@ cdef void _require_managed_buffer(Buffer self, str what): raise ValueError(f"{what} requires a managed-memory allocation") -# Coerce ``targets`` (single Buffer or sequence) to a tuple[Buffer, ...]. cdef tuple _coerce_buffer_targets(object targets, str what): cdef list out if isinstance(targets, Buffer): @@ -110,8 +109,6 @@ cdef tuple _coerce_buffer_targets(object targets, str what): ) -# Broadcast a single location across ``n`` targets, or coerce a length-N -# sequence elementwise. cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what): cdef object coerced if isinstance(location, (list, tuple)): From ee967583b78d014723db47b9cc4b145bf9c031fa Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 18:41:19 -0700 Subject: [PATCH 30/68] chore(cuda.core): satisfy pre-commit hooks - ruff auto-applied: * Drop unused `_managed_memory_ops` test import (no longer needed after the legacy-bindings monkeypatch test was deleted) * Drop "Location" string-quoted forward refs in _managed_location.py (file already uses `from __future__ import annotations`) * Reformat string concatenations and add blank-line-after-import spacing - cython-lint auto-applied: * Drop unused libc.stdint cimport of `uintptr_t` * Drop unused `Location` Python import (only used in docstrings) * Drop unused `n` local in `discard()` * Move `cpython.mem cimport` of PyMem_Free / PyMem_Malloc inside the `IF CUDA_CORE_BUILD_MAJOR >= 13:` block where the symbols are actually used; cython-lint cannot see across compile-time branches. 
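
For context on the compile-time split: Cython resolves IF/ELSE against
constants supplied when the extension is cythonized, so each build
contains exactly one branch and linters only ever see that branch. A
minimal sketch of how such a constant can be injected (hypothetical
build-script fragment, not the real build code; `extensions` and
`cuda_major` are placeholders):

    from Cython.Build import cythonize

    # Expose the CUDA major version to Cython's IF/DEF machinery as a
    # compile-time constant; IF branches are resolved at cythonize time.
    ext_modules = cythonize(
        extensions,
        compile_time_env={"CUDA_CORE_BUILD_MAJOR": cuda_major},
    )
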
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_location.py | 16 +++--- .../cuda/core/_memory/_managed_memory_ops.pyx | 7 ++- cuda_core/tests/test_memory.py | 51 +++++++++++++++---- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index 8d1605153f4..0e89cb92e37 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -35,19 +35,19 @@ def __post_init__(self) -> None: raise ValueError(f"{self.kind} location must have id=None") @classmethod - def device(cls, device_id: int) -> "Location": + def device(cls, device_id: int) -> Location: return cls(kind="device", id=device_id) @classmethod - def host(cls) -> "Location": + def host(cls) -> Location: return cls(kind="host", id=None) @classmethod - def host_numa(cls, numa_id: int) -> "Location": + def host_numa(cls, numa_id: int) -> Location: return cls(kind="host_numa", id=numa_id) @classmethod - def host_numa_current(cls) -> "Location": + def host_numa_current(cls) -> Location: return cls(kind="host_numa_current", id=None) @@ -71,9 +71,5 @@ def _coerce_location(value, *, allow_none: bool = False) -> Location | None: return Location.host() if value >= 0: return Location.device(value) - raise ValueError( - f"device ordinal must be >= 0 (or -1 for host), got {value}" - ) - raise TypeError( - f"location must be a Location, Device, int, or None; got {type(value).__name__}" - ) + raise ValueError(f"device ordinal must be >= 0 (or -1 for host), got {value}") + raise TypeError(f"location must be a Location, Device, int, or None; got {type(value).__name__}") diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 90e5611a2d3..9926cbe67f8 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -4,8 +4,8 @@ from __future__ import annotations -from cpython.mem cimport PyMem_Free, PyMem_Malloc -from libc.stdint cimport uintptr_t +IF CUDA_CORE_BUILD_MAJOR >= 13: + from cpython.mem cimport PyMem_Free, PyMem_Malloc from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport Buffer @@ -14,7 +14,7 @@ from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import driver -from cuda.core._memory._managed_location import Location, _coerce_location +from cuda.core._memory._managed_location import _coerce_location cdef dict _MANAGED_ADVICE_ALIASES = { @@ -182,7 +182,6 @@ def discard( f"discard options must be None (reserved); got {type(options).__name__}" ) cdef tuple bufs = _coerce_buffer_targets(targets, "discard") - cdef Py_ssize_t n = len(bufs) cdef Stream s = Stream_accept(stream) cdef Buffer buf diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index a469c63a10d..36fdfd03475 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -44,7 +44,7 @@ system as ccx_system, ) from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor, _managed_memory_ops +from cuda.core._memory import IPCBufferDescriptor from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView @@ -1389,6 +1389,7 @@ def test_managed_memory_operation_validation(init_cuda): with pytest.raises(ValueError, match="location is 
required"): managed_memory.prefetch(buffer, stream=stream) from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) @@ -1875,42 +1876,50 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): class TestLocation: def test_device_constructor(self): from cuda.core.managed_memory import Location + loc = Location.device(0) assert loc.kind == "device" assert loc.id == 0 def test_host_constructor(self): from cuda.core.managed_memory import Location + loc = Location.host() assert loc.kind == "host" assert loc.id is None def test_host_numa_constructor(self): from cuda.core.managed_memory import Location + loc = Location.host_numa(3) assert loc.kind == "host_numa" assert loc.id == 3 def test_host_numa_current_constructor(self): from cuda.core.managed_memory import Location + loc = Location.host_numa_current() assert loc.kind == "host_numa_current" assert loc.id is None def test_frozen(self): import dataclasses + from cuda.core.managed_memory import Location + loc = Location.device(0) with pytest.raises(dataclasses.FrozenInstanceError): loc.id = 1 def test_invalid_device_id(self): from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="device id must be >= 0"): Location.device(-1) def test_invalid_kind(self): from cuda.core.managed_memory import Location + with pytest.raises(ValueError, match="kind must be one of"): Location(kind="not_a_kind", id=None) @@ -1919,21 +1928,25 @@ class TestLocationCoerce: def test_passthrough(self): from cuda.core._memory._managed_location import _coerce_location from cuda.core.managed_memory import Location + loc = Location.device(0) assert _coerce_location(loc) is loc def test_int_device(self): from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(0).kind == "device" assert _coerce_location(0).id == 0 def test_int_minus_one_is_host(self): from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(-1).kind == "host" def test_device_object(self, init_cuda): from cuda.core import Device from cuda.core._memory._managed_location import _coerce_location + dev = Device() loc = _coerce_location(dev) assert loc.kind == "device" @@ -1941,20 +1954,24 @@ def test_device_object(self, init_cuda): def test_none_when_disallowed(self): from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(ValueError, match="location is required"): _coerce_location(None, allow_none=False) def test_none_when_allowed(self): from cuda.core._memory._managed_location import _coerce_location + assert _coerce_location(None, allow_none=True) is None def test_bad_int(self): from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(ValueError, match="device ordinal"): _coerce_location(-2) def test_bad_type(self): from cuda.core._memory._managed_location import _coerce_location + with pytest.raises(TypeError, match="Location, Device, int, or None"): _coerce_location("device") @@ -1962,6 +1979,7 @@ def test_bad_type(self): class TestPrefetch: def test_single_with_location_host(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -1980,6 +1998,7 @@ def test_single_with_location_host(self, init_cuda): def test_batched_same_location(self, init_cuda): from 
cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemPrefetchBatchAsync"): @@ -2002,6 +2021,7 @@ def test_batched_same_location(self, init_cuda): def test_batched_per_buffer_location(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemPrefetchBatchAsync"): @@ -2029,6 +2049,7 @@ def test_batched_per_buffer_location(self, init_cuda): def test_length_mismatch(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2043,6 +2064,7 @@ def test_length_mismatch(self, init_cuda): def test_rejects_non_managed(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() device.set_current() buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -2053,6 +2075,7 @@ def test_rejects_non_managed(self, init_cuda): def test_location_required(self, init_cuda): from cuda.core.managed_memory import prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2065,6 +2088,7 @@ def test_location_required(self, init_cuda): def test_options_must_be_none(self, init_cuda): from cuda.core.managed_memory import Location, prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2079,6 +2103,7 @@ def test_options_must_be_none(self, init_cuda): class TestDiscard: def test_single_buffer(self, init_cuda): from cuda.core.managed_memory import Location, discard, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemDiscardBatchAsync"): @@ -2095,6 +2120,7 @@ def test_single_buffer(self, init_cuda): def test_batched(self, init_cuda): from cuda.core.managed_memory import Location, discard, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemDiscardBatchAsync"): @@ -2112,6 +2138,7 @@ def test_batched(self, init_cuda): def test_rejects_non_managed(self, init_cuda): from cuda.core.managed_memory import discard + device = Device() device.set_current() buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -2122,6 +2149,7 @@ def test_rejects_non_managed(self, init_cuda): def test_options_must_be_none(self, init_cuda): from cuda.core.managed_memory import discard + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2136,6 +2164,7 @@ def test_options_must_be_none(self, init_cuda): class TestDiscardPrefetch: def test_single_buffer(self, init_cuda): from cuda.core.managed_memory import Location, discard_prefetch, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): @@ -2159,6 +2188,7 @@ def test_single_buffer(self, init_cuda): def test_batched_same_location(self, init_cuda): from cuda.core.managed_memory import Location, discard_prefetch, prefetch + device = Device() skip_if_managed_memory_unsupported(device) if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): @@ -2181,6 +2211,7 @@ def test_batched_same_location(self, init_cuda): def test_length_mismatch(self, init_cuda): from cuda.core.managed_memory import Location, discard_prefetch + device = Device() skip_if_managed_memory_unsupported(device) device.set_current() @@ -2194,6 +2225,7 @@ def 
test_length_mismatch(self, init_cuda): def test_rejects_non_managed(self, init_cuda): from cuda.core.managed_memory import Location, discard_prefetch + device = Device() device.set_current() buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) @@ -2205,14 +2237,12 @@ def test_rejects_non_managed(self, init_cuda): class TestAdvise: def test_batched_same_advice(self, init_cuda): - from cuda.core.managed_memory import advise, Location + from cuda.core.managed_memory import advise + device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - bufs = [ - DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - for _ in range(2) - ] + bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] advise(bufs, "set_read_mostly") for buf in bufs: assert ( @@ -2225,14 +2255,12 @@ def test_batched_same_advice(self, init_cuda): buf.close() def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.managed_memory import advise, Location + from cuda.core.managed_memory import Location, advise + device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - bufs = [ - DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - for _ in range(2) - ] + bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] advise( bufs, "set_preferred_location", @@ -2243,6 +2271,7 @@ def test_batched_per_buffer_location(self, init_cuda): def test_options_must_be_none(self, init_cuda): from cuda.core.managed_memory import advise + device = Device() _skip_if_managed_allocation_unsupported(device) device.set_current() From d6f60f247a8572de41a2abfc20d872898bdf71f8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 27 Apr 2026 19:08:33 -0700 Subject: [PATCH 31/68] refactor(cuda.core): move managed_memory ops to cuda.core.utils Per Leo's review request (https://github.com/NVIDIA/cuda-python/pull/1775#discussion_r2991209111), fold the managed-memory free functions and the Location dataclass into cuda.core.utils rather than maintaining a dedicated cuda.core.managed_memory namespace. - Re-export Location, advise, prefetch, discard, discard_prefetch from cuda.core.utils. - Delete cuda.core.managed_memory module. - Update cuda.core.__init__ to drop the managed_memory submodule import. - Update tests to import from cuda.core.utils. - Update api.rst: drop the dedicated Managed memory section; add the managed-memory entries to the Utility functions section. - Update 1.0.0-notes.rst accordingly. 
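
After this change a typical call site reads as follows (a minimal
usage sketch; `mr` stands in for any managed-memory MemoryResource,
as in the tests):

    from cuda.core import Device
    from cuda.core.utils import Location, advise, prefetch

    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    buf = mr.allocate(4096)  # assumed: mr is a managed-memory resource

    advise(buf, "set_preferred_location", Location.device(dev.device_id))
    prefetch(buf, Location.device(dev.device_id), stream=stream)
    stream.sync()
    buf.close()
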
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/__init__.py | 2 +- cuda_core/cuda/core/managed_memory.py | 10 -- cuda_core/cuda/core/utils.py | 9 +- cuda_core/docs/source/api.rst | 23 +--- cuda_core/docs/source/release/1.0.0-notes.rst | 22 ++-- cuda_core/tests/test_memory.py | 108 +++++++++--------- 6 files changed, 79 insertions(+), 95 deletions(-) delete mode 100644 cuda_core/cuda/core/managed_memory.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 61315dda5ae..dfd52accea3 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,7 @@ def _import_versioned_module(): del _import_versioned_module -from cuda.core import managed_memory, system, utils +from cuda.core import system, utils from cuda.core._device import Device from cuda.core._event import Event, EventOptions from cuda.core._graphics import GraphicsResource diff --git a/cuda_core/cuda/core/managed_memory.py b/cuda_core/cuda/core/managed_memory.py deleted file mode 100644 index 509e874ccc0..00000000000 --- a/cuda_core/cuda/core/managed_memory.py +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -"""Managed-memory range operations.""" - -from cuda.core._memory._managed_location import Location -from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch - -__all__ = ["Location", "advise", "discard", "discard_prefetch", "prefetch"] diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index f15d9242778..3d4b3e4c596 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -1,7 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 +from cuda.core._memory._managed_location import Location # noqa: F401 +from cuda.core._memory._managed_memory_ops import ( + advise, # noqa: F401 + discard, # noqa: F401 + discard_prefetch, # noqa: F401 + prefetch, # noqa: F401 +) from cuda.core._memoryview import ( StridedMemoryView, # noqa: F401 args_viewable_as_strided_memory, # noqa: F401 diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index fd0e01dedfb..fa17624fa5e 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -129,24 +129,6 @@ Each subclass exposes attributes unique to its operation type. graph.SwitchNode -.. module:: cuda.core.managed_memory - -Managed memory --------------- - -.. autosummary:: - :toctree: generated/ - - Location - advise - prefetch - discard - discard_prefetch - -.. 
module:: cuda.core - :no-index: - - Graphics interoperability ------------------------- @@ -265,7 +247,12 @@ Utility functions :toctree: generated/ args_viewable_as_strided_memory + advise + prefetch + discard + discard_prefetch :template: autosummary/cyclass.rst + Location StridedMemoryView diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 25e90667611..17696b616a1 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -16,17 +16,17 @@ Highlights New features ------------ -- Added managed-memory range operations under :mod:`cuda.core.managed_memory`: - :class:`~managed_memory.Location`, :func:`~managed_memory.advise`, - :func:`~managed_memory.prefetch`, :func:`~managed_memory.discard`, and - :func:`~managed_memory.discard_prefetch`. Each operation accepts either a - single managed :class:`Buffer` or a sequence; with cuda.bindings 12.8+ - the N>1 case dispatches to the corresponding ``cuMem*BatchAsync`` driver - entry point, addressing the managed-memory portion of #1333. Locations - are expressed via the typed :class:`~managed_memory.Location` dataclass - (with classmethod constructors ``device``, ``host``, ``host_numa``, and - ``host_numa_current``); ``Device`` and ``int`` values are still accepted - for ergonomic compatibility. +- Added managed-memory range operations to :mod:`cuda.core.utils`: + :class:`~utils.Location`, :func:`~utils.advise`, :func:`~utils.prefetch`, + :func:`~utils.discard`, and :func:`~utils.discard_prefetch`. Each + operation accepts either a single managed :class:`Buffer` or a + sequence; with cuda.bindings 12.8+ the N>1 case dispatches to the + corresponding ``cuMem*BatchAsync`` driver entry point, addressing the + managed-memory portion of #1333. Locations are expressed via the typed + :class:`~utils.Location` dataclass (with classmethod constructors + ``device``, ``host``, ``host_numa``, and ``host_numa_current``); + ``Device`` and ``int`` values are still accepted for ergonomic + compatibility. 
Fixes and enhancements diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 36fdfd03475..18f7bed1141 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -38,7 +38,7 @@ PinnedMemoryResourceOptions, VirtualMemoryResource, VirtualMemoryResourceOptions, - managed_memory, + utils, ) from cuda.core import ( system as ccx_system, @@ -1243,7 +1243,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( buffer, @@ -1251,7 +1251,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): ) assert last_location == _HOST_LOCATION_ID - managed_memory.prefetch(buffer, device, stream=stream) + utils.prefetch(buffer, device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( buffer, @@ -1263,7 +1263,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location device = Device() _skip_if_managed_location_ops_unsupported(device) @@ -1271,7 +1271,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - managed_memory.advise(buffer, "set_read_mostly") + utils.advise(buffer, "set_read_mostly") assert ( _get_int_mem_range_attr( buffer, @@ -1282,7 +1282,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): # cuda.bindings currently exposes the combined location attributes for # cuMemRangeGetAttribute, so use the legacy location query here. 
- managed_memory.advise(buffer, "set_preferred_location", Location.host()) + utils.advise(buffer, "set_preferred_location", Location.host()) preferred_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, @@ -1300,7 +1300,7 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - managed_memory.prefetch(buffer, device, stream=stream) + utils.prefetch(buffer, device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( @@ -1322,10 +1322,10 @@ def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_ buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - managed_memory.discard_prefetch(buffer, device, stream=stream) + utils.discard_prefetch(buffer, device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( @@ -1345,10 +1345,10 @@ def test_managed_memory_discard_prefetch_supports_external_managed_allocations(i buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - managed_memory.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) stream.sync() - managed_memory.discard_prefetch(buffer, device, stream=stream) + utils.discard_prefetch(buffer, device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( @@ -1368,11 +1368,11 @@ def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): stream = device.create_stream() with pytest.raises(ValueError, match="managed-memory allocation"): - managed_memory.advise(buffer, "set_read_mostly") + utils.advise(buffer, "set_read_mostly") with pytest.raises(ValueError, match="managed-memory allocation"): - managed_memory.prefetch(buffer, device, stream=stream) + utils.prefetch(buffer, device, stream=stream) with pytest.raises(ValueError, match="managed-memory allocation"): - managed_memory.discard_prefetch(buffer, device, stream=stream) + utils.discard_prefetch(buffer, device, stream=stream) buffer.close() @@ -1387,18 +1387,18 @@ def test_managed_memory_operation_validation(init_cuda): stream = device.create_stream() with pytest.raises(ValueError, match="location is required"): - managed_memory.prefetch(buffer, stream=stream) - from cuda.core.managed_memory import Location + utils.prefetch(buffer, stream=stream) + from cuda.core.utils import Location with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) + utils.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) buffer.close() def test_managed_memory_advise_location_validation(init_cuda): """Verify doc-specified location constraints for each advice kind.""" - from cuda.core.managed_memory import Location + from cuda.core.utils import Location device = Device() _skip_if_managed_location_ops_unsupported(device) @@ -1407,25 +1407,25 @@ def test_managed_memory_advise_location_validation(init_cuda): buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) # set_read_mostly works without a location (location is ignored) - 
managed_memory.advise(buffer, "set_read_mostly") + utils.advise(buffer, "set_read_mostly") # set_preferred_location requires a location; device ordinal works - managed_memory.advise(buffer, "set_preferred_location", device.device_id) + utils.advise(buffer, "set_preferred_location", device.device_id) # set_preferred_location with host location - managed_memory.advise(buffer, "set_preferred_location", Location.host()) + utils.advise(buffer, "set_preferred_location", Location.host()) # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - managed_memory.advise(buffer, "set_accessed_by", Location.host_numa(0)) + utils.advise(buffer, "set_accessed_by", Location.host_numa(0)) # set_accessed_by with host_numa_current also raises ValueError with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): - managed_memory.advise(buffer, "set_accessed_by", Location.host_numa_current()) + utils.advise(buffer, "set_accessed_by", Location.host_numa_current()) # Inferred location from int: -1 maps to host, 0 maps to device - managed_memory.advise(buffer, "set_preferred_location", -1) - managed_memory.advise(buffer, "set_preferred_location", 0) + utils.advise(buffer, "set_preferred_location", -1) + utils.advise(buffer, "set_preferred_location", 0) buffer.close() @@ -1439,7 +1439,7 @@ def test_managed_memory_advise_accepts_enum_value(init_cuda): buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY - managed_memory.advise(buffer, advice_enum) + utils.advise(buffer, advice_enum) assert ( _get_int_mem_range_attr( @@ -1461,10 +1461,10 @@ def test_managed_memory_advise_invalid_advice_values(init_cuda): buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) with pytest.raises(ValueError, match="advice must be one of"): - managed_memory.advise(buffer, "not_a_real_advice") + utils.advise(buffer, "not_a_real_advice") with pytest.raises(TypeError, match="advice must be"): - managed_memory.advise(buffer, 42) + utils.advise(buffer, 42) buffer.close() @@ -1875,28 +1875,28 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): class TestLocation: def test_device_constructor(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.device(0) assert loc.kind == "device" assert loc.id == 0 def test_host_constructor(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.host() assert loc.kind == "host" assert loc.id is None def test_host_numa_constructor(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.host_numa(3) assert loc.kind == "host_numa" assert loc.id == 3 def test_host_numa_current_constructor(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.host_numa_current() assert loc.kind == "host_numa_current" @@ -1905,20 +1905,20 @@ def test_host_numa_current_constructor(self): def test_frozen(self): import dataclasses - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.device(0) with pytest.raises(dataclasses.FrozenInstanceError): loc.id = 1 def test_invalid_device_id(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location with 
pytest.raises(ValueError, match="device id must be >= 0"): Location.device(-1) def test_invalid_kind(self): - from cuda.core.managed_memory import Location + from cuda.core.utils import Location with pytest.raises(ValueError, match="kind must be one of"): Location(kind="not_a_kind", id=None) @@ -1927,7 +1927,7 @@ def test_invalid_kind(self): class TestLocationCoerce: def test_passthrough(self): from cuda.core._memory._managed_location import _coerce_location - from cuda.core.managed_memory import Location + from cuda.core.utils import Location loc = Location.device(0) assert _coerce_location(loc) is loc @@ -1978,7 +1978,7 @@ def test_bad_type(self): class TestPrefetch: def test_single_with_location_host(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -1997,7 +1997,7 @@ def test_single_with_location_host(self, init_cuda): buf.close() def test_batched_same_location(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2020,7 +2020,7 @@ def test_batched_same_location(self, init_cuda): buf.close() def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2048,7 +2048,7 @@ def test_batched_per_buffer_location(self, init_cuda): buf.close() def test_length_mismatch(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2063,7 +2063,7 @@ def test_length_mismatch(self, init_cuda): buf.close() def test_rejects_non_managed(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() device.set_current() @@ -2074,7 +2074,7 @@ def test_rejects_non_managed(self, init_cuda): buf.close() def test_location_required(self, init_cuda): - from cuda.core.managed_memory import prefetch + from cuda.core.utils import prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2087,7 +2087,7 @@ def test_location_required(self, init_cuda): buf.close() def test_options_must_be_none(self, init_cuda): - from cuda.core.managed_memory import Location, prefetch + from cuda.core.utils import Location, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2102,7 +2102,7 @@ def test_options_must_be_none(self, init_cuda): class TestDiscard: def test_single_buffer(self, init_cuda): - from cuda.core.managed_memory import Location, discard, prefetch + from cuda.core.utils import Location, discard, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2119,7 +2119,7 @@ def test_single_buffer(self, init_cuda): buf.close() def test_batched(self, init_cuda): - from cuda.core.managed_memory import Location, discard, prefetch + from cuda.core.utils import Location, discard, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2137,7 +2137,7 @@ def test_batched(self, init_cuda): buf.close() def test_rejects_non_managed(self, init_cuda): - from cuda.core.managed_memory import discard + from cuda.core.utils import discard device = Device() device.set_current() @@ -2148,7 +2148,7 @@ def 
test_rejects_non_managed(self, init_cuda): buf.close() def test_options_must_be_none(self, init_cuda): - from cuda.core.managed_memory import discard + from cuda.core.utils import discard device = Device() skip_if_managed_memory_unsupported(device) @@ -2163,7 +2163,7 @@ def test_options_must_be_none(self, init_cuda): class TestDiscardPrefetch: def test_single_buffer(self, init_cuda): - from cuda.core.managed_memory import Location, discard_prefetch, prefetch + from cuda.core.utils import Location, discard_prefetch, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2187,7 +2187,7 @@ def test_single_buffer(self, init_cuda): buf.close() def test_batched_same_location(self, init_cuda): - from cuda.core.managed_memory import Location, discard_prefetch, prefetch + from cuda.core.utils import Location, discard_prefetch, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2210,7 +2210,7 @@ def test_batched_same_location(self, init_cuda): buf.close() def test_length_mismatch(self, init_cuda): - from cuda.core.managed_memory import Location, discard_prefetch + from cuda.core.utils import Location, discard_prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -2224,7 +2224,7 @@ def test_length_mismatch(self, init_cuda): buf.close() def test_rejects_non_managed(self, init_cuda): - from cuda.core.managed_memory import Location, discard_prefetch + from cuda.core.utils import Location, discard_prefetch device = Device() device.set_current() @@ -2237,7 +2237,7 @@ def test_rejects_non_managed(self, init_cuda): class TestAdvise: def test_batched_same_advice(self, init_cuda): - from cuda.core.managed_memory import advise + from cuda.core.utils import advise device = Device() _skip_if_managed_location_ops_unsupported(device) @@ -2255,7 +2255,7 @@ def test_batched_same_advice(self, init_cuda): buf.close() def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.managed_memory import Location, advise + from cuda.core.utils import Location, advise device = Device() _skip_if_managed_location_ops_unsupported(device) @@ -2270,7 +2270,7 @@ def test_batched_per_buffer_location(self, init_cuda): buf.close() def test_options_must_be_none(self, init_cuda): - from cuda.core.managed_memory import advise + from cuda.core.utils import advise device = Device() _skip_if_managed_allocation_unsupported(device) From 3176271b7f5bc8427cc567dc6673dea3e64bdfd7 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 28 Apr 2026 08:41:24 -0700 Subject: [PATCH 32/68] chore(cuda.core): use __all__ in utils instead of per-import noqa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace seven `# noqa: F401` comments with a single `__all__` block listing the public re-exports. Cleaner intent signal — these are deliberate facade exports, not accidental imports — and matches the existing __all__ convention used in cuda.core.system, _legacy.py, and typing.py. 
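Illustrative effect (hypothetical snippet, not part of this diff):

    from cuda.core import utils

    # Star-imports and doc tooling now follow the explicit facade list.
    assert set(utils.__all__) >= {"Location", "advise", "prefetch"}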
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 cuda_core/cuda/core/utils.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py
index 3d4b3e4c596..08ffaff14ac 100644
--- a/cuda_core/cuda/core/utils.py
+++ b/cuda_core/cuda/core/utils.py
@@ -2,14 +2,16 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from cuda.core._memory._managed_location import Location  # noqa: F401
-from cuda.core._memory._managed_memory_ops import (
-    advise,  # noqa: F401
-    discard,  # noqa: F401
-    discard_prefetch,  # noqa: F401
-    prefetch,  # noqa: F401
-)
-from cuda.core._memoryview import (
-    StridedMemoryView,  # noqa: F401
-    args_viewable_as_strided_memory,  # noqa: F401
-)
+from cuda.core._memory._managed_location import Location
+from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch
+from cuda.core._memoryview import StridedMemoryView, args_viewable_as_strided_memory
+
+__all__ = [
+    "Location",
+    "StridedMemoryView",
+    "advise",
+    "args_viewable_as_strided_memory",
+    "discard",
+    "discard_prefetch",
+    "prefetch",
+]

From 782f6a9b9dca9dbf4dc42937e490c97c0d267791 Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Tue, 28 Apr 2026 08:42:40 -0700
Subject: [PATCH 33/68] chore(cuda.core): collapse nested if in
 Location.__post_init__ (SIM102)

ruff SIM102 flagged the host/host_numa_current branch:

    elif self.kind in ("host", "host_numa_current"):
        if self.id is not None:
            raise ValueError(...)

This patch collapses it into a single condition with `and`.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 cuda_core/cuda/core/_memory/_managed_location.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py
index 0e89cb92e37..9d2eee23b21 100644
--- a/cuda_core/cuda/core/_memory/_managed_location.py
+++ b/cuda_core/cuda/core/_memory/_managed_location.py
@@ -30,9 +30,8 @@ def __post_init__(self) -> None:
         elif self.kind == "host_numa":
             if not isinstance(self.id, int) or self.id < 0:
                 raise ValueError("host_numa id must be >= 0")
-        elif self.kind in ("host", "host_numa_current"):
-            if self.id is not None:
-                raise ValueError(f"{self.kind} location must have id=None")
+        elif self.kind in ("host", "host_numa_current") and self.id is not None:
+            raise ValueError(f"{self.kind} location must have id=None")
 
     @classmethod
     def device(cls, device_id: int) -> Location:

From 0789bf629922a080f268d2f29786bf33d31efff6 Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Tue, 28 Apr 2026 10:12:21 -0700
Subject: [PATCH 34/68] test(cuda.core): share one DummyUnifiedMemoryResource
 per batched test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both batched-advise tests previously created a throwaway
DummyUnifiedMemoryResource per allocation inside a list comprehension:

    bufs = [DummyUnifiedMemoryResource(device).allocate(size) for _ in range(2)]

The Buffer holds a reference to the throwaway MR via mr=self, so the MR
should stay alive — but on CUDA 12.9.1 CI test_batched_same_advice fails
with bufs[0] showing ptr=0x0 size=0 (the post-close state). On CUDA 13
the same pattern works.

Switch to one MR shared across both allocations. This is cleaner anyway
and removes the throwaway-per-iteration pattern as a possible source of
the cu12 issue. If the failure persists, we'll know the MR lifetime
wasn't the cause.
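The shared-MR form this patch switches to (same `size` shorthand as
above):

    mr = DummyUnifiedMemoryResource(device)
    bufs = [mr.allocate(size) for _ in range(2)]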
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/tests/test_memory.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 18f7bed1141..b4c3ca0d576 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -2242,7 +2242,8 @@ def test_batched_same_advice(self, init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + mr = DummyUnifiedMemoryResource(device) + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] advise(bufs, "set_read_mostly") for buf in bufs: assert ( @@ -2260,7 +2261,8 @@ def test_batched_per_buffer_location(self, init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() - bufs = [DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + mr = DummyUnifiedMemoryResource(device) + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] advise( bufs, "set_preferred_location", From e0c782a547d4f521e1eb7713e7f6bfad91a3366c Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 28 Apr 2026 10:42:14 -0700 Subject: [PATCH 35/68] test(cuda.core): query all buffers before closing in test_batched_same_advice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On CUDA 12, freeing one managed allocation appears to clear the read-mostly advice on neighboring ranges. The original test interleaved query-then-close inside one loop, so the second iteration would query bufs[1] *after* bufs[0] had been freed and observe a cleared advice flag — causing assert 0 == 1. Move the queries into a list comprehension that runs before any close, then close all buffers, then assert. Decouples the verification from the deallocation order. CUDA 13 was unaffected because its managed-memory bookkeeping does not exhibit the cross-range invalidation on free. Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/tests/test_memory.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index b4c3ca0d576..2844c7f1680 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -2245,15 +2245,21 @@ def test_batched_same_advice(self, init_cuda): mr = DummyUnifiedMemoryResource(device) bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] advise(bufs, "set_read_mostly") - for buf in bufs: - assert ( - _get_int_mem_range_attr( - buf, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - ) - == _READ_MOSTLY_ENABLED + # Query all attributes BEFORE closing any buffer. On CUDA 12, freeing + # a managed allocation can clear read-mostly advice on neighboring + # ranges; close-then-query in a single loop falsely flags the later + # iterations as having lost the advice. 
+        results = [
+            _get_int_mem_range_attr(
+                buf,
+                driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
             )
+            for buf in bufs
+        ]
+        for buf in bufs:
             buf.close()
+        for r in results:
+            assert r == _READ_MOSTLY_ENABLED
 
     def test_batched_per_buffer_location(self, init_cuda):
         from cuda.core.utils import Location, advise

From 10de998e6f49b83e71bd88db8666f40f725fe8ac Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Thu, 30 Apr 2026 11:55:36 -0700
Subject: [PATCH 36/68] review(cuda.core): address PR #1775 feedback

- Drop defensive cuInit retry in _query_memory_attrs (Andy): we don't
  auto-init CUDA elsewhere; let HANDLE_RETURN propagate the error.
- Use checked Cython cast `<Buffer?>t` in _coerce_buffer_targets (Leo)
  in place of the manual isinstance loop.
- Introduce *Options dataclasses (AdviseOptions, PrefetchOptions,
  DiscardOptions, DiscardPrefetchOptions) per cuda.core convention
  (Leo). Functions accept None or the matching dataclass; tests
  updated to match the new error message.
---
 cuda_core/cuda/core/_memory/_buffer.pyx       |  6 --
 .../cuda/core/_memory/_managed_memory_ops.pyx | 55 +++++++++++--------
 .../core/_memory/_managed_memory_options.py   | 43 +++++++++++++++
 cuda_core/cuda/core/utils.py                  | 10 ++++
 cuda_core/tests/test_memory.py                |  6 +-
 5 files changed, 89 insertions(+), 31 deletions(-)
 create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_options.py

diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx
index 4ca8650e8db..0032c605163 100644
--- a/cuda_core/cuda/core/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/_memory/_buffer.pyx
@@ -36,7 +36,6 @@ else:
 
 from cuda.core._dlpack import classify_dl_device, make_py_capsule
 from cuda.core._utils.cuda_utils import driver
-from cuda.core._device import Device
 
 
 # =============================================================================
@@ -449,11 +448,6 @@ cdef inline int _query_memory_attrs(
     cdef cydriver.CUresult ret
 
     ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr)
-    if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED:
-        with cython.gil:
-            # Device class handles the cuInit call internally
-            Device()
-            ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr)
     HANDLE_RETURN(ret)
 
     # TODO: HMM/ATS-enabled sysmem should also report is_managed=True; the

diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
index 9926cbe67f8..5b64a740a60 100644
--- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
+++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx
@@ -15,6 +15,12 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 from cuda.core._utils.cuda_utils import driver
 
 from cuda.core._memory._managed_location import _coerce_location
+from cuda.core._memory._managed_memory_options import (
+    AdviseOptions,
+    DiscardOptions,
+    DiscardPrefetchOptions,
+    PrefetchOptions,
+)
 
 
 cdef dict _MANAGED_ADVICE_ALIASES = {
@@ -89,6 +95,7 @@ cdef void _require_managed_buffer(Buffer self, str what):
 
 
 cdef tuple _coerce_buffer_targets(object targets, str what):
+    cdef Buffer buf
     cdef list out
     if isinstance(targets, Buffer):
         return (targets,)
@@ -97,11 +104,8 @@
             raise ValueError(f"{what}: empty targets sequence")
         out = []
         for t in targets:
-            if not isinstance(t, Buffer):
-                raise TypeError(
-                    f"{what}: each target must be a Buffer, got {type(t).__name__}"
-                )
-            out.append(t)
+            buf = <Buffer?>t
+            out.append(buf)
         return tuple(out)
     raise TypeError(
         f"{what}: targets must be a Buffer or 
sequence of Buffer, " @@ -167,8 +171,9 @@ def discard( One or more managed allocations to discard. Their resident pages are released without prefetching new contents; subsequent access is satisfied by lazy migration. - options : None - Reserved for future per-call flags. Must be ``None``. + options : :class:`DiscardOptions`, optional + Reserved for future per-call flags. ``None`` (default) and + ``DiscardOptions()`` are equivalent. stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` Stream for the asynchronous discard (keyword-only). @@ -177,9 +182,10 @@ def discard( NotImplementedError On a CUDA 12 build of ``cuda.core``. Discard requires CUDA 13+. """ - if options is not None: + if options is not None and not isinstance(options, DiscardOptions): raise TypeError( - f"discard options must be None (reserved); got {type(options).__name__}" + "discard options must be a DiscardOptions instance or None, " + f"got {type(options).__name__}" ) cdef tuple bufs = _coerce_buffer_targets(targets, "discard") cdef Stream s = Stream_accept(stream) @@ -246,12 +252,14 @@ def advise( location; ignored (may be ``None``) for ``set_read_mostly``, ``unset_read_mostly``, and ``unset_preferred_location``. A sequence must match ``len(targets)``. - options : None - Reserved for future per-call flags. Must be ``None``. + options : :class:`AdviseOptions`, optional + Reserved for future per-call flags. ``None`` (default) and + ``AdviseOptions()`` are equivalent. """ - if options is not None: + if options is not None and not isinstance(options, AdviseOptions): raise TypeError( - f"advise options must be None (reserved); got {type(options).__name__}" + "advise options must be an AdviseOptions instance or None, " + f"got {type(options).__name__}" ) cdef str advice_name cdef object advice_value @@ -317,8 +325,9 @@ def prefetch( Target location(s). A single location applies to all targets; a sequence must match ``len(targets)``. ``Device`` and ``int`` values are coerced to :class:`Location` (``-1`` maps to host). - options : None - Reserved for future per-call flags. Must be ``None``. + options : :class:`PrefetchOptions`, optional + Reserved for future per-call flags. ``None`` (default) and + ``PrefetchOptions()`` are equivalent. stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` Stream for the asynchronous prefetch (keyword-only). @@ -327,9 +336,10 @@ def prefetch( NotImplementedError If ``len(targets) > 1`` on a CUDA 12 build of ``cuda.core``. """ - if options is not None: + if options is not None and not isinstance(options, PrefetchOptions): raise TypeError( - f"prefetch options must be None (reserved); got {type(options).__name__}" + "prefetch options must be a PrefetchOptions instance or None, " + f"got {type(options).__name__}" ) cdef tuple bufs = _coerce_buffer_targets(targets, "prefetch") cdef Py_ssize_t n = len(bufs) @@ -420,8 +430,9 @@ def discard_prefetch( location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] Target location(s). A single location applies to all targets; a sequence must match ``len(targets)``. - options : None - Reserved for future per-call flags. Must be ``None``. + options : :class:`DiscardPrefetchOptions`, optional + Reserved for future per-call flags. ``None`` (default) and + ``DiscardPrefetchOptions()`` are equivalent. stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` Stream for the asynchronous operation (keyword-only). @@ -431,10 +442,10 @@ def discard_prefetch( On a CUDA 12 build of ``cuda.core``. 
Discard-and-prefetch requires CUDA 13+. """ - if options is not None: + if options is not None and not isinstance(options, DiscardPrefetchOptions): raise TypeError( - f"discard_prefetch options must be None (reserved); " - f"got {type(options).__name__}" + "discard_prefetch options must be a DiscardPrefetchOptions " + f"instance or None, got {type(options).__name__}" ) cdef tuple bufs = _coerce_buffer_targets(targets, "discard_prefetch") cdef Py_ssize_t n = len(bufs) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_options.py b/cuda_core/cuda/core/_memory/_managed_memory_options.py new file mode 100644 index 00000000000..68754f2731b --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_options.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class AdviseOptions: + """Per-call options for :func:`cuda.core.utils.advise`. + + Reserved for future advise flags. Currently has no fields; pass + ``AdviseOptions()`` or ``None`` to use driver defaults. + """ + + +@dataclass(frozen=True) +class PrefetchOptions: + """Per-call options for :func:`cuda.core.utils.prefetch`. + + Reserved for future prefetch flags. Currently has no fields; pass + ``PrefetchOptions()`` or ``None`` to use driver defaults. + """ + + +@dataclass(frozen=True) +class DiscardOptions: + """Per-call options for :func:`cuda.core.utils.discard`. + + Reserved for future discard flags. Currently has no fields; pass + ``DiscardOptions()`` or ``None`` to use driver defaults. + """ + + +@dataclass(frozen=True) +class DiscardPrefetchOptions: + """Per-call options for :func:`cuda.core.utils.discard_prefetch`. + + Reserved for future discard-and-prefetch flags. Currently has no + fields; pass ``DiscardPrefetchOptions()`` or ``None`` to use driver + defaults. 
+ """ diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index 08ffaff14ac..ece40ad807c 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -4,10 +4,20 @@ from cuda.core._memory._managed_location import Location from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch +from cuda.core._memory._managed_memory_options import ( + AdviseOptions, + DiscardOptions, + DiscardPrefetchOptions, + PrefetchOptions, +) from cuda.core._memoryview import StridedMemoryView, args_viewable_as_strided_memory __all__ = [ + "AdviseOptions", + "DiscardOptions", + "DiscardPrefetchOptions", "Location", + "PrefetchOptions", "StridedMemoryView", "advise", "args_viewable_as_strided_memory", diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 2844c7f1680..0639e4505d7 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -2095,7 +2095,7 @@ def test_options_must_be_none(self, init_cuda): mr = create_managed_memory_resource_or_skip() buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(TypeError, match="must be None"): + with pytest.raises(TypeError, match="must be a .*Options instance or None"): prefetch(buf, Location.host(), options={}, stream=stream) buf.close() @@ -2156,7 +2156,7 @@ def test_options_must_be_none(self, init_cuda): mr = create_managed_memory_resource_or_skip() buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(TypeError, match="must be None"): + with pytest.raises(TypeError, match="must be a .*Options instance or None"): discard(buf, options={}, stream=stream) buf.close() @@ -2284,6 +2284,6 @@ def test_options_must_be_none(self, init_cuda): _skip_if_managed_allocation_unsupported(device) device.set_current() buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - with pytest.raises(TypeError, match="must be None"): + with pytest.raises(TypeError, match="must be a .*Options instance or None"): advise(buf, "set_read_mostly", options={}) buf.close() From ab9a3abdd17ab3a3ec64ee03cf676b2eb0579d1e Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 12:15:25 -0700 Subject: [PATCH 37/68] test(cuda.core): split managed-memory ops tests into tests/memory/ Move the managed-memory advise/prefetch/discard/discard_prefetch tests (plus their TestLocation/TestLocationCoerce/TestPrefetch/TestDiscard/ TestDiscardPrefetch/TestAdvise classes and skip helpers) from test_memory.py into tests/memory/test_managed_ops.py per Andy's nit. Promote DummyDeviceMemoryResource to helpers.buffers so both files can import it; the remaining DummyHost/DummyPinned/NullMemoryResource stay in test_memory.py since they're only used there. Broader memory-tests reorg ("and siblings": buffer/managed_resource/ pinned/vmm) tracked as a follow-up cleanup PR to keep this diff focused. --- cuda_core/tests/helpers/buffers.py | 25 + cuda_core/tests/memory/test_managed_ops.py | 711 ++++++++++++++++++++ cuda_core/tests/test_memory.py | 716 +-------------------- 3 files changed, 737 insertions(+), 715 deletions(-) create mode 100644 cuda_core/tests/memory/test_managed_ops.py diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index fbd5428c28b..299955b2180 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -9,6 +9,7 @@ from . 
import libc __all__ = [ + "DummyDeviceMemoryResource", "DummyUnifiedMemoryResource", "PatternGen", "TrackingMR", @@ -18,6 +19,30 @@ ] +class DummyDeviceMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAlloc(size)) + return Buffer.from_handle(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return 0 + + class DummyUnifiedMemoryResource(MemoryResource): def __init__(self, device): self.device = device diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py new file mode 100644 index 00000000000..bea4776688e --- /dev/null +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -0,0 +1,711 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from helpers.buffers import DummyDeviceMemoryResource, DummyUnifiedMemoryResource + +from conftest import ( + create_managed_memory_resource_or_skip, + skip_if_managed_memory_unsupported, +) +from cuda.core import Device, utils +from cuda.core._utils.cuda_utils import handle_return + +try: + from cuda.bindings import driver +except ImportError: + from cuda import cuda as driver + + +_MANAGED_TEST_ALLOCATION_SIZE = 4096 +_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4 +_READ_MOSTLY_ENABLED = 1 +_HOST_LOCATION_ID = -1 +_INVALID_HOST_DEVICE_ORDINAL = 0 + + +def _get_mem_range_attr(buffer, attribute, data_size): + # cuMemRangeGetAttribute returns a raw integer when data_size <= 4. 
+ return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) + + +def _get_int_mem_range_attr(buffer, attribute): + return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE) + + +def _skip_if_managed_allocation_unsupported(device): + try: + if not device.properties.managed_memory: + pytest.skip("Device does not support managed memory operations") + except AttributeError: + pytest.skip("Managed-memory buffer operations require CUDA support") + + +def _skip_if_managed_location_ops_unsupported(device): + _skip_if_managed_allocation_unsupported(device) + try: + if not device.properties.concurrent_managed_access: + pytest.skip("Device does not support concurrent managed memory access") + except AttributeError: + pytest.skip("Managed-memory location operations require CUDA support") + + +def _skip_if_managed_discard_prefetch_unsupported(device): + _skip_if_managed_location_ops_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("discard-prefetch requires cuda.bindings support") + + visible_devices = Device.get_all_devices() + if not all(dev.properties.concurrent_managed_access for dev in visible_devices): + pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") + + +def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == _HOST_LOCATION_ID + + utils.prefetch(buffer, device, stream=stream) + stream.sync() + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): + from cuda.core.utils import Location + + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + utils.advise(buffer, "set_read_mostly") + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + # cuda.bindings currently exposes the combined location attributes for + # cuMemRangeGetAttribute, so use the legacy location query here. 
+ utils.advise(buffer, "set_preferred_location", Location.host()) + preferred_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, + ) + assert preferred_location == _HOST_LOCATION_ID + + buffer.close() + + +def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + utils.prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + _skip_if_managed_discard_prefetch_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + + utils.discard_prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda): + device = Device() + _skip_if_managed_discard_prefetch_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + stream.sync() + + utils.discard_prefetch(buffer, device, stream=stream) + stream.sync() + + last_location = _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last_location == device.device_id + + buffer.close() + + +def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): + device = Device() + device.set_current() + + buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + with pytest.raises(ValueError, match="managed-memory allocation"): + utils.advise(buffer, "set_read_mostly") + with pytest.raises(ValueError, match="managed-memory allocation"): + utils.prefetch(buffer, device, stream=stream) + with pytest.raises(ValueError, match="managed-memory allocation"): + utils.discard_prefetch(buffer, device, stream=stream) + + buffer.close() + + +def test_managed_memory_operation_validation(init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = create_managed_memory_resource_or_skip() + buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + with pytest.raises(ValueError, match="location is required"): + utils.prefetch(buffer, stream=stream) + from cuda.core.utils import Location + + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + utils.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) + + buffer.close() + + +def test_managed_memory_advise_location_validation(init_cuda): + 
"""Verify doc-specified location constraints for each advice kind.""" + from cuda.core.utils import Location + + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + # set_read_mostly works without a location (location is ignored) + utils.advise(buffer, "set_read_mostly") + + # set_preferred_location requires a location; device ordinal works + utils.advise(buffer, "set_preferred_location", device.device_id) + + # set_preferred_location with host location + utils.advise(buffer, "set_preferred_location", Location.host()) + + # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) + with pytest.raises(ValueError, match="does not support location_type='host_numa'"): + utils.advise(buffer, "set_accessed_by", Location.host_numa(0)) + + # set_accessed_by with host_numa_current also raises ValueError + with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): + utils.advise(buffer, "set_accessed_by", Location.host_numa_current()) + + # Inferred location from int: -1 maps to host, 0 maps to device + utils.advise(buffer, "set_preferred_location", -1) + utils.advise(buffer, "set_preferred_location", 0) + + buffer.close() + + +def test_managed_memory_advise_accepts_enum_value(init_cuda): + """advise() accepts CUmem_advise enum values directly, not just string aliases.""" + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY + utils.advise(buffer, advice_enum) + + assert ( + _get_int_mem_range_attr( + buffer, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + == _READ_MOSTLY_ENABLED + ) + + buffer.close() + + +def test_managed_memory_advise_invalid_advice_values(init_cuda): + """advise() rejects invalid advice strings and wrong types.""" + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + + buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + + with pytest.raises(ValueError, match="advice must be one of"): + utils.advise(buffer, "not_a_real_advice") + + with pytest.raises(TypeError, match="advice must be"): + utils.advise(buffer, 42) + + buffer.close() + + +class TestLocation: + def test_device_constructor(self): + from cuda.core.utils import Location + + loc = Location.device(0) + assert loc.kind == "device" + assert loc.id == 0 + + def test_host_constructor(self): + from cuda.core.utils import Location + + loc = Location.host() + assert loc.kind == "host" + assert loc.id is None + + def test_host_numa_constructor(self): + from cuda.core.utils import Location + + loc = Location.host_numa(3) + assert loc.kind == "host_numa" + assert loc.id == 3 + + def test_host_numa_current_constructor(self): + from cuda.core.utils import Location + + loc = Location.host_numa_current() + assert loc.kind == "host_numa_current" + assert loc.id is None + + def test_frozen(self): + import dataclasses + + from cuda.core.utils import Location + + loc = Location.device(0) + with pytest.raises(dataclasses.FrozenInstanceError): + loc.id = 1 + + def test_invalid_device_id(self): + from cuda.core.utils import Location + + with pytest.raises(ValueError, match="device id must be >= 0"): + Location.device(-1) + + def test_invalid_kind(self): + from 
cuda.core.utils import Location + + with pytest.raises(ValueError, match="kind must be one of"): + Location(kind="not_a_kind", id=None) + + +class TestLocationCoerce: + def test_passthrough(self): + from cuda.core._memory._managed_location import _coerce_location + from cuda.core.utils import Location + + loc = Location.device(0) + assert _coerce_location(loc) is loc + + def test_int_device(self): + from cuda.core._memory._managed_location import _coerce_location + + assert _coerce_location(0).kind == "device" + assert _coerce_location(0).id == 0 + + def test_int_minus_one_is_host(self): + from cuda.core._memory._managed_location import _coerce_location + + assert _coerce_location(-1).kind == "host" + + def test_device_object(self, init_cuda): + from cuda.core import Device + from cuda.core._memory._managed_location import _coerce_location + + dev = Device() + loc = _coerce_location(dev) + assert loc.kind == "device" + assert loc.id == dev.device_id + + def test_none_when_disallowed(self): + from cuda.core._memory._managed_location import _coerce_location + + with pytest.raises(ValueError, match="location is required"): + _coerce_location(None, allow_none=False) + + def test_none_when_allowed(self): + from cuda.core._memory._managed_location import _coerce_location + + assert _coerce_location(None, allow_none=True) is None + + def test_bad_int(self): + from cuda.core._memory._managed_location import _coerce_location + + with pytest.raises(ValueError, match="device ordinal"): + _coerce_location(-2) + + def test_bad_type(self): + from cuda.core._memory._managed_location import _coerce_location + + with pytest.raises(TypeError, match="Location, Device, int, or None"): + _coerce_location("device") + + +class TestPrefetch: + def test_single_with_location_host(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + prefetch(buf, Location.host(), stream=stream) + stream.sync() + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == _HOST_LOCATION_ID + buf.close() + + def test_batched_same_location(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemPrefetchBatchAsync"): + pytest.skip("cuMemPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] + stream = device.create_stream() + + prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + + for buf in bufs: + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_batched_per_buffer_location(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemPrefetchBatchAsync"): + pytest.skip("cuMemPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + + prefetch(bufs, [Location.host(), 
Location.device(device.device_id)], stream=stream) + stream.sync() + + last0 = _get_int_mem_range_attr( + bufs[0], + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + last1 = _get_int_mem_range_attr( + bufs[1], + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last0 == _HOST_LOCATION_ID + assert last1 == device.device_id + for buf in bufs: + buf.close() + + def test_length_mismatch(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + + with pytest.raises(ValueError, match="length"): + prefetch(bufs, [Location.host()], stream=stream) + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + prefetch(buf, Location.host(), stream=stream) + buf.close() + + def test_location_required(self, init_cuda): + from cuda.core.utils import prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="location is required"): + prefetch(buf, None, stream=stream) + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.utils import Location, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(TypeError, match="must be a .*Options instance or None"): + prefetch(buf, Location.host(), options={}, stream=stream) + buf.close() + + +class TestDiscard: + def test_single_buffer(self, init_cuda): + from cuda.core.utils import Location, discard, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardBatchAsync"): + pytest.skip("cuMemDiscardBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + prefetch(buf, Location.device(device.device_id), stream=stream) + stream.sync() + discard(buf, stream=stream) + stream.sync() + buf.close() + + def test_batched(self, init_cuda): + from cuda.core.utils import Location, discard, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardBatchAsync"): + pytest.skip("cuMemDiscardBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] + stream = device.create_stream() + prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + discard(bufs, stream=stream) + stream.sync() + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.utils import discard + + device = Device() + device.set_current() + buf = 
DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + discard(buf, stream=stream) + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.utils import discard + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(TypeError, match="must be a .*Options instance or None"): + discard(buf, options={}, stream=stream) + buf.close() + + +class TestDiscardPrefetch: + def test_single_buffer(self, init_cuda): + from cuda.core.utils import Location, discard_prefetch, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + + prefetch(buf, Location.host(), stream=stream) + stream.sync() + discard_prefetch(buf, Location.device(device.device_id), stream=stream) + stream.sync() + + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_batched_same_location(self, init_cuda): + from cuda.core.utils import Location, discard_prefetch, prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): + pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + prefetch(bufs, Location.host(), stream=stream) + stream.sync() + discard_prefetch(bufs, Location.device(device.device_id), stream=stream) + stream.sync() + for buf in bufs: + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + buf.close() + + def test_length_mismatch(self, init_cuda): + from cuda.core.utils import Location, discard_prefetch + + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + stream = device.create_stream() + with pytest.raises(ValueError, match="length"): + discard_prefetch(bufs, [Location.host()], stream=stream) + for buf in bufs: + buf.close() + + def test_rejects_non_managed(self, init_cuda): + from cuda.core.utils import Location, discard_prefetch + + device = Device() + device.set_current() + buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + with pytest.raises(ValueError, match="managed-memory"): + discard_prefetch(buf, Location.host(), stream=stream) + buf.close() + + +class TestAdvise: + def test_batched_same_advice(self, init_cuda): + from cuda.core.utils import advise + + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + mr = DummyUnifiedMemoryResource(device) + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + advise(bufs, "set_read_mostly") 
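+        # A single advice alias paired with a list of buffers applies that
+        # same advice to every range in the batch.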
+ # Query all attributes BEFORE closing any buffer. On CUDA 12, freeing + # a managed allocation can clear read-mostly advice on neighboring + # ranges; close-then-query in a single loop falsely flags the later + # iterations as having lost the advice. + results = [ + _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + ) + for buf in bufs + ] + for buf in bufs: + buf.close() + for r in results: + assert r == _READ_MOSTLY_ENABLED + + def test_batched_per_buffer_location(self, init_cuda): + from cuda.core.utils import Location, advise + + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + mr = DummyUnifiedMemoryResource(device) + bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] + advise( + bufs, + "set_preferred_location", + [Location.host(), Location.device(device.device_id)], + ) + for buf in bufs: + buf.close() + + def test_options_must_be_none(self, init_cuda): + from cuda.core.utils import advise + + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + with pytest.raises(TypeError, match="must be a .*Options instance or None"): + advise(buf, "set_read_mostly", options={}) + buf.close() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0639e4505d7..7804c31780e 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -18,7 +18,7 @@ import pytest from helpers import IS_WINDOWS, supports_ipc_mempool -from helpers.buffers import DummyUnifiedMemoryResource, TrackingMR +from helpers.buffers import DummyDeviceMemoryResource, DummyUnifiedMemoryResource, TrackingMR from conftest import ( create_managed_memory_resource_or_skip, @@ -49,35 +49,6 @@ from cuda.core.utils import StridedMemoryView POOL_SIZE = 2097152 # 2MB size -_MANAGED_TEST_ALLOCATION_SIZE = 4096 -_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4 -_READ_MOSTLY_ENABLED = 1 -_HOST_LOCATION_ID = -1 -_INVALID_HOST_DEVICE_ORDINAL = 0 - - -class DummyDeviceMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAlloc(size)) - return Buffer.from_handle(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return False - - @property - def device_id(self) -> int: - return 0 class DummyHostMemoryResource(MemoryResource): @@ -1198,277 +1169,6 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): ) -def _get_mem_range_attr(buffer, attribute, data_size): - # cuMemRangeGetAttribute returns a raw integer when data_size <= 4. 
- return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) - - -def _get_int_mem_range_attr(buffer, attribute): - return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE) - - -def _skip_if_managed_allocation_unsupported(device): - try: - if not device.properties.managed_memory: - pytest.skip("Device does not support managed memory operations") - except AttributeError: - pytest.skip("Managed-memory buffer operations require CUDA support") - - -def _skip_if_managed_location_ops_unsupported(device): - _skip_if_managed_allocation_unsupported(device) - try: - if not device.properties.concurrent_managed_access: - pytest.skip("Device does not support concurrent managed memory access") - except AttributeError: - pytest.skip("Managed-memory location operations require CUDA support") - - -def _skip_if_managed_discard_prefetch_unsupported(device): - _skip_if_managed_location_ops_unsupported(device) - if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): - pytest.skip("discard-prefetch requires cuda.bindings support") - - visible_devices = Device.get_all_devices() - if not all(dev.properties.concurrent_managed_access for dev in visible_devices): - pytest.skip("discard-prefetch requires concurrent managed access on all visible devices") - - -def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - - mr = create_managed_memory_resource_or_skip() - buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) - stream.sync() - last_location = _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last_location == _HOST_LOCATION_ID - - utils.prefetch(buffer, device, stream=stream) - stream.sync() - last_location = _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last_location == device.device_id - - buffer.close() - - -def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): - from cuda.core.utils import Location - - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - - utils.advise(buffer, "set_read_mostly") - assert ( - _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - ) - == _READ_MOSTLY_ENABLED - ) - - # cuda.bindings currently exposes the combined location attributes for - # cuMemRangeGetAttribute, so use the legacy location query here. 
- utils.advise(buffer, "set_preferred_location", Location.host()) - preferred_location = _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, - ) - assert preferred_location == _HOST_LOCATION_ID - - buffer.close() - - -def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda): - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - utils.prefetch(buffer, device, stream=stream) - stream.sync() - - last_location = _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last_location == device.device_id - - buffer.close() - - -def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda): - device = Device() - skip_if_managed_memory_unsupported(device) - _skip_if_managed_discard_prefetch_unsupported(device) - device.set_current() - - mr = create_managed_memory_resource_or_skip() - buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) - stream.sync() - - utils.discard_prefetch(buffer, device, stream=stream) - stream.sync() - - last_location = _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last_location == device.device_id - - buffer.close() - - -def test_managed_memory_discard_prefetch_supports_external_managed_allocations(init_cuda): - device = Device() - _skip_if_managed_discard_prefetch_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) - stream.sync() - - utils.discard_prefetch(buffer, device, stream=stream) - stream.sync() - - last_location = _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last_location == device.device_id - - buffer.close() - - -def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): - device = Device() - device.set_current() - - buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - with pytest.raises(ValueError, match="managed-memory allocation"): - utils.advise(buffer, "set_read_mostly") - with pytest.raises(ValueError, match="managed-memory allocation"): - utils.prefetch(buffer, device, stream=stream) - with pytest.raises(ValueError, match="managed-memory allocation"): - utils.discard_prefetch(buffer, device, stream=stream) - - buffer.close() - - -def test_managed_memory_operation_validation(init_cuda): - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - - mr = create_managed_memory_resource_or_skip() - buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - with pytest.raises(ValueError, match="location is required"): - utils.prefetch(buffer, stream=stream) - from cuda.core.utils import Location - - with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - utils.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) - - buffer.close() - - -def test_managed_memory_advise_location_validation(init_cuda): - 
"""Verify doc-specified location constraints for each advice kind.""" - from cuda.core.utils import Location - - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - - # set_read_mostly works without a location (location is ignored) - utils.advise(buffer, "set_read_mostly") - - # set_preferred_location requires a location; device ordinal works - utils.advise(buffer, "set_preferred_location", device.device_id) - - # set_preferred_location with host location - utils.advise(buffer, "set_preferred_location", Location.host()) - - # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) - with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - utils.advise(buffer, "set_accessed_by", Location.host_numa(0)) - - # set_accessed_by with host_numa_current also raises ValueError - with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): - utils.advise(buffer, "set_accessed_by", Location.host_numa_current()) - - # Inferred location from int: -1 maps to host, 0 maps to device - utils.advise(buffer, "set_preferred_location", -1) - utils.advise(buffer, "set_preferred_location", 0) - - buffer.close() - - -def test_managed_memory_advise_accepts_enum_value(init_cuda): - """advise() accepts CUmem_advise enum values directly, not just string aliases.""" - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - - advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY - utils.advise(buffer, advice_enum) - - assert ( - _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - ) - == _READ_MOSTLY_ENABLED - ) - - buffer.close() - - -def test_managed_memory_advise_invalid_advice_values(init_cuda): - """advise() rejects invalid advice strings and wrong types.""" - device = Device() - _skip_if_managed_allocation_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - - with pytest.raises(ValueError, match="advice must be one of"): - utils.advise(buffer, "not_a_real_advice") - - with pytest.raises(TypeError, match="advice must be"): - utils.advise(buffer, 42) - - buffer.close() - - def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch @@ -1873,417 +1573,3 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): assert buffer.device_id == mr.device_id -class TestLocation: - def test_device_constructor(self): - from cuda.core.utils import Location - - loc = Location.device(0) - assert loc.kind == "device" - assert loc.id == 0 - - def test_host_constructor(self): - from cuda.core.utils import Location - - loc = Location.host() - assert loc.kind == "host" - assert loc.id is None - - def test_host_numa_constructor(self): - from cuda.core.utils import Location - - loc = Location.host_numa(3) - assert loc.kind == "host_numa" - assert loc.id == 3 - - def test_host_numa_current_constructor(self): - from cuda.core.utils import Location - - loc = Location.host_numa_current() - assert loc.kind == "host_numa_current" - assert loc.id is None - - def test_frozen(self): - import dataclasses - - from 
cuda.core.utils import Location - - loc = Location.device(0) - with pytest.raises(dataclasses.FrozenInstanceError): - loc.id = 1 - - def test_invalid_device_id(self): - from cuda.core.utils import Location - - with pytest.raises(ValueError, match="device id must be >= 0"): - Location.device(-1) - - def test_invalid_kind(self): - from cuda.core.utils import Location - - with pytest.raises(ValueError, match="kind must be one of"): - Location(kind="not_a_kind", id=None) - - -class TestLocationCoerce: - def test_passthrough(self): - from cuda.core._memory._managed_location import _coerce_location - from cuda.core.utils import Location - - loc = Location.device(0) - assert _coerce_location(loc) is loc - - def test_int_device(self): - from cuda.core._memory._managed_location import _coerce_location - - assert _coerce_location(0).kind == "device" - assert _coerce_location(0).id == 0 - - def test_int_minus_one_is_host(self): - from cuda.core._memory._managed_location import _coerce_location - - assert _coerce_location(-1).kind == "host" - - def test_device_object(self, init_cuda): - from cuda.core import Device - from cuda.core._memory._managed_location import _coerce_location - - dev = Device() - loc = _coerce_location(dev) - assert loc.kind == "device" - assert loc.id == dev.device_id - - def test_none_when_disallowed(self): - from cuda.core._memory._managed_location import _coerce_location - - with pytest.raises(ValueError, match="location is required"): - _coerce_location(None, allow_none=False) - - def test_none_when_allowed(self): - from cuda.core._memory._managed_location import _coerce_location - - assert _coerce_location(None, allow_none=True) is None - - def test_bad_int(self): - from cuda.core._memory._managed_location import _coerce_location - - with pytest.raises(ValueError, match="device ordinal"): - _coerce_location(-2) - - def test_bad_type(self): - from cuda.core._memory._managed_location import _coerce_location - - with pytest.raises(TypeError, match="Location, Device, int, or None"): - _coerce_location("device") - - -class TestPrefetch: - def test_single_with_location_host(self, init_cuda): - from cuda.core.utils import Location, prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - prefetch(buf, Location.host(), stream=stream) - stream.sync() - last = _get_int_mem_range_attr( - buf, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last == _HOST_LOCATION_ID - buf.close() - - def test_batched_same_location(self, init_cuda): - from cuda.core.utils import Location, prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - if not hasattr(driver, "cuMemPrefetchBatchAsync"): - pytest.skip("cuMemPrefetchBatchAsync unavailable") - device.set_current() - mr = create_managed_memory_resource_or_skip() - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] - stream = device.create_stream() - - prefetch(bufs, Location.device(device.device_id), stream=stream) - stream.sync() - - for buf in bufs: - last = _get_int_mem_range_attr( - buf, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last == device.device_id - buf.close() - - def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.utils import Location, prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - 
if not hasattr(driver, "cuMemPrefetchBatchAsync"): - pytest.skip("cuMemPrefetchBatchAsync unavailable") - device.set_current() - mr = create_managed_memory_resource_or_skip() - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] - stream = device.create_stream() - - prefetch(bufs, [Location.host(), Location.device(device.device_id)], stream=stream) - stream.sync() - - last0 = _get_int_mem_range_attr( - bufs[0], - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - last1 = _get_int_mem_range_attr( - bufs[1], - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last0 == _HOST_LOCATION_ID - assert last1 == device.device_id - for buf in bufs: - buf.close() - - def test_length_mismatch(self, init_cuda): - from cuda.core.utils import Location, prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - mr = create_managed_memory_resource_or_skip() - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] - stream = device.create_stream() - - with pytest.raises(ValueError, match="length"): - prefetch(bufs, [Location.host()], stream=stream) - for buf in bufs: - buf.close() - - def test_rejects_non_managed(self, init_cuda): - from cuda.core.utils import Location, prefetch - - device = Device() - device.set_current() - buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - with pytest.raises(ValueError, match="managed-memory"): - prefetch(buf, Location.host(), stream=stream) - buf.close() - - def test_location_required(self, init_cuda): - from cuda.core.utils import prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - with pytest.raises(ValueError, match="location is required"): - prefetch(buf, None, stream=stream) - buf.close() - - def test_options_must_be_none(self, init_cuda): - from cuda.core.utils import Location, prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - with pytest.raises(TypeError, match="must be a .*Options instance or None"): - prefetch(buf, Location.host(), options={}, stream=stream) - buf.close() - - -class TestDiscard: - def test_single_buffer(self, init_cuda): - from cuda.core.utils import Location, discard, prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - if not hasattr(driver, "cuMemDiscardBatchAsync"): - pytest.skip("cuMemDiscardBatchAsync unavailable") - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - prefetch(buf, Location.device(device.device_id), stream=stream) - stream.sync() - discard(buf, stream=stream) - stream.sync() - buf.close() - - def test_batched(self, init_cuda): - from cuda.core.utils import Location, discard, prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - if not hasattr(driver, "cuMemDiscardBatchAsync"): - pytest.skip("cuMemDiscardBatchAsync unavailable") - device.set_current() - mr = create_managed_memory_resource_or_skip() - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] - stream = device.create_stream() - 
prefetch(bufs, Location.device(device.device_id), stream=stream) - stream.sync() - discard(bufs, stream=stream) - stream.sync() - for buf in bufs: - buf.close() - - def test_rejects_non_managed(self, init_cuda): - from cuda.core.utils import discard - - device = Device() - device.set_current() - buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - with pytest.raises(ValueError, match="managed-memory"): - discard(buf, stream=stream) - buf.close() - - def test_options_must_be_none(self, init_cuda): - from cuda.core.utils import discard - - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - with pytest.raises(TypeError, match="must be a .*Options instance or None"): - discard(buf, options={}, stream=stream) - buf.close() - - -class TestDiscardPrefetch: - def test_single_buffer(self, init_cuda): - from cuda.core.utils import Location, discard_prefetch, prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): - pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - prefetch(buf, Location.host(), stream=stream) - stream.sync() - discard_prefetch(buf, Location.device(device.device_id), stream=stream) - stream.sync() - - last = _get_int_mem_range_attr( - buf, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last == device.device_id - buf.close() - - def test_batched_same_location(self, init_cuda): - from cuda.core.utils import Location, discard_prefetch, prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): - pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") - device.set_current() - mr = create_managed_memory_resource_or_skip() - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] - stream = device.create_stream() - prefetch(bufs, Location.host(), stream=stream) - stream.sync() - discard_prefetch(bufs, Location.device(device.device_id), stream=stream) - stream.sync() - for buf in bufs: - last = _get_int_mem_range_attr( - buf, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last == device.device_id - buf.close() - - def test_length_mismatch(self, init_cuda): - from cuda.core.utils import Location, discard_prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - mr = create_managed_memory_resource_or_skip() - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] - stream = device.create_stream() - with pytest.raises(ValueError, match="length"): - discard_prefetch(bufs, [Location.host()], stream=stream) - for buf in bufs: - buf.close() - - def test_rejects_non_managed(self, init_cuda): - from cuda.core.utils import Location, discard_prefetch - - device = Device() - device.set_current() - buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - with pytest.raises(ValueError, match="managed-memory"): - discard_prefetch(buf, Location.host(), stream=stream) - buf.close() - - -class TestAdvise: - def 
test_batched_same_advice(self, init_cuda): - from cuda.core.utils import advise - - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - mr = DummyUnifiedMemoryResource(device) - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] - advise(bufs, "set_read_mostly") - # Query all attributes BEFORE closing any buffer. On CUDA 12, freeing - # a managed allocation can clear read-mostly advice on neighboring - # ranges; close-then-query in a single loop falsely flags the later - # iterations as having lost the advice. - results = [ - _get_int_mem_range_attr( - buf, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - ) - for buf in bufs - ] - for buf in bufs: - buf.close() - for r in results: - assert r == _READ_MOSTLY_ENABLED - - def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.utils import Location, advise - - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - mr = DummyUnifiedMemoryResource(device) - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] - advise( - bufs, - "set_preferred_location", - [Location.host(), Location.device(device.device_id)], - ) - for buf in bufs: - buf.close() - - def test_options_must_be_none(self, init_cuda): - from cuda.core.utils import advise - - device = Device() - _skip_if_managed_allocation_unsupported(device) - device.set_current() - buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - with pytest.raises(TypeError, match="must be a .*Options instance or None"): - advise(buf, "set_read_mostly", options={}) - buf.close() From a3f342f88933d8bd46694a4a7658b7f59c7066d1 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 12:42:42 -0700 Subject: [PATCH 38/68] test(cuda.core): fix options regex for AdviseOptions ("an" vs "a") The advise() error message reads "must be an AdviseOptions instance or None" (vowel triggers "an"), but the regex matched only "must be a ". Relax to "must be an?" so all three op tests pass. --- cuda_core/tests/memory/test_managed_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index bea4776688e..f820a5e226f 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -517,7 +517,7 @@ def test_options_must_be_none(self, init_cuda): mr = create_managed_memory_resource_or_skip() buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(TypeError, match="must be a .*Options instance or None"): + with pytest.raises(TypeError, match="must be an? .*Options instance or None"): prefetch(buf, Location.host(), options={}, stream=stream) buf.close() @@ -578,7 +578,7 @@ def test_options_must_be_none(self, init_cuda): mr = create_managed_memory_resource_or_skip() buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(TypeError, match="must be a .*Options instance or None"): + with pytest.raises(TypeError, match="must be an? 
.*Options instance or None"): discard(buf, options={}, stream=stream) buf.close() @@ -706,6 +706,6 @@ def test_options_must_be_none(self, init_cuda): _skip_if_managed_allocation_unsupported(device) device.set_current() buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - with pytest.raises(TypeError, match="must be a .*Options instance or None"): + with pytest.raises(TypeError, match="must be an? .*Options instance or None"): advise(buf, "set_read_mostly", options={}) buf.close() From c2a966298eb68d068117d08f23949405b27777b2 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 12:45:13 -0700 Subject: [PATCH 39/68] chore(cuda.core): drop unused utils import + trailing blank lines Pre-commit cleanup after splitting managed-memory ops tests out of test_memory.py: the `cuda.core.utils` import is no longer used here, and ruff trimmed trailing blank lines. --- cuda_core/tests/test_memory.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 7804c31780e..68e17ee1443 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -38,7 +38,6 @@ PinnedMemoryResourceOptions, VirtualMemoryResource, VirtualMemoryResourceOptions, - utils, ) from cuda.core import ( system as ccx_system, @@ -1571,5 +1570,3 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): assert buffer.handle >= 0 assert buffer.size == 0 assert buffer.device_id == mr.device_id - - From bede674d0e04a9ca80013a8da4b4ea35e06af701 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 13:35:07 -0700 Subject: [PATCH 40/68] feat(cuda.core): add ManagedBuffer subclass + Host location Land Andy's ManagedBuffer + Device/Host design (review #3976251223, #3164213789). The free-function shape introduced earlier in this PR is preserved; ManagedBuffer methods delegate into it, so existing call sites keep working. ManagedBuffer - Subclass of Buffer returned by ManagedMemoryResource.allocate, also constructable from an external pointer via ManagedBuffer.from_handle. - Property-style advice API: - read_mostly (bool, driver-backed get/set) - preferred_location (Device | Host | None, get/set; None unsets) - accessed_by (live AccessedBySet view: __contains__/__iter__/len query the driver, add()/discard() issue advice; setter diffs and advises only the deltas) - Instance methods prefetch / discard / discard_prefetch delegate to the matching cuda.core.utils functions. Host - New top-level class symmetric to Device. Host(), Host(numa_id=N), Host.numa_current(). Replaces Location.host()/host_numa()/etc. Location -> Device|Host|int - Drop the public Location dataclass and its classmethod constructors. - _coerce_location now accepts Device | Host | int | None and produces an internal _LocSpec record; advise/prefetch/discard/discard_prefetch signatures and docstrings updated accordingly. - int still accepted for ergonomic compatibility (-1 = host, >=0 = device ordinal). Plumbing - Buffer_from_deviceptr_handle takes an optional `cls` parameter so the pool allocator can materialize Buffer subclasses; _MP_allocate threads the same parameter through; ManagedMemoryResource.allocate passes ManagedBuffer. Tests - TestHost replaces TestLocation; TestLocationCoerce adapted to the new coerce signature. New TestManagedBuffer covers from_handle, isinstance(allocate(), ManagedBuffer), read_mostly/preferred_location/ accessed_by roundtrips, and instance methods. 
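  A condensed sketch of the surface these tests exercise (illustrative
  only; assumes a managed-memory-capable device and default
  ManagedMemoryResource options):

      from cuda.core import Device, Host, ManagedMemoryResource

      dev = Device()
      dev.set_current()
      stream = dev.create_stream()
      mr = ManagedMemoryResource()
      buf = mr.allocate(4096)           # returns a ManagedBuffer
      buf.read_mostly = True            # property-style advice
      buf.preferred_location = Host()   # assigning None unsets
      buf.accessed_by.add(dev)          # live, driver-backed set
      buf.prefetch(dev, stream=stream)  # delegates to utils.prefetch
      stream.sync()
      buf.close()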
Property tests use external (cuMemAllocManaged) backing wrapped via from_handle, since some driver/device combinations decline cuMemAdvise on pool-allocated managed memory. - Use cuDeviceGetCount in AccessedBySet._query so the read path doesn't pull in NVML. Docs - 1.0.0 notes describe Host, ManagedBuffer, the property API, and the Device/Host location inputs. api.rst lists Host, ManagedBuffer, and the *Options dataclasses; Location is removed. --- cuda_core/cuda/core/__init__.py | 2 + cuda_core/cuda/core/_host.py | 44 +++ cuda_core/cuda/core/_memory/__init__.py | 1 + cuda_core/cuda/core/_memory/_buffer.pxd | 7 +- cuda_core/cuda/core/_memory/_buffer.pyx | 9 +- .../cuda/core/_memory/_managed_buffer.py | 199 ++++++++++ .../cuda/core/_memory/_managed_location.py | 73 ++-- .../cuda/core/_memory/_managed_memory_ops.pyx | 12 +- .../core/_memory/_managed_memory_resource.pyx | 19 +- cuda_core/cuda/core/_memory/_memory_pool.pxd | 9 +- cuda_core/cuda/core/_memory/_memory_pool.pyx | 4 +- cuda_core/cuda/core/utils.py | 2 - cuda_core/docs/source/api.rst | 10 +- cuda_core/docs/source/release/1.0.0-notes.rst | 42 ++- cuda_core/tests/memory/test_managed_ops.py | 352 +++++++++++++----- cuda_core/tests/test_memory.py | 1 + 16 files changed, 610 insertions(+), 176 deletions(-) create mode 100644 cuda_core/cuda/core/_host.py create mode 100644 cuda_core/cuda/core/_memory/_managed_buffer.py diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index dfd52accea3..44ff14cbdaa 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -32,6 +32,7 @@ def _import_versioned_module(): from cuda.core._device import Device from cuda.core._event import Event, EventOptions from cuda.core._graphics import GraphicsResource +from cuda.core._host import Host from cuda.core._launch_config import LaunchConfig from cuda.core._launcher import launch from cuda.core._linker import Linker, LinkerOptions @@ -41,6 +42,7 @@ def _import_versioned_module(): DeviceMemoryResourceOptions, GraphMemoryResource, LegacyPinnedMemoryResource, + ManagedBuffer, ManagedMemoryResource, ManagedMemoryResourceOptions, MemoryResource, diff --git a/cuda_core/cuda/core/_host.py b/cuda_core/cuda/core/_host.py new file mode 100644 index 00000000000..e3aa4aebe5e --- /dev/null +++ b/cuda_core/cuda/core/_host.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Host: + """Host (CPU) location for managed-memory operations. + + Use one of the three forms: + + * ``Host()`` — generic host (any NUMA node). + * ``Host(numa_id=N)`` — specific NUMA node ``N``. + * ``Host.numa_current()`` — NUMA node of the calling thread. + + ``Host`` is the symmetric counterpart of :class:`~cuda.core.Device` + for managed-memory `prefetch`, `advise`, and `discard_prefetch` + targets. Pass either a ``Device`` or a ``Host`` to those operations + and to ``ManagedBuffer.preferred_location`` / ``accessed_by``. 
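+
+    Examples
+    --------
+    >>> Host()
+    Host()
+    >>> Host(numa_id=2)
+    Host(numa_id=2)
+    >>> Host.numa_current()
+    Host.numa_current()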
+ """ + + numa_id: int | None = None + is_numa_current: bool = False + + def __post_init__(self) -> None: + if self.is_numa_current and self.numa_id is not None: + raise ValueError("Host.numa_current() cannot have an explicit numa_id") + if self.numa_id is not None and (not isinstance(self.numa_id, int) or self.numa_id < 0): + raise ValueError(f"numa_id must be a non-negative int, got {self.numa_id!r}") + + @classmethod + def numa_current(cls) -> Host: + """Construct a ``Host`` referring to the calling thread's NUMA node.""" + return cls(is_numa_current=True) + + def __repr__(self) -> str: + if self.is_numa_current: + return "Host.numa_current()" + if self.numa_id is None: + return "Host()" + return f"Host(numa_id={self.numa_id})" diff --git a/cuda_core/cuda/core/_memory/__init__.py b/cuda_core/cuda/core/_memory/__init__.py index 068a6526b7a..bf40a643f8c 100644 --- a/cuda_core/cuda/core/_memory/__init__.py +++ b/cuda_core/cuda/core/_memory/__init__.py @@ -7,6 +7,7 @@ from ._graph_memory_resource import * from ._ipc import * from ._legacy import * +from ._managed_buffer import ManagedBuffer from ._managed_memory_resource import * from ._pinned_memory_resource import * from ._virtual_memory_resource import * diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 9065da77eb8..748e9642741 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -32,12 +32,15 @@ cdef class MemoryResource: pass -# Helper function to create a Buffer from a DevicePtrHandle +# Helper function to create a Buffer from a DevicePtrHandle. +# `cls` lets callers materialize Buffer subclasses (e.g. ManagedBuffer for +# managed-memory allocations); defaults to Buffer. cdef Buffer Buffer_from_deviceptr_handle( DevicePtrHandle h_ptr, size_t size, MemoryResource mr, - object ipc_descriptor = * + object ipc_descriptor = *, + type cls = *, ) # Memory attribute query helpers (used by _managed_memory_ops) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 0032c605163..18ebbfa7801 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -548,14 +548,15 @@ cdef class MemoryResource: # Buffer Implementation Helpers # ----------------------------- -cdef inline Buffer Buffer_from_deviceptr_handle( +cdef Buffer Buffer_from_deviceptr_handle( DevicePtrHandle h_ptr, size_t size, MemoryResource mr, - object ipc_descriptor = None + object ipc_descriptor = None, + type cls = Buffer, ): - """Create a Buffer from an existing DevicePtrHandle.""" - cdef Buffer buf = Buffer.__new__(Buffer) + """Create a Buffer (or subclass instance) from an existing DevicePtrHandle.""" + cdef Buffer buf = cls.__new__(cls) buf._h_ptr = h_ptr buf._size = size buf._memory_resource = mr diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py new file mode 100644 index 00000000000..3ce5e5b2afb --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -0,0 +1,199 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from cuda.core._device import Device +from cuda.core._host import Host +from cuda.core._memory._buffer import Buffer +from cuda.core._utils.cuda_utils import driver, handle_return + +if TYPE_CHECKING: + from cuda.core._stream import Stream + from cuda.core.graph import GraphBuilder + + +_INT_SIZE = 4 + + +def _get_int_attr(buf: Buffer, attribute) -> int: + return handle_return(driver.cuMemRangeGetAttribute(_INT_SIZE, attribute, buf.handle, buf.size)) + + +class AccessedBySet: + """Live driver-backed view of ``set_accessed_by`` advice for a managed buffer. + + Reads (``__contains__``, ``__iter__``, ``len(...)``) call + ``cuMemRangeGetAttribute``; writes (``add``, ``discard``) call + ``cuMemAdvise``. There is no in-memory mirror, so the view always + reflects the current driver state. + + Note + ---- + The driver's read-back path returns integer device ordinals (``-1`` for + host); host NUMA distinctions applied via ``Host(numa_id=...)`` are not + distinguishable from a generic ``Host()`` when iterating this set. + """ + + __slots__ = ("_buf",) + + def __init__(self, buf: ManagedBuffer): + self._buf = buf + + def _query(self) -> list[Device | Host]: + # Driver fills the array with device ordinals: device id, -1 = host, + # -2 = empty slot. Size must accommodate every CUDA-visible device + # plus a slot for the host. We use cuDeviceGetCount (driver-side) to + # stay independent of NVML availability. + num_devices = handle_return(driver.cuDeviceGetCount()) + n = num_devices + 1 + raw = handle_return( + driver.cuMemRangeGetAttribute( + n * _INT_SIZE, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, + self._buf.handle, + self._buf.size, + ) + ) + result: list[Device | Host] = [] + for v in raw: + if v == -2: # CU_DEVICE_INVALID — empty slot + continue + result.append(Host() if v == -1 else Device(v)) + return result + + def __contains__(self, location) -> bool: + return location in self._query() + + def __iter__(self): + return iter(self._query()) + + def __len__(self) -> int: + return len(self._query()) + + def __eq__(self, other) -> bool: + if isinstance(other, AccessedBySet): + return set(self._query()) == set(other._query()) + if isinstance(other, (set, frozenset)): + return set(self._query()) == other + return NotImplemented + + def __repr__(self) -> str: + return f"AccessedBySet({set(self._query())!r})" + + def add(self, location: Device | Host) -> None: + """Apply ``set_accessed_by`` advice for ``location``.""" + from cuda.core.utils import advise + + advise(self._buf, "set_accessed_by", location) + + def discard(self, location: Device | Host) -> None: + """Apply ``unset_accessed_by`` advice for ``location``.""" + from cuda.core.utils import advise + + advise(self._buf, "unset_accessed_by", location) + + +class ManagedBuffer(Buffer): + """Managed (unified) memory buffer with a property-style advice API. + + Returned by :meth:`ManagedMemoryResource.allocate`. Wrap an external + managed-memory pointer with :meth:`ManagedBuffer.from_handle`. 
+ + Examples + -------- + >>> buf = mr.allocate(size) + >>> buf.read_mostly = True + >>> buf.preferred_location = Device(0) + >>> buf.accessed_by.add(Device(1)) + >>> buf.prefetch(Device(0), stream=stream) + + Note + ---- + The driver's read-back path for ``preferred_location`` and + ``accessed_by`` returns integer device ordinals; host NUMA distinctions + applied via ``Host(numa_id=...)`` collapse to a generic ``Host()`` when + queried. Setters preserve full NUMA information when issuing advice. + """ + + @classmethod + def from_handle( + cls, + ptr, + size: int, + mr=None, + owner=None, + ) -> ManagedBuffer: + """Wrap an existing managed-memory pointer in a :class:`ManagedBuffer`.""" + return cls._init(ptr, size, mr=mr, owner=owner) + + @property + def read_mostly(self) -> bool: + """Whether ``set_read_mostly`` advice is currently applied to this range.""" + return _get_int_attr(self, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY) != 0 + + @read_mostly.setter + def read_mostly(self, value: bool) -> None: + from cuda.core.utils import advise + + advise(self, "set_read_mostly" if value else "unset_read_mostly") + + @property + def preferred_location(self) -> Device | Host | None: + """Currently applied ``set_preferred_location`` target, or ``None`` if unset.""" + # The legacy PREFERRED_LOCATION attribute returns a single int: + # -2 = invalid (no preferred location), -1 = host, >=0 = device ordinal. + # NUMA-specific preferences round-trip as a generic Host (CUDA driver + # limitation of the legacy query path). + loc_id = _get_int_attr(self, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION) + if loc_id == -2: + return None + if loc_id == -1: + return Host() + return Device(loc_id) + + @preferred_location.setter + def preferred_location(self, value: Device | Host | None) -> None: + from cuda.core.utils import advise + + if value is None: + advise(self, "unset_preferred_location") + else: + advise(self, "set_preferred_location", value) + + @property + def accessed_by(self) -> AccessedBySet: + """Live set-like view of ``set_accessed_by`` locations.""" + return AccessedBySet(self) + + @accessed_by.setter + def accessed_by(self, locations) -> None: + # Diff against the current driver state and advise only the deltas. 
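+        # For example, with current = {Device(0)} and target =
+        # {Device(0), Host()}, exactly one set_accessed_by advise is issued
+        # (for Host()); Device(0) is left untouched.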
+ from cuda.core.utils import advise + + current = set(AccessedBySet(self)) + target = set(locations) + for loc in current - target: + advise(self, "unset_accessed_by", loc) + for loc in target - current: + advise(self, "set_accessed_by", loc) + + def prefetch(self, location: Device | Host | int, *, stream: Stream | GraphBuilder) -> None: + """Prefetch this range to ``location`` on ``stream``.""" + from cuda.core.utils import prefetch as _prefetch + + _prefetch(self, location, stream=stream) + + def discard(self, *, stream: Stream | GraphBuilder) -> None: + """Discard this range's resident pages on ``stream`` (CUDA 13+).""" + from cuda.core.utils import discard as _discard + + _discard(self, stream=stream) + + def discard_prefetch(self, location: Device | Host | int, *, stream: Stream | GraphBuilder) -> None: + """Discard this range and prefetch to ``location`` on ``stream`` (CUDA 13+).""" + from cuda.core.utils import discard_prefetch as _discard_prefetch + + _discard_prefetch(self, location, stream=stream) diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index 9d2eee23b21..5e23a968cc3 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -6,69 +6,52 @@ from dataclasses import dataclass from typing import Literal -_VALID_KINDS = ("device", "host", "host_numa", "host_numa_current") -LocationKind = Literal["device", "host", "host_numa", "host_numa_current"] +_LocationKind = Literal["device", "host", "host_numa", "host_numa_current"] @dataclass(frozen=True) -class Location: - """Typed managed-memory location. +class _LocSpec: + """Internal location record produced by :func:`_coerce_location`. - Use the classmethod constructors (``device``, ``host``, ``host_numa``, - ``host_numa_current``) rather than constructing directly. + Carries the discriminator (``kind``) and the integer payload (``id``) + that the Cython layer in ``_managed_memory_ops.pyx`` consumes when + building ``CUmemLocation`` structs (CUDA 13+) or legacy device + ordinals (CUDA 12). """ - kind: LocationKind - id: int | None = None + kind: _LocationKind + id: int = 0 - def __post_init__(self) -> None: - if self.kind not in _VALID_KINDS: - raise ValueError(f"kind must be one of {_VALID_KINDS!r}, got {self.kind!r}") - if self.kind == "device": - if not isinstance(self.id, int) or self.id < 0: - raise ValueError("device id must be >= 0") - elif self.kind == "host_numa": - if not isinstance(self.id, int) or self.id < 0: - raise ValueError("host_numa id must be >= 0") - elif self.kind in ("host", "host_numa_current") and self.id is not None: - raise ValueError(f"{self.kind} location must have id=None") - @classmethod - def device(cls, device_id: int) -> Location: - return cls(kind="device", id=device_id) +def _coerce_location(value, *, allow_none: bool = False) -> _LocSpec | None: + """Coerce :class:`Device` / :class:`Host` / int / ``None`` to ``_LocSpec``. - @classmethod - def host(cls) -> Location: - return cls(kind="host", id=None) - - @classmethod - def host_numa(cls, numa_id: int) -> Location: - return cls(kind="host_numa", id=numa_id) - - @classmethod - def host_numa_current(cls) -> Location: - return cls(kind="host_numa_current", id=None) - - -def _coerce_location(value, *, allow_none: bool = False) -> Location | None: - """Coerce ``Location`` / ``Device`` / int / ``None`` to ``Location``. - - Maps int ``-1`` to host and other non-negative ints to that device ordinal. 
+ Maps int ``-1`` to host and other non-negative ints to that device + ordinal. ``Host()``, ``Host(numa_id=N)``, and ``Host.numa_current()`` + map to the corresponding NUMA-aware kinds. """ - from cuda.core._device import Device # avoid import cycle at module load + # Local imports to avoid import cycles (Device pulls in CUDA init). + from cuda.core._device import Device + from cuda.core._host import Host - if isinstance(value, Location): + if isinstance(value, _LocSpec): return value if isinstance(value, Device): - return Location.device(value.device_id) + return _LocSpec(kind="device", id=value.device_id) + if isinstance(value, Host): + if value.is_numa_current: + return _LocSpec(kind="host_numa_current") + if value.numa_id is not None: + return _LocSpec(kind="host_numa", id=value.numa_id) + return _LocSpec(kind="host") if value is None: if allow_none: return None raise ValueError("location is required") if isinstance(value, int): if value == -1: - return Location.host() + return _LocSpec(kind="host") if value >= 0: - return Location.device(value) + return _LocSpec(kind="device", id=value) raise ValueError(f"device ordinal must be >= 0 (or -1 for host), got {value}") - raise TypeError(f"location must be a Location, Device, int, or None; got {type(value).__name__}") + raise TypeError(f"location must be a Device, Host, int, or None; got {type(value).__name__}") diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 5b64a740a60..f78eabad9d2 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -127,7 +127,7 @@ cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, IF CUDA_CORE_BUILD_MAJOR >= 13: - # Convert a Location dataclass to a cydriver.CUmemLocation struct. + # Convert a _LocSpec dataclass to a cydriver.CUmemLocation struct. cdef inline cydriver.CUmemLocation _to_cumemlocation(object loc): cdef cydriver.CUmemLocation out cdef str kind = loc.kind @@ -247,7 +247,7 @@ def advise( ``"unset_read_mostly"``, ``"set_preferred_location"``, ``"unset_preferred_location"``, ``"set_accessed_by"``, ``"unset_accessed_by"``) and ``CUmem_advise`` enum values are accepted. - location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | int | Sequence[...] Target location(s). Required for advice values that consult a location; ignored (may be ``None``) for ``set_read_mostly``, ``unset_read_mostly``, and ``unset_preferred_location``. A sequence @@ -321,10 +321,10 @@ def prefetch( ---------- targets : :class:`Buffer` | Sequence[:class:`Buffer`] One or more managed allocations to operate on. - location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | int | Sequence[...] Target location(s). A single location applies to all targets; a - sequence must match ``len(targets)``. ``Device`` and ``int`` values - are coerced to :class:`Location` (``-1`` maps to host). + sequence must match ``len(targets)``. ``int`` values are coerced + to a location (``-1`` maps to host, ``>=0`` to that device ordinal). options : :class:`PrefetchOptions`, optional Reserved for future per-call flags. ``None`` (default) and ``PrefetchOptions()`` are equivalent. 
@@ -427,7 +427,7 @@ def discard_prefetch( ---------- targets : :class:`Buffer` | Sequence[:class:`Buffer`] One or more managed allocations to discard and re-prefetch. - location : :class:`Location` | :obj:`~_device.Device` | int | Sequence[...] + location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | int | Sequence[...] Target location(s). A single location applies to all targets; a sequence must match ``len(targets)``. options : :class:`DiscardPrefetchOptions`, optional diff --git a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx index 205d3c77545..e69c834efc9 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx @@ -6,8 +6,9 @@ from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core._memory._memory_pool cimport _MemPool +from cuda.core._memory._memory_pool cimport _MemPool, _MP_allocate from cuda.core._memory._memory_pool cimport MP_init_create_pool, MP_init_current_pool # no-cython-lint +from cuda.core._stream cimport Stream, Stream_accept, default_stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils cimport check_or_create_options # no-cython-lint from cuda.core._utils.cuda_utils import CUDAError # no-cython-lint @@ -89,6 +90,22 @@ cdef class ManagedMemoryResource(_MemPool): def __init__(self, options=None): _MMR_init(self, options) + def allocate(self, size_t size, stream: Stream | None = None): + """Allocate a managed-memory buffer of the requested size. + + Returns a :class:`ManagedBuffer` (a :class:`Buffer` subclass) that + exposes the property-style advice API + (``read_mostly``, ``preferred_location``, ``accessed_by``) and + instance methods (``prefetch``, ``discard``, ``discard_prefetch``). + """ + # Lazy import: ManagedBuffer is pure Python and lives outside this + # Cython module. + from cuda.core._memory._managed_buffer import ManagedBuffer + if self.is_mapped: + raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") + cdef Stream s = Stream_accept(stream) if stream is not None else default_stream() + return _MP_allocate(self, size, s, ManagedBuffer) + @property def device_id(self) -> int: """The preferred device ordinal, or -1 if the preferred location is not a device.""" diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pxd b/cuda_core/cuda/core/_memory/_memory_pool.pxd index 45062826e4d..aa9cf833da3 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pxd +++ b/cuda_core/cuda/core/_memory/_memory_pool.pxd @@ -3,9 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport MemoryResource +from cuda.core._memory._buffer cimport Buffer, MemoryResource from cuda.core._memory._ipc cimport IPCDataForMR from cuda.core._resource_handles cimport MemoryPoolHandle +from cuda.core._stream cimport Stream cdef class _MemPool(MemoryResource): @@ -36,6 +37,12 @@ cdef int MP_init_current_pool( cdef int MP_raise_release_threshold(_MemPool self) except? -1 +# Allocate from this pool, returning an instance of `cls` (defaulting to +# Buffer). Subclasses (e.g. ManagedMemoryResource) pass their own buffer +# subclass so their `allocate` returns the typed object. 
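+# For example, ManagedMemoryResource.allocate calls
+# _MP_allocate(self, size, stream, ManagedBuffer) so the returned buffer
+# carries the managed-memory advice API.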
+cdef Buffer _MP_allocate(_MemPool self, size_t size, Stream stream, type cls = *) + + cdef class _MemPoolAttributes: cdef: MemoryPoolHandle _h_pool diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index f8f3b683d12..83f40fa9332 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -299,7 +299,7 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: "a capturing stream (consider using GraphMemoryResource).") -cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream): +cdef Buffer _MP_allocate(_MemPool self, size_t size, Stream stream, type cls = Buffer): cdef cydriver.CUstream s = as_cu(stream._h_stream) cdef DevicePtrHandle h_ptr with nogil: @@ -307,7 +307,7 @@ cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream): h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) if not h_ptr: raise RuntimeError("Failed to allocate memory from pool") - return Buffer_from_deviceptr_handle(h_ptr, size, self, None) + return Buffer_from_deviceptr_handle(h_ptr, size, self, None, cls) cdef inline void _MP_deallocate( diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index ece40ad807c..a252e3d935f 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core._memory._managed_location import Location from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch from cuda.core._memory._managed_memory_options import ( AdviseOptions, @@ -16,7 +15,6 @@ "AdviseOptions", "DiscardOptions", "DiscardPrefetchOptions", - "Location", "PrefetchOptions", "StridedMemoryView", "advise", diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index fa17624fa5e..20237aaf3a6 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -20,6 +20,7 @@ Devices and execution :toctree: generated/ Device + Host launch :template: autosummary/cyclass.rst @@ -55,6 +56,7 @@ Memory management :template: autosummary/cyclass.rst Buffer + ManagedBuffer MemoryResource DeviceMemoryResource GraphMemoryResource @@ -252,7 +254,13 @@ Utility functions discard discard_prefetch + :template: dataclass.rst + + AdviseOptions + PrefetchOptions + DiscardOptions + DiscardPrefetchOptions + :template: autosummary/cyclass.rst - Location StridedMemoryView diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 17696b616a1..35d200a9ad4 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -16,17 +16,39 @@ Highlights New features ------------ +- Added :class:`Host` as the symmetric counterpart of :class:`Device` for + expressing managed-memory locations: ``Host()`` (any host), + ``Host(numa_id=N)`` (specific NUMA node), and ``Host.numa_current()`` + (calling thread's NUMA node). + +- Added :class:`ManagedBuffer`, a :class:`Buffer` subclass returned by + :meth:`ManagedMemoryResource.allocate` that exposes a property-style + advice API: + + - ``buf.read_mostly`` (bool) — driver-backed get/set. + - ``buf.preferred_location`` (:class:`Device` | :class:`Host` | None) — + driver-backed get/set; assigning ``None`` unsets. + - ``buf.accessed_by`` — a live, set-like view; ``add()`` / ``discard()`` + issue advice, iteration queries the driver. 
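+    (for example, ``Device(0) in buf.accessed_by`` performs a live driver
+    query rather than consulting a cached mirror).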
+ - ``buf.prefetch(location, *, stream)``, ``buf.discard(*, stream)``, + ``buf.discard_prefetch(location, *, stream)`` — instance methods that + delegate to the matching free functions. + + Use :meth:`ManagedBuffer.from_handle` to wrap an existing managed-memory + pointer. + - Added managed-memory range operations to :mod:`cuda.core.utils`: - :class:`~utils.Location`, :func:`~utils.advise`, :func:`~utils.prefetch`, - :func:`~utils.discard`, and :func:`~utils.discard_prefetch`. Each - operation accepts either a single managed :class:`Buffer` or a - sequence; with cuda.bindings 12.8+ the N>1 case dispatches to the - corresponding ``cuMem*BatchAsync`` driver entry point, addressing the - managed-memory portion of #1333. Locations are expressed via the typed - :class:`~utils.Location` dataclass (with classmethod constructors - ``device``, ``host``, ``host_numa``, and ``host_numa_current``); - ``Device`` and ``int`` values are still accepted for ergonomic - compatibility. + :func:`~utils.advise`, :func:`~utils.prefetch`, :func:`~utils.discard`, + and :func:`~utils.discard_prefetch`. Each operation accepts either a + single managed :class:`Buffer` or a sequence; with cuda.bindings 12.8+ + the N>1 case dispatches to the corresponding ``cuMem*BatchAsync`` driver + entry point, addressing the managed-memory portion of #1333. Locations + are expressed via :class:`Device` or :class:`Host`; ``int`` values are + also accepted (``-1`` maps to host, ``>=0`` to that device ordinal). + Per-call options use frozen dataclasses + (:class:`~utils.AdviseOptions`, :class:`~utils.PrefetchOptions`, + :class:`~utils.DiscardOptions`, :class:`~utils.DiscardPrefetchOptions`) + reserved for future flags. Fixes and enhancements diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index f820a5e226f..1bec7caa855 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -8,7 +8,7 @@ create_managed_memory_resource_or_skip, skip_if_managed_memory_unsupported, ) -from cuda.core import Device, utils +from cuda.core import Device, Host, ManagedBuffer, utils from cuda.core._utils.cuda_utils import handle_return try: @@ -25,7 +25,6 @@ def _get_mem_range_attr(buffer, attribute, data_size): - # cuMemRangeGetAttribute returns a raw integer when data_size <= 4. return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) @@ -89,8 +88,6 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): - from cuda.core.utils import Location - device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() @@ -106,9 +103,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): == _READ_MOSTLY_ENABLED ) - # cuda.bindings currently exposes the combined location attributes for - # cuMemRangeGetAttribute, so use the legacy location query here. 
- utils.advise(buffer, "set_preferred_location", Location.host()) + utils.advise(buffer, "set_preferred_location", Host()) preferred_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, @@ -214,18 +209,15 @@ def test_managed_memory_operation_validation(init_cuda): with pytest.raises(ValueError, match="location is required"): utils.prefetch(buffer, stream=stream) - from cuda.core.utils import Location with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - utils.advise(buffer, "set_accessed_by", Location.host_numa(_INVALID_HOST_DEVICE_ORDINAL)) + utils.advise(buffer, "set_accessed_by", Host(numa_id=_INVALID_HOST_DEVICE_ORDINAL)) buffer.close() def test_managed_memory_advise_location_validation(init_cuda): """Verify doc-specified location constraints for each advice kind.""" - from cuda.core.utils import Location - device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() @@ -239,15 +231,15 @@ def test_managed_memory_advise_location_validation(init_cuda): utils.advise(buffer, "set_preferred_location", device.device_id) # set_preferred_location with host location - utils.advise(buffer, "set_preferred_location", Location.host()) + utils.advise(buffer, "set_preferred_location", Host()) # set_accessed_by with host_numa raises ValueError (INVALID per CUDA docs) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - utils.advise(buffer, "set_accessed_by", Location.host_numa(0)) + utils.advise(buffer, "set_accessed_by", Host(numa_id=0)) # set_accessed_by with host_numa_current also raises ValueError with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): - utils.advise(buffer, "set_accessed_by", Location.host_numa_current()) + utils.advise(buffer, "set_accessed_by", Host.numa_current()) # Inferred location from int: -1 maps to host, 0 maps to device utils.advise(buffer, "set_preferred_location", -1) @@ -295,85 +287,88 @@ def test_managed_memory_advise_invalid_advice_values(init_cuda): buffer.close() -class TestLocation: - def test_device_constructor(self): - from cuda.core.utils import Location +class TestHost: + def test_default(self): + h = Host() + assert h.numa_id is None + assert h.is_numa_current is False - loc = Location.device(0) - assert loc.kind == "device" - assert loc.id == 0 + def test_numa(self): + h = Host(numa_id=3) + assert h.numa_id == 3 + assert h.is_numa_current is False - def test_host_constructor(self): - from cuda.core.utils import Location + def test_numa_current(self): + h = Host.numa_current() + assert h.is_numa_current is True + assert h.numa_id is None - loc = Location.host() - assert loc.kind == "host" - assert loc.id is None + def test_invalid_numa_id(self): + with pytest.raises(ValueError, match="numa_id must be a non-negative int"): + Host(numa_id=-1) - def test_host_numa_constructor(self): - from cuda.core.utils import Location + def test_numa_current_with_id_rejected(self): + with pytest.raises(ValueError, match="numa_current"): + Host(numa_id=0, is_numa_current=True) - loc = Location.host_numa(3) - assert loc.kind == "host_numa" - assert loc.id == 3 + def test_frozen(self): + import dataclasses - def test_host_numa_current_constructor(self): - from cuda.core.utils import Location + h = Host(numa_id=2) + with pytest.raises(dataclasses.FrozenInstanceError): + h.numa_id = 3 - loc = Location.host_numa_current() - assert loc.kind == "host_numa_current" - assert loc.id is None + def 
test_eq_hash(self): + # Frozen dataclass equality is structural. + assert Host() == Host() + assert Host(numa_id=1) == Host(numa_id=1) + assert Host() != Host(numa_id=0) + assert Host.numa_current() != Host() + assert hash(Host(numa_id=1)) == hash(Host(numa_id=1)) - def test_frozen(self): - import dataclasses - from cuda.core.utils import Location +class TestLocationCoerce: + """The coerce helper is internal; verify Device/Host/int/None inputs.""" - loc = Location.device(0) - with pytest.raises(dataclasses.FrozenInstanceError): - loc.id = 1 + def test_device_passthrough(self, init_cuda): + from cuda.core._memory._managed_location import _coerce_location - def test_invalid_device_id(self): - from cuda.core.utils import Location + dev = Device() + spec = _coerce_location(dev) + assert spec.kind == "device" + assert spec.id == dev.device_id - with pytest.raises(ValueError, match="device id must be >= 0"): - Location.device(-1) + def test_host_passthrough(self): + from cuda.core._memory._managed_location import _coerce_location - def test_invalid_kind(self): - from cuda.core.utils import Location + spec = _coerce_location(Host()) + assert spec.kind == "host" - with pytest.raises(ValueError, match="kind must be one of"): - Location(kind="not_a_kind", id=None) + def test_host_numa_passthrough(self): + from cuda.core._memory._managed_location import _coerce_location + spec = _coerce_location(Host(numa_id=3)) + assert spec.kind == "host_numa" + assert spec.id == 3 -class TestLocationCoerce: - def test_passthrough(self): + def test_host_numa_current_passthrough(self): from cuda.core._memory._managed_location import _coerce_location - from cuda.core.utils import Location - loc = Location.device(0) - assert _coerce_location(loc) is loc + spec = _coerce_location(Host.numa_current()) + assert spec.kind == "host_numa_current" def test_int_device(self): from cuda.core._memory._managed_location import _coerce_location - assert _coerce_location(0).kind == "device" - assert _coerce_location(0).id == 0 + spec = _coerce_location(0) + assert spec.kind == "device" + assert spec.id == 0 def test_int_minus_one_is_host(self): from cuda.core._memory._managed_location import _coerce_location assert _coerce_location(-1).kind == "host" - def test_device_object(self, init_cuda): - from cuda.core import Device - from cuda.core._memory._managed_location import _coerce_location - - dev = Device() - loc = _coerce_location(dev) - assert loc.kind == "device" - assert loc.id == dev.device_id - def test_none_when_disallowed(self): from cuda.core._memory._managed_location import _coerce_location @@ -394,13 +389,13 @@ def test_bad_int(self): def test_bad_type(self): from cuda.core._memory._managed_location import _coerce_location - with pytest.raises(TypeError, match="Location, Device, int, or None"): + with pytest.raises(TypeError, match="Device, Host, int, or None"): _coerce_location("device") class TestPrefetch: - def test_single_with_location_host(self, init_cuda): - from cuda.core.utils import Location, prefetch + def test_single_with_host_location(self, init_cuda): + from cuda.core.utils import prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -409,7 +404,7 @@ def test_single_with_location_host(self, init_cuda): buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - prefetch(buf, Location.host(), stream=stream) + prefetch(buf, Host(), stream=stream) stream.sync() last = _get_int_mem_range_attr( buf, @@ -419,7 +414,7 @@ def test_single_with_location_host(self, init_cuda): 
buf.close() def test_batched_same_location(self, init_cuda): - from cuda.core.utils import Location, prefetch + from cuda.core.utils import prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -430,7 +425,7 @@ def test_batched_same_location(self, init_cuda): bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] stream = device.create_stream() - prefetch(bufs, Location.device(device.device_id), stream=stream) + prefetch(bufs, device, stream=stream) stream.sync() for buf in bufs: @@ -442,7 +437,7 @@ def test_batched_same_location(self, init_cuda): buf.close() def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.utils import Location, prefetch + from cuda.core.utils import prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -453,7 +448,7 @@ def test_batched_per_buffer_location(self, init_cuda): bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] stream = device.create_stream() - prefetch(bufs, [Location.host(), Location.device(device.device_id)], stream=stream) + prefetch(bufs, [Host(), device], stream=stream) stream.sync() last0 = _get_int_mem_range_attr( @@ -470,7 +465,7 @@ def test_batched_per_buffer_location(self, init_cuda): buf.close() def test_length_mismatch(self, init_cuda): - from cuda.core.utils import Location, prefetch + from cuda.core.utils import prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -480,19 +475,19 @@ def test_length_mismatch(self, init_cuda): stream = device.create_stream() with pytest.raises(ValueError, match="length"): - prefetch(bufs, [Location.host()], stream=stream) + prefetch(bufs, [Host()], stream=stream) for buf in bufs: buf.close() def test_rejects_non_managed(self, init_cuda): - from cuda.core.utils import Location, prefetch + from cuda.core.utils import prefetch device = Device() device.set_current() buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() with pytest.raises(ValueError, match="managed-memory"): - prefetch(buf, Location.host(), stream=stream) + prefetch(buf, Host(), stream=stream) buf.close() def test_location_required(self, init_cuda): @@ -508,8 +503,8 @@ def test_location_required(self, init_cuda): prefetch(buf, None, stream=stream) buf.close() - def test_options_must_be_none(self, init_cuda): - from cuda.core.utils import Location, prefetch + def test_options_must_be_options_dataclass_or_none(self, init_cuda): + from cuda.core.utils import prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -518,13 +513,13 @@ def test_options_must_be_none(self, init_cuda): buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() with pytest.raises(TypeError, match="must be an? 
.*Options instance or None"): - prefetch(buf, Location.host(), options={}, stream=stream) + prefetch(buf, Host(), options={}, stream=stream) buf.close() class TestDiscard: def test_single_buffer(self, init_cuda): - from cuda.core.utils import Location, discard, prefetch + from cuda.core.utils import discard, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -534,14 +529,14 @@ def test_single_buffer(self, init_cuda): mr = create_managed_memory_resource_or_skip() buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - prefetch(buf, Location.device(device.device_id), stream=stream) + prefetch(buf, device, stream=stream) stream.sync() discard(buf, stream=stream) stream.sync() buf.close() def test_batched(self, init_cuda): - from cuda.core.utils import Location, discard, prefetch + from cuda.core.utils import discard, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -551,7 +546,7 @@ def test_batched(self, init_cuda): mr = create_managed_memory_resource_or_skip() bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] stream = device.create_stream() - prefetch(bufs, Location.device(device.device_id), stream=stream) + prefetch(bufs, device, stream=stream) stream.sync() discard(bufs, stream=stream) stream.sync() @@ -569,7 +564,7 @@ def test_rejects_non_managed(self, init_cuda): discard(buf, stream=stream) buf.close() - def test_options_must_be_none(self, init_cuda): + def test_options_must_be_options_dataclass_or_none(self, init_cuda): from cuda.core.utils import discard device = Device() @@ -585,7 +580,7 @@ def test_options_must_be_none(self, init_cuda): class TestDiscardPrefetch: def test_single_buffer(self, init_cuda): - from cuda.core.utils import Location, discard_prefetch, prefetch + from cuda.core.utils import discard_prefetch, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -596,9 +591,9 @@ def test_single_buffer(self, init_cuda): buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - prefetch(buf, Location.host(), stream=stream) + prefetch(buf, Host(), stream=stream) stream.sync() - discard_prefetch(buf, Location.device(device.device_id), stream=stream) + discard_prefetch(buf, device, stream=stream) stream.sync() last = _get_int_mem_range_attr( @@ -609,7 +604,7 @@ def test_single_buffer(self, init_cuda): buf.close() def test_batched_same_location(self, init_cuda): - from cuda.core.utils import Location, discard_prefetch, prefetch + from cuda.core.utils import discard_prefetch, prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -619,9 +614,9 @@ def test_batched_same_location(self, init_cuda): mr = create_managed_memory_resource_or_skip() bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] stream = device.create_stream() - prefetch(bufs, Location.host(), stream=stream) + prefetch(bufs, Host(), stream=stream) stream.sync() - discard_prefetch(bufs, Location.device(device.device_id), stream=stream) + discard_prefetch(bufs, device, stream=stream) stream.sync() for buf in bufs: last = _get_int_mem_range_attr( @@ -632,7 +627,7 @@ def test_batched_same_location(self, init_cuda): buf.close() def test_length_mismatch(self, init_cuda): - from cuda.core.utils import Location, discard_prefetch + from cuda.core.utils import discard_prefetch device = Device() skip_if_managed_memory_unsupported(device) @@ -641,19 +636,19 @@ def test_length_mismatch(self, init_cuda): bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in 
range(2)] stream = device.create_stream() with pytest.raises(ValueError, match="length"): - discard_prefetch(bufs, [Location.host()], stream=stream) + discard_prefetch(bufs, [Host()], stream=stream) for buf in bufs: buf.close() def test_rejects_non_managed(self, init_cuda): - from cuda.core.utils import Location, discard_prefetch + from cuda.core.utils import discard_prefetch device = Device() device.set_current() buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() with pytest.raises(ValueError, match="managed-memory"): - discard_prefetch(buf, Location.host(), stream=stream) + discard_prefetch(buf, Host(), stream=stream) buf.close() @@ -684,22 +679,18 @@ def test_batched_same_advice(self, init_cuda): assert r == _READ_MOSTLY_ENABLED def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.utils import Location, advise + from cuda.core.utils import advise device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() mr = DummyUnifiedMemoryResource(device) bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] - advise( - bufs, - "set_preferred_location", - [Location.host(), Location.device(device.device_id)], - ) + advise(bufs, "set_preferred_location", [Host(), device]) for buf in bufs: buf.close() - def test_options_must_be_none(self, init_cuda): + def test_options_must_be_options_dataclass_or_none(self, init_cuda): from cuda.core.utils import advise device = Device() @@ -709,3 +700,160 @@ def test_options_must_be_none(self, init_cuda): with pytest.raises(TypeError, match="must be an? .*Options instance or None"): advise(buf, "set_read_mostly", options={}) buf.close() + + +class TestManagedBuffer: + """Property-style API on ManagedBuffer subclass.""" + + def test_allocate_returns_managed_buffer(self, init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + try: + assert isinstance(buf, ManagedBuffer) + finally: + buf.close() + + def test_from_handle(self, init_cuda): + from cuda.core import Buffer + + device = Device() + _skip_if_managed_allocation_unsupported(device) + device.set_current() + # Allocate an external managed pointer through the dummy MR, then + # adopt it as a ManagedBuffer via from_handle. + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + try: + mbuf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) + assert isinstance(mbuf, ManagedBuffer) + assert isinstance(mbuf, Buffer) + assert mbuf.size == plain.size + finally: + plain.close() + + def test_read_mostly_roundtrip(self, init_cuda): + # cuMemAdvise is exercised against an external managed allocation + # (cuMemAllocManaged); pool-allocated managed memory may decline + # certain advice on some driver/device combos. 
+ device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + try: + buf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) + assert buf.read_mostly is False + buf.read_mostly = True + assert buf.read_mostly is True + buf.read_mostly = False + assert buf.read_mostly is False + finally: + plain.close() + + def test_preferred_location_roundtrip(self, init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + try: + buf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) + buf.preferred_location = device + got = buf.preferred_location + assert isinstance(got, Device) + assert got.device_id == device.device_id + + buf.preferred_location = Host() + assert buf.preferred_location == Host() + + buf.preferred_location = None + assert buf.preferred_location is None + finally: + plain.close() + + def test_accessed_by_add_discard(self, init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + try: + buf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) + assert device not in buf.accessed_by + + buf.accessed_by.add(device) + assert device in buf.accessed_by + + buf.accessed_by.discard(device) + assert device not in buf.accessed_by + finally: + plain.close() + + def test_accessed_by_set_assignment(self, init_cuda): + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + try: + buf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) + buf.accessed_by = {device} + assert device in buf.accessed_by + + buf.accessed_by = set() + assert device not in buf.accessed_by + finally: + plain.close() + + def test_instance_prefetch(self, init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + try: + buf.prefetch(device, stream=stream) + stream.sync() + last = _get_int_mem_range_attr( + buf, + driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + finally: + buf.close() + + def test_instance_discard(self, init_cuda): + device = Device() + skip_if_managed_memory_unsupported(device) + if not hasattr(driver, "cuMemDiscardBatchAsync"): + pytest.skip("cuMemDiscardBatchAsync unavailable") + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + try: + buf.prefetch(device, stream=stream) + stream.sync() + buf.discard(stream=stream) + stream.sync() + finally: + buf.close() + + def test_instance_discard_prefetch(self, init_cuda): + device = Device() + _skip_if_managed_discard_prefetch_unsupported(device) + device.set_current() + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) + stream = device.create_stream() + try: + buf.prefetch(Host(), stream=stream) + stream.sync() + buf.discard_prefetch(device, stream=stream) + stream.sync() + last = _get_int_mem_range_attr( + buf, + 
driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ) + assert last == device.device_id + finally: + buf.close() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 68e17ee1443..65209df6a65 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -116,6 +116,7 @@ def test_package_contents(): "IPCBufferDescriptor", "IPCAllocationHandle", "LegacyPinnedMemoryResource", + "ManagedBuffer", "ManagedMemoryResource", "ManagedMemoryResourceOptions", "PinnedMemoryResourceOptions", From f59af4e1d1913db66cdc10351cc6cbecfb5a7b96 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 14:26:39 -0700 Subject: [PATCH 41/68] chore(cuda.core): simplify ManagedBuffer per /simplify review - Buffer.from_handle is now a classmethod that dispatches via cls._init, so subclasses inherit it: ManagedBuffer.from_handle(...) returns a ManagedBuffer with no override needed. Drop ManagedBuffer.from_handle. - Hoist `advise / prefetch / discard / discard_prefetch` imports from per-method lazy imports to module-level (no circular import: they live in cuda.core._memory._managed_memory_ops, not cuda.core.utils). - Cache the CUmem_advise and CUmem_range_attribute enum lookups at module level and pass enum constants directly to advise() instead of re-resolving from string aliases on every property write. - Extract _query_accessed_by as a module-level helper; AccessedBySet delegates and the accessed_by setter calls it directly instead of constructing a throwaway view. --- cuda_core/cuda/core/_memory/_buffer.pyx | 8 +- .../cuda/core/_memory/_managed_buffer.py | 142 ++++++++---------- 2 files changed, 65 insertions(+), 85 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 18ebbfa7801..5334ee1194a 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -130,8 +130,9 @@ cdef class Buffer: # Must not serialize the parent's stream! return Buffer._reduce_helper, (self.memory_resource, self.get_ipc_descriptor()) - @staticmethod + @classmethod def from_handle( + cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, owner: object | None = None, ) -> Buffer: @@ -157,8 +158,11 @@ cdef class Buffer: When neither ``mr`` nor ``owner`` is specified, this creates a non-owning reference. The pointer will NOT be freed when the :class:`Buffer` is closed or garbage collected. + + Subclasses inherit this method via :meth:`Buffer._init`, so e.g. + ``ManagedBuffer.from_handle(ptr, size)`` returns a ``ManagedBuffer``. """ - return Buffer._init(ptr, size, mr=mr, owner=owner) + return cls._init(ptr, size, mr=mr, owner=owner) @classmethod def from_ipc_descriptor( diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index 3ce5e5b2afb..140c79748fe 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -8,6 +8,7 @@ from cuda.core._device import Device from cuda.core._host import Host from cuda.core._memory._buffer import Buffer +from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch from cuda.core._utils.cuda_utils import driver, handle_return if TYPE_CHECKING: @@ -17,11 +18,37 @@ _INT_SIZE = 4 +# Enum aliases — referenced once per property write, so cache the lookup. 
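+# Passing the enum constants straight to advise() also skips its string-alias
+# normalization on every property write.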
+_ADV = driver.CUmem_advise +_SET_READ_MOSTLY = _ADV.CU_MEM_ADVISE_SET_READ_MOSTLY +_UNSET_READ_MOSTLY = _ADV.CU_MEM_ADVISE_UNSET_READ_MOSTLY +_SET_PREFERRED = _ADV.CU_MEM_ADVISE_SET_PREFERRED_LOCATION +_UNSET_PREFERRED = _ADV.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION +_SET_ACCESSED_BY = _ADV.CU_MEM_ADVISE_SET_ACCESSED_BY +_UNSET_ACCESSED_BY = _ADV.CU_MEM_ADVISE_UNSET_ACCESSED_BY + +_RANGE = driver.CUmem_range_attribute +_ATTR_READ_MOSTLY = _RANGE.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY +_ATTR_PREFERRED = _RANGE.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION +_ATTR_ACCESSED_BY = _RANGE.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY + def _get_int_attr(buf: Buffer, attribute) -> int: return handle_return(driver.cuMemRangeGetAttribute(_INT_SIZE, attribute, buf.handle, buf.size)) +def _query_accessed_by(buf: Buffer) -> list[Device | Host]: + """Read the live ``CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY`` list. + + Driver fills an int32 array: device id, ``-1`` = host, ``-2`` = empty. + Sized to ``cuDeviceGetCount() + 1`` (every visible device plus host). + """ + num_devices = handle_return(driver.cuDeviceGetCount()) + n = num_devices + 1 + raw = handle_return(driver.cuMemRangeGetAttribute(n * _INT_SIZE, _ATTR_ACCESSED_BY, buf.handle, buf.size)) + return [Host() if v == -1 else Device(v) for v in raw if v != -2] + + class AccessedBySet: """Live driver-backed view of ``set_accessed_by`` advice for a managed buffer. @@ -32,9 +59,9 @@ class AccessedBySet: Note ---- - The driver's read-back path returns integer device ordinals (``-1`` for - host); host NUMA distinctions applied via ``Host(numa_id=...)`` are not - distinguishable from a generic ``Host()`` when iterating this set. + The driver returns integer device ordinals (``-1`` for host); host + NUMA distinctions applied via ``Host(numa_id=...)`` collapse to a + generic ``Host()`` when iterating this set. """ __slots__ = ("_buf",) @@ -42,65 +69,41 @@ class AccessedBySet: def __init__(self, buf: ManagedBuffer): self._buf = buf - def _query(self) -> list[Device | Host]: - # Driver fills the array with device ordinals: device id, -1 = host, - # -2 = empty slot. Size must accommodate every CUDA-visible device - # plus a slot for the host. We use cuDeviceGetCount (driver-side) to - # stay independent of NVML availability. 
- num_devices = handle_return(driver.cuDeviceGetCount()) - n = num_devices + 1 - raw = handle_return( - driver.cuMemRangeGetAttribute( - n * _INT_SIZE, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, - self._buf.handle, - self._buf.size, - ) - ) - result: list[Device | Host] = [] - for v in raw: - if v == -2: # CU_DEVICE_INVALID — empty slot - continue - result.append(Host() if v == -1 else Device(v)) - return result - def __contains__(self, location) -> bool: - return location in self._query() + return location in _query_accessed_by(self._buf) def __iter__(self): - return iter(self._query()) + return iter(_query_accessed_by(self._buf)) def __len__(self) -> int: - return len(self._query()) + return len(_query_accessed_by(self._buf)) def __eq__(self, other) -> bool: if isinstance(other, AccessedBySet): - return set(self._query()) == set(other._query()) + return set(_query_accessed_by(self._buf)) == set(_query_accessed_by(other._buf)) if isinstance(other, (set, frozenset)): - return set(self._query()) == other + return set(_query_accessed_by(self._buf)) == other return NotImplemented def __repr__(self) -> str: - return f"AccessedBySet({set(self._query())!r})" + return f"AccessedBySet({set(_query_accessed_by(self._buf))!r})" def add(self, location: Device | Host) -> None: """Apply ``set_accessed_by`` advice for ``location``.""" - from cuda.core.utils import advise - - advise(self._buf, "set_accessed_by", location) + advise(self._buf, _SET_ACCESSED_BY, location) def discard(self, location: Device | Host) -> None: """Apply ``unset_accessed_by`` advice for ``location``.""" - from cuda.core.utils import advise - - advise(self._buf, "unset_accessed_by", location) + advise(self._buf, _UNSET_ACCESSED_BY, location) class ManagedBuffer(Buffer): """Managed (unified) memory buffer with a property-style advice API. - Returned by :meth:`ManagedMemoryResource.allocate`. Wrap an external - managed-memory pointer with :meth:`ManagedBuffer.from_handle`. + Returned by :meth:`ManagedMemoryResource.allocate`, or wrap an + existing managed-memory pointer with :meth:`Buffer.from_handle` + (which dispatches by class — ``ManagedBuffer.from_handle(...)`` + returns a ``ManagedBuffer``). Examples -------- @@ -112,42 +115,25 @@ class ManagedBuffer(Buffer): Note ---- - The driver's read-back path for ``preferred_location`` and - ``accessed_by`` returns integer device ordinals; host NUMA distinctions - applied via ``Host(numa_id=...)`` collapse to a generic ``Host()`` when - queried. Setters preserve full NUMA information when issuing advice. + The legacy ``cuMemRangeGetAttribute`` query path returns integer + device ordinals, so ``Host(numa_id=...)`` collapses to ``Host()`` + on read-back. Setters preserve full NUMA information when issuing + advice. 
""" - @classmethod - def from_handle( - cls, - ptr, - size: int, - mr=None, - owner=None, - ) -> ManagedBuffer: - """Wrap an existing managed-memory pointer in a :class:`ManagedBuffer`.""" - return cls._init(ptr, size, mr=mr, owner=owner) - @property def read_mostly(self) -> bool: - """Whether ``set_read_mostly`` advice is currently applied to this range.""" - return _get_int_attr(self, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY) != 0 + """Whether ``set_read_mostly`` advice is currently applied.""" + return _get_int_attr(self, _ATTR_READ_MOSTLY) != 0 @read_mostly.setter def read_mostly(self, value: bool) -> None: - from cuda.core.utils import advise - - advise(self, "set_read_mostly" if value else "unset_read_mostly") + advise(self, _SET_READ_MOSTLY if value else _UNSET_READ_MOSTLY) @property def preferred_location(self) -> Device | Host | None: - """Currently applied ``set_preferred_location`` target, or ``None`` if unset.""" - # The legacy PREFERRED_LOCATION attribute returns a single int: - # -2 = invalid (no preferred location), -1 = host, >=0 = device ordinal. - # NUMA-specific preferences round-trip as a generic Host (CUDA driver - # limitation of the legacy query path). - loc_id = _get_int_attr(self, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION) + """Currently applied ``set_preferred_location`` target, or ``None``.""" + loc_id = _get_int_attr(self, _ATTR_PREFERRED) if loc_id == -2: return None if loc_id == -1: @@ -156,12 +142,10 @@ def preferred_location(self) -> Device | Host | None: @preferred_location.setter def preferred_location(self, value: Device | Host | None) -> None: - from cuda.core.utils import advise - if value is None: - advise(self, "unset_preferred_location") + advise(self, _UNSET_PREFERRED) else: - advise(self, "set_preferred_location", value) + advise(self, _SET_PREFERRED, value) @property def accessed_by(self) -> AccessedBySet: @@ -171,29 +155,21 @@ def accessed_by(self) -> AccessedBySet: @accessed_by.setter def accessed_by(self, locations) -> None: # Diff against the current driver state and advise only the deltas. 
- from cuda.core.utils import advise - - current = set(AccessedBySet(self)) + current = set(_query_accessed_by(self)) target = set(locations) for loc in current - target: - advise(self, "unset_accessed_by", loc) + advise(self, _UNSET_ACCESSED_BY, loc) for loc in target - current: - advise(self, "set_accessed_by", loc) + advise(self, _SET_ACCESSED_BY, loc) def prefetch(self, location: Device | Host | int, *, stream: Stream | GraphBuilder) -> None: """Prefetch this range to ``location`` on ``stream``.""" - from cuda.core.utils import prefetch as _prefetch - - _prefetch(self, location, stream=stream) + prefetch(self, location, stream=stream) def discard(self, *, stream: Stream | GraphBuilder) -> None: """Discard this range's resident pages on ``stream`` (CUDA 13+).""" - from cuda.core.utils import discard as _discard - - _discard(self, stream=stream) + discard(self, stream=stream) def discard_prefetch(self, location: Device | Host | int, *, stream: Stream | GraphBuilder) -> None: """Discard this range and prefetch to ``location`` on ``stream`` (CUDA 13+).""" - from cuda.core.utils import discard_prefetch as _discard_prefetch - - _discard_prefetch(self, location, stream=stream) + discard_prefetch(self, location, stream=stream) From 5147a7d46ddb74a37b95d5b27a92e1a7ab785163 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 15:20:26 -0700 Subject: [PATCH 42/68] ci: re-trigger CI (transient cuInit INVALID_DEVICE on l4 runner) From 2151e6109ce46a10d1c60690f778e4c5429a3a89 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 17:26:56 -0700 Subject: [PATCH 43/68] refactor(cuda.core): use libcpp.vector for batched-op C arrays (R14) Per Andy's review nit (PR #1775, _managed_memory_ops.pyx:207), replace the manual PyMem_Malloc / PyMem_Free pattern in the three batch helpers (_do_batch_discard, _do_batch_prefetch, _do_batch_discard_prefetch) with libcpp.vector. RAII handles cleanup, eliminating the manual try/finally and removing a leak window if _to_cumemlocation raised mid-fill. Matches the precedent used in _program.pyx, _linker.pyx, _kernel_arg_handler.pyx, _graph_node.pyx, and others. Net change: 53 insertions, 85 deletions. 
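
For reference, the idiom in isolation (`use_sizes` is a hypothetical
stand-in for the driver call):

    cdef vector[size_t] sizes   # stack-owned; the C++ destructor frees it
    sizes.resize(n)             # std::bad_alloc surfaces as MemoryError (`except +`)
    use_sizes(sizes.data(), n)  # .data() exposes the contiguous buffer to C
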
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 138 +++++++----------- 1 file changed, 53 insertions(+), 85 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index f78eabad9d2..3b36f6eff90 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -5,7 +5,7 @@ from __future__ import annotations IF CUDA_CORE_BUILD_MAJOR >= 13: - from cpython.mem cimport PyMem_Free, PyMem_Malloc + from libcpp.vector cimport vector from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport Buffer @@ -201,28 +201,20 @@ cdef void _do_batch_discard(tuple bufs, Stream s): IF CUDA_CORE_BUILD_MAJOR >= 13: cdef Py_ssize_t n = len(bufs) cdef cydriver.CUstream hstream = as_cu(s._h_stream) - cdef cydriver.CUdeviceptr* ptrs = PyMem_Malloc( - n * sizeof(cydriver.CUdeviceptr) - ) - cdef size_t* sizes = PyMem_Malloc(n * sizeof(size_t)) - if not (ptrs and sizes): - PyMem_Free(ptrs) - PyMem_Free(sizes) - raise MemoryError() + cdef vector[cydriver.CUdeviceptr] ptrs + cdef vector[size_t] sizes + ptrs.resize(n) + sizes.resize(n) cdef Buffer buf cdef Py_ssize_t i - try: - for i in range(n): - buf = bufs[i] - ptrs[i] = as_cu(buf._h_ptr) - sizes[i] = buf._size - with nogil: - HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync( - ptrs, sizes, n, 0, hstream, - )) - finally: - PyMem_Free(ptrs) - PyMem_Free(sizes) + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + with nogil: + HANDLE_RETURN(cydriver.cuMemDiscardBatchAsync( + ptrs.data(), sizes.data(), n, 0, hstream, + )) ELSE: raise NotImplementedError( "discard requires a CUDA 13 build of cuda.core" @@ -374,40 +366,28 @@ cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): IF CUDA_CORE_BUILD_MAJOR >= 13: cdef Py_ssize_t n = len(bufs) cdef cydriver.CUstream hstream = as_cu(s._h_stream) - cdef cydriver.CUdeviceptr* ptrs = PyMem_Malloc( - n * sizeof(cydriver.CUdeviceptr) - ) - cdef size_t* sizes = PyMem_Malloc(n * sizeof(size_t)) - cdef cydriver.CUmemLocation* loc_arr = PyMem_Malloc( - n * sizeof(cydriver.CUmemLocation) - ) - cdef size_t* loc_indices = PyMem_Malloc(n * sizeof(size_t)) - if not (ptrs and sizes and loc_arr and loc_indices): - PyMem_Free(ptrs) - PyMem_Free(sizes) - PyMem_Free(loc_arr) - PyMem_Free(loc_indices) - raise MemoryError() + cdef vector[cydriver.CUdeviceptr] ptrs + cdef vector[size_t] sizes + cdef vector[cydriver.CUmemLocation] loc_arr + cdef vector[size_t] loc_indices + ptrs.resize(n) + sizes.resize(n) + loc_arr.resize(n) + loc_indices.resize(n) cdef Buffer buf cdef Py_ssize_t i - try: - for i in range(n): - buf = bufs[i] - ptrs[i] = as_cu(buf._h_ptr) - sizes[i] = buf._size - loc_arr[i] = _to_cumemlocation(locs[i]) - loc_indices[i] = i - with nogil: - HANDLE_RETURN(cydriver.cuMemPrefetchBatchAsync( - ptrs, sizes, n, - loc_arr, loc_indices, n, - 0, hstream, - )) - finally: - PyMem_Free(ptrs) - PyMem_Free(sizes) - PyMem_Free(loc_arr) - PyMem_Free(loc_indices) + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + loc_arr[i] = _to_cumemlocation(locs[i]) + loc_indices[i] = i + with nogil: + HANDLE_RETURN(cydriver.cuMemPrefetchBatchAsync( + ptrs.data(), sizes.data(), n, + loc_arr.data(), loc_indices.data(), n, + 0, hstream, + )) ELSE: raise NotImplementedError( "batched prefetch requires a CUDA 13 build of cuda.core" @@ -463,40 +443,28 @@ cdef void 
_do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s): IF CUDA_CORE_BUILD_MAJOR >= 13: cdef Py_ssize_t n = len(bufs) cdef cydriver.CUstream hstream = as_cu(s._h_stream) - cdef cydriver.CUdeviceptr* ptrs = PyMem_Malloc( - n * sizeof(cydriver.CUdeviceptr) - ) - cdef size_t* sizes = PyMem_Malloc(n * sizeof(size_t)) - cdef cydriver.CUmemLocation* loc_arr = PyMem_Malloc( - n * sizeof(cydriver.CUmemLocation) - ) - cdef size_t* loc_indices = PyMem_Malloc(n * sizeof(size_t)) - if not (ptrs and sizes and loc_arr and loc_indices): - PyMem_Free(ptrs) - PyMem_Free(sizes) - PyMem_Free(loc_arr) - PyMem_Free(loc_indices) - raise MemoryError() + cdef vector[cydriver.CUdeviceptr] ptrs + cdef vector[size_t] sizes + cdef vector[cydriver.CUmemLocation] loc_arr + cdef vector[size_t] loc_indices + ptrs.resize(n) + sizes.resize(n) + loc_arr.resize(n) + loc_indices.resize(n) cdef Buffer buf cdef Py_ssize_t i - try: - for i in range(n): - buf = bufs[i] - ptrs[i] = as_cu(buf._h_ptr) - sizes[i] = buf._size - loc_arr[i] = _to_cumemlocation(locs[i]) - loc_indices[i] = i - with nogil: - HANDLE_RETURN(cydriver.cuMemDiscardAndPrefetchBatchAsync( - ptrs, sizes, n, - loc_arr, loc_indices, n, - 0, hstream, - )) - finally: - PyMem_Free(ptrs) - PyMem_Free(sizes) - PyMem_Free(loc_arr) - PyMem_Free(loc_indices) + for i in range(n): + buf = bufs[i] + ptrs[i] = as_cu(buf._h_ptr) + sizes[i] = buf._size + loc_arr[i] = _to_cumemlocation(locs[i]) + loc_indices[i] = i + with nogil: + HANDLE_RETURN(cydriver.cuMemDiscardAndPrefetchBatchAsync( + ptrs.data(), sizes.data(), n, + loc_arr.data(), loc_indices.data(), n, + 0, hstream, + )) ELSE: raise NotImplementedError( "discard_prefetch requires a CUDA 13 build of cuda.core" From 5c6d054043f905fbcf5726f6edf55da4e7afccf9 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 17:32:49 -0700 Subject: [PATCH 44/68] fix(cuda.core): restore CUDA_ERROR_NOT_INITIALIZED auto-init in _query_memory_attrs (R4) Per Leo's review on PR #1775 (_buffer.pyx:455), restore the auto-init retry that was removed in 10de998e6f. cuPointerGetAttributes is the first driver call _query_memory_attrs makes, and a NOT_INITIALIZED result here would otherwise propagate out of every is_managed / is_host_accessible / is_device_accessible query before the user has called any other Device API. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 5334ee1194a..18eaabf7ce6 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -36,6 +36,7 @@ else: from cuda.core._dlpack import classify_dl_device, make_py_capsule from cuda.core._utils.cuda_utils import driver +from cuda.core._device import Device # ============================================================================= @@ -452,6 +453,11 @@ cdef inline int _query_memory_attrs( cdef cydriver.CUresult ret ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) + if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: + with cython.gil: + # Device class handles the cuInit call internally + Device() + ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) HANDLE_RETURN(ret) # TODO: HMM/ATS-enabled sysmem should also report is_managed=True; the From 47d5609e98fefeb7896e9a9dcc38c708370990d2 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 17:44:57 -0700 Subject: [PATCH 45/68] refactor(cuda.core): make Host a plain class instead of a dataclass (R1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Leo's review on PR #1775 (_host.py:9), drop the @dataclass(frozen=True) in favor of a hand-written class with property accessors. Matches Leo's original sketch from the 2026-04-28 drive-by comment and aligns with how Device is structured in this codebase. Behavior preserved: Host(), Host(numa_id=N), and Host.numa_current() all work identically. __eq__, __hash__, and immutability are hand-rolled rather than dataclass-generated. is_numa_current is no longer an __init__ kwarg — it's internal state settable only via the Host.numa_current() classmethod. Two existing TestHost cases updated: - test_numa_current_with_id_rejected → test_numa_current_only_via_classmethod - test_frozen → test_immutable (AttributeError instead of FrozenInstanceError) Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_host.py | 39 ++++++++++++++++------ cuda_core/tests/memory/test_managed_ops.py | 13 ++++---- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/cuda_core/cuda/core/_host.py b/cuda_core/cuda/core/_host.py index e3aa4aebe5e..b595222fa03 100644 --- a/cuda_core/cuda/core/_host.py +++ b/cuda_core/cuda/core/_host.py @@ -3,10 +3,7 @@ from __future__ import annotations -from dataclasses import dataclass - -@dataclass(frozen=True) class Host: """Host (CPU) location for managed-memory operations. @@ -22,19 +19,39 @@ class Host: and to ``ManagedBuffer.preferred_location`` / ``accessed_by``. 
""" - numa_id: int | None = None - is_numa_current: bool = False + __slots__ = ("_is_numa_current", "_numa_id") + + def __init__(self, numa_id: int | None = None) -> None: + if numa_id is not None and (not isinstance(numa_id, int) or numa_id < 0): + raise ValueError(f"numa_id must be a non-negative int, got {numa_id!r}") + object.__setattr__(self, "_numa_id", numa_id) + object.__setattr__(self, "_is_numa_current", False) + + @property + def numa_id(self) -> int | None: + return self._numa_id - def __post_init__(self) -> None: - if self.is_numa_current and self.numa_id is not None: - raise ValueError("Host.numa_current() cannot have an explicit numa_id") - if self.numa_id is not None and (not isinstance(self.numa_id, int) or self.numa_id < 0): - raise ValueError(f"numa_id must be a non-negative int, got {self.numa_id!r}") + @property + def is_numa_current(self) -> bool: + return self._is_numa_current @classmethod def numa_current(cls) -> Host: """Construct a ``Host`` referring to the calling thread's NUMA node.""" - return cls(is_numa_current=True) + h = cls() + object.__setattr__(h, "_is_numa_current", True) + return h + + def __setattr__(self, name: str, value) -> None: + raise AttributeError(f"{type(self).__name__} is immutable; cannot set {name!r}") + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Host): + return NotImplemented + return self._numa_id == other._numa_id and self._is_numa_current == other._is_numa_current + + def __hash__(self) -> int: + return hash((Host, self._numa_id, self._is_numa_current)) def __repr__(self) -> str: if self.is_numa_current: diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index 1bec7caa855..1c6c045bdb3 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -307,15 +307,14 @@ def test_invalid_numa_id(self): with pytest.raises(ValueError, match="numa_id must be a non-negative int"): Host(numa_id=-1) - def test_numa_current_with_id_rejected(self): - with pytest.raises(ValueError, match="numa_current"): - Host(numa_id=0, is_numa_current=True) - - def test_frozen(self): - import dataclasses + def test_numa_current_only_via_classmethod(self): + # is_numa_current is internal state, only settable via Host.numa_current() + with pytest.raises(TypeError): + Host(is_numa_current=True) # type: ignore[call-arg] + def test_immutable(self): h = Host(numa_id=2) - with pytest.raises(dataclasses.FrozenInstanceError): + with pytest.raises(AttributeError): h.numa_id = 3 def test_eq_hash(self): From a40bb8145a95ebbed2d306a12b7a46422eae6c91 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 17:57:30 -0700 Subject: [PATCH 46/68] feat(cuda.core)!: drop int location shorthand from managed-memory ops (R6, R8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Leo's review on PR #1775 (_managed_buffer.py:165) and Andy's parallel question (line 144), drop the `int` shorthand for prefetch/discard_prefetch/advise locations. The previous design accepted `Device | Host | int` where `int >= 0` meant a device ordinal and `-1` magically meant host. With first-class `Device` and `Host`, the int form was redundant and the `-1 → Host` magic was surprising. Public API change: prefetch(buf, Device(0), stream=...) # was: prefetch(buf, 0, stream=...) prefetch(buf, Host(), stream=...) # was: prefetch(buf, -1, stream=...) 
This also resolves an inconsistency: ManagedBuffer.preferred_location already accepted only Device | Host | None, but prefetch() and discard_prefetch() accepted int. Now uniformly Device | Host. Pre-1.0 breaking change. Anyone using the int shorthand should switch to the explicit Device(N) / Host() form. Files touched: - _managed_location.py: drop the int branch from _coerce_location; TypeError now reads "Device, Host, or None" - _managed_buffer.py: type signatures `Device | Host | int` → `Device | Host` - _managed_memory_ops.pyx: docstring updates (3 occurrences) - tests/memory/test_managed_ops.py: replace int call sites with Host()/Device(N); collapse three int-branch tests into one test_int_rejected - 1.0.0-notes.rst: drop the "int values are also accepted" sentence Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_buffer.py | 4 +- .../cuda/core/_memory/_managed_location.py | 8 +--- .../cuda/core/_memory/_managed_memory_ops.pyx | 6 +-- cuda_core/docs/source/release/1.0.0-notes.rst | 3 +- cuda_core/tests/memory/test_managed_ops.py | 37 +++++++------------ 5 files changed, 20 insertions(+), 38 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index 140c79748fe..251ebd66dff 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -162,7 +162,7 @@ def accessed_by(self, locations) -> None: for loc in target - current: advise(self, _SET_ACCESSED_BY, loc) - def prefetch(self, location: Device | Host | int, *, stream: Stream | GraphBuilder) -> None: + def prefetch(self, location: Device | Host, *, stream: Stream | GraphBuilder) -> None: """Prefetch this range to ``location`` on ``stream``.""" prefetch(self, location, stream=stream) @@ -170,6 +170,6 @@ def discard(self, *, stream: Stream | GraphBuilder) -> None: """Discard this range's resident pages on ``stream`` (CUDA 13+).""" discard(self, stream=stream) - def discard_prefetch(self, location: Device | Host | int, *, stream: Stream | GraphBuilder) -> None: + def discard_prefetch(self, location: Device | Host, *, stream: Stream | GraphBuilder) -> None: """Discard this range and prefetch to ``location`` on ``stream`` (CUDA 13+).""" discard_prefetch(self, location, stream=stream) diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index 5e23a968cc3..4dae76f7479 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -48,10 +48,4 @@ def _coerce_location(value, *, allow_none: bool = False) -> _LocSpec | None: if allow_none: return None raise ValueError("location is required") - if isinstance(value, int): - if value == -1: - return _LocSpec(kind="host") - if value >= 0: - return _LocSpec(kind="device", id=value) - raise ValueError(f"device ordinal must be >= 0 (or -1 for host), got {value}") - raise TypeError(f"location must be a Device, Host, int, or None; got {type(value).__name__}") + raise TypeError(f"location must be a Device, Host, or None; got {type(value).__name__}") diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 3b36f6eff90..69d378424b4 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -239,7 +239,7 @@ def advise( ``"unset_read_mostly"``, ``"set_preferred_location"``, ``"unset_preferred_location"``, 
``"set_accessed_by"``, ``"unset_accessed_by"``) and ``CUmem_advise`` enum values are accepted. - location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | int | Sequence[...] + location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...] Target location(s). Required for advice values that consult a location; ignored (may be ``None``) for ``set_read_mostly``, ``unset_read_mostly``, and ``unset_preferred_location``. A sequence @@ -313,7 +313,7 @@ def prefetch( ---------- targets : :class:`Buffer` | Sequence[:class:`Buffer`] One or more managed allocations to operate on. - location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | int | Sequence[...] + location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...] Target location(s). A single location applies to all targets; a sequence must match ``len(targets)``. ``int`` values are coerced to a location (``-1`` maps to host, ``>=0`` to that device ordinal). @@ -407,7 +407,7 @@ def discard_prefetch( ---------- targets : :class:`Buffer` | Sequence[:class:`Buffer`] One or more managed allocations to discard and re-prefetch. - location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | int | Sequence[...] + location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...] Target location(s). A single location applies to all targets; a sequence must match ``len(targets)``. options : :class:`DiscardPrefetchOptions`, optional diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 35d200a9ad4..d3d2119196a 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -43,8 +43,7 @@ New features single managed :class:`Buffer` or a sequence; with cuda.bindings 12.8+ the N>1 case dispatches to the corresponding ``cuMem*BatchAsync`` driver entry point, addressing the managed-memory portion of #1333. Locations - are expressed via :class:`Device` or :class:`Host`; ``int`` values are - also accepted (``-1`` maps to host, ``>=0`` to that device ordinal). + are expressed via :class:`Device` or :class:`Host`. 
Per-call options use frozen dataclasses (:class:`~utils.AdviseOptions`, :class:`~utils.PrefetchOptions`, :class:`~utils.DiscardOptions`, :class:`~utils.DiscardPrefetchOptions`) diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index 1c6c045bdb3..16f6be92989 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -68,7 +68,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + utils.prefetch(buffer, Host(), stream=stream) stream.sync() last_location = _get_int_mem_range_attr( buffer, @@ -143,7 +143,7 @@ def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_ buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + utils.prefetch(buffer, Host(), stream=stream) stream.sync() utils.discard_prefetch(buffer, device, stream=stream) @@ -166,7 +166,7 @@ def test_managed_memory_discard_prefetch_supports_external_managed_allocations(i buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - utils.prefetch(buffer, _HOST_LOCATION_ID, stream=stream) + utils.prefetch(buffer, Host(), stream=stream) stream.sync() utils.discard_prefetch(buffer, device, stream=stream) @@ -227,8 +227,8 @@ def test_managed_memory_advise_location_validation(init_cuda): # set_read_mostly works without a location (location is ignored) utils.advise(buffer, "set_read_mostly") - # set_preferred_location requires a location; device ordinal works - utils.advise(buffer, "set_preferred_location", device.device_id) + # set_preferred_location requires a location; Device works + utils.advise(buffer, "set_preferred_location", device) # set_preferred_location with host location utils.advise(buffer, "set_preferred_location", Host()) @@ -241,9 +241,9 @@ def test_managed_memory_advise_location_validation(init_cuda): with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): utils.advise(buffer, "set_accessed_by", Host.numa_current()) - # Inferred location from int: -1 maps to host, 0 maps to device - utils.advise(buffer, "set_preferred_location", -1) - utils.advise(buffer, "set_preferred_location", 0) + # Both Host and Device locations are accepted + utils.advise(buffer, "set_preferred_location", Host()) + utils.advise(buffer, "set_preferred_location", Device(0)) buffer.close() @@ -356,18 +356,6 @@ def test_host_numa_current_passthrough(self): spec = _coerce_location(Host.numa_current()) assert spec.kind == "host_numa_current" - def test_int_device(self): - from cuda.core._memory._managed_location import _coerce_location - - spec = _coerce_location(0) - assert spec.kind == "device" - assert spec.id == 0 - - def test_int_minus_one_is_host(self): - from cuda.core._memory._managed_location import _coerce_location - - assert _coerce_location(-1).kind == "host" - def test_none_when_disallowed(self): from cuda.core._memory._managed_location import _coerce_location @@ -379,16 +367,17 @@ def test_none_when_allowed(self): assert _coerce_location(None, allow_none=True) is None - def test_bad_int(self): + def test_int_rejected(self): from cuda.core._memory._managed_location import _coerce_location - with pytest.raises(ValueError, match="device ordinal"): - _coerce_location(-2) + # 
int shorthand was removed in favor of explicit Device/Host + with pytest.raises(TypeError, match="Device, Host, or None"): + _coerce_location(0) def test_bad_type(self): from cuda.core._memory._managed_location import _coerce_location - with pytest.raises(TypeError, match="Device, Host, int, or None"): + with pytest.raises(TypeError, match="Device, Host, or None"): _coerce_location("device") From c43e81eaca48b6785a5671eac7a48e04ec968ee5 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 18:00:36 -0700 Subject: [PATCH 47/68] docs(cuda.core): add AccessedBySet to api_private.rst (R5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Andy's review on PR #1775 (_managed_buffer.py:52), document `AccessedBySet` in the private API reference. It is returned by `ManagedBuffer.accessed_by` but not directly instantiable by users — matches the existing `_memory._ipc.*` entries in the same section. Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/docs/source/api_private.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index 141773967e8..cc7408096b5 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -32,6 +32,7 @@ CUDA runtime _device.DeviceProperties _memory._ipc.IPCAllocationHandle _memory._ipc.IPCBufferDescriptor + _memory._managed_buffer.AccessedBySet CUDA graphs From 71e9daac636e281bf9f120faa959f8deb511208e Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Thu, 30 Apr 2026 18:27:21 -0700 Subject: [PATCH 48/68] docs(cuda.core): note the legacy NUMA round-trip limitation on preferred_location (R2, R7) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Leo's questions on PR #1775 (_host.py:26 and _managed_buffer.py:140): R2 (Host numa_id): the dataclass surface is intentional. Three forms already cover the use cases — Host() / Host(numa_id=N) / Host.numa_current(). Auto-inferring numa_id at Host() construction would conflict with the "generic host" semantic. R7 (preferred_location getter): the underlying limitation is real but upstream-blocked. The legacy CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION returns only a single int (device id, -1 host, -2 none) — no NUMA. CUDA 13 added _PREFERRED_LOCATION_TYPE / _ID for full round-trip, and they are exposed in cydriver, but cuda.bindings' _HelperCUmem_range_attribute does not yet recognize them — calling driver.cuMemRangeGetAttribute with the new attributes raises "Unsupported attribute". Once cuda.bindings adds them, this getter can query the v2 attributes and return Host(numa_id=N). Add a docstring note documenting the limitation so users aren't surprised by the lossy round-trip. Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_managed_buffer.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index 251ebd66dff..af26a0382eb 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -132,7 +132,18 @@ def read_mostly(self, value: bool) -> None: @property def preferred_location(self) -> Device | Host | None: - """Currently applied ``set_preferred_location`` target, or ``None``.""" + """Currently applied ``set_preferred_location`` target, or ``None``. + + .. 
note:: + The legacy ``CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION`` carries + only a device ordinal (or ``-1`` for host) and cannot represent + a specific NUMA node. As a result, ``Host(numa_id=N)`` set via + the setter currently round-trips back as ``Host()``. The CUDA 13 + driver added ``..._PREFERRED_LOCATION_TYPE`` / ``..._ID`` for + full ``CUmemLocation`` round-trip, but ``cuda.bindings`` does + not yet expose these via ``cuMemRangeGetAttribute``; once it + does, this getter will be upgraded. + """ loc_id = _get_int_attr(self, _ATTR_PREFERRED) if loc_id == -2: return None From df928a00aa608964e38f1368657767d16d74d167 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 1 May 2026 07:37:57 -0700 Subject: [PATCH 49/68] refactor(cuda.core): use collections.abc.Sequence for input checks (R12, R13) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Andy's review on PR #1775 (_managed_memory_ops.pyx:102 and :118), replace `isinstance(x, (list, tuple))` with `isinstance(x, Sequence)` in `_coerce_buffer_targets` and `_broadcast_locations`. Matches the existing precedent in `cuda.core._utils.cuda_utils.is_sequence()`. The widened input set also accepts `str`, but neither `Buffer` nor `Location` is stringly-typed, so a `str` input still raises — just with a different message (Buffer cast error or Location TypeError from `_coerce_location`). Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 69d378424b4..1ef96e3f653 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -4,6 +4,8 @@ from __future__ import annotations +from collections.abc import Sequence + IF CUDA_CORE_BUILD_MAJOR >= 13: from libcpp.vector cimport vector @@ -99,7 +101,7 @@ cdef tuple _coerce_buffer_targets(object targets, str what): cdef list out if isinstance(targets, Buffer): return (targets,) - if isinstance(targets, (list, tuple)): + if isinstance(targets, Sequence): if not targets: raise ValueError(f"{what}: empty targets sequence") out = [] @@ -115,7 +117,7 @@ cdef tuple _coerce_buffer_targets(object targets, str what): cdef tuple _broadcast_locations(object location, Py_ssize_t n, bint allow_none, str what): cdef object coerced - if isinstance(location, (list, tuple)): + if isinstance(location, Sequence): if len(location) != n: raise ValueError( f"{what}: location length {len(location)} does not match " From f522916ed0ab2ce11d34188e686fceeb62fd9ace Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 1 May 2026 07:54:26 -0700 Subject: [PATCH 50/68] refactor(cuda.core): narrow Buffer.from_handle to Buffer-only (R3) Per Leo's review on PR #1775 (_buffer.pyx:135), make Buffer.from_handle a @staticmethod that always returns Buffer. Subclass-aware construction stays available via the private @classmethod Buffer._init, which is what Leo asked for ("use a private method for handling subclasses for now"). ManagedBuffer gains its own @classmethod from_handle that wraps cls._init, so user-facing call sites like ManagedBuffer.from_handle(ptr, size, owner=plain) continue to work unchanged. The narrowly-scoped subclass factory is on the subclass itself, not bolted onto Buffer's public surface. 
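A sketch of the resulting call shapes (`plain` here is an illustrative
Buffer from a managed-memory resource, not a name from the diff):

    buf = Buffer.from_handle(plain.handle, plain.size)  # always a plain Buffer
    mbuf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain)
    assert type(buf) is Buffer and type(mbuf) is ManagedBuffer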
This addresses R3's spirit: cuda.core's public APIs no longer advertise generic subclass-construction support that conflicts with the broader subclassing story tracked in #750 / #1989. No test changes; behavior preserved. Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 8 ++--- .../cuda/core/_memory/_managed_buffer.py | 34 +++++++++++++++++-- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 18eaabf7ce6..f0a1ec10745 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -131,9 +131,8 @@ cdef class Buffer: # Must not serialize the parent's stream! return Buffer._reduce_helper, (self.memory_resource, self.get_ipc_descriptor()) - @classmethod + @staticmethod def from_handle( - cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, owner: object | None = None, ) -> Buffer: @@ -159,11 +158,8 @@ cdef class Buffer: When neither ``mr`` nor ``owner`` is specified, this creates a non-owning reference. The pointer will NOT be freed when the :class:`Buffer` is closed or garbage collected. - - Subclasses inherit this method via :meth:`Buffer._init`, so e.g. - ``ManagedBuffer.from_handle(ptr, size)`` returns a ``ManagedBuffer``. """ - return cls._init(ptr, size, mr=mr, owner=owner) + return Buffer._init(ptr, size, mr=mr, owner=owner) @classmethod def from_ipc_descriptor( diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index af26a0382eb..bacfe3d7c36 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -12,6 +12,7 @@ from cuda.core._utils.cuda_utils import driver, handle_return if TYPE_CHECKING: + from cuda.core._memory._buffer import MemoryResource from cuda.core._stream import Stream from cuda.core.graph import GraphBuilder @@ -101,9 +102,7 @@ class ManagedBuffer(Buffer): """Managed (unified) memory buffer with a property-style advice API. Returned by :meth:`ManagedMemoryResource.allocate`, or wrap an - existing managed-memory pointer with :meth:`Buffer.from_handle` - (which dispatches by class — ``ManagedBuffer.from_handle(...)`` - returns a ``ManagedBuffer``). + existing managed-memory pointer with :meth:`ManagedBuffer.from_handle`. Examples -------- @@ -121,6 +120,35 @@ class ManagedBuffer(Buffer): advice. """ + @classmethod + def from_handle( + cls, + ptr, + size: int, + mr: MemoryResource | None = None, + owner: object | None = None, + ) -> ManagedBuffer: + """Wrap an existing managed-memory pointer in a :class:`ManagedBuffer`. + + Use this when you have an externally-allocated managed pointer + and want the property-style advice API (:attr:`read_mostly`, + :attr:`preferred_location`, :attr:`accessed_by`). + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + Pointer to a managed allocation. + size : int + Allocation size in bytes. + mr : :obj:`~_memory.MemoryResource`, optional + Memory resource that owns ``ptr``. When provided, its + ``deallocate`` is called when the buffer is closed. + owner : object, optional + An object that keeps the underlying allocation alive. + ``owner`` and ``mr`` cannot both be specified. 
+        """
+        return cls._init(ptr, size, mr=mr, owner=owner)
+
     @property
     def read_mostly(self) -> bool:
         """Whether ``set_read_mostly`` advice is currently applied."""

From 6204c57e4a73f64febb2e0ef7f5971cc69e39cc6 Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Fri, 1 May 2026 08:55:24 -0700
Subject: [PATCH 51/68] refactor(cuda.core): single API surface per operation
 (R9, R10, R11)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per Leo's R11 ("if we prefer methods, don't expose free functions"):
each managed-memory operation now has exactly one public surface,
chosen by whether it acts on one buffer or many.

Single buffer (instance methods + properties on ManagedBuffer):
- buf.read_mostly = True
- buf.preferred_location = Device(0)
- buf.accessed_by.add(Device(1))
- buf.prefetch(Device(0), stream=stream)
- buf.discard(stream=stream)
- buf.discard_prefetch(Device(0), stream=stream)

Multiple buffers (free functions in cuda.core.utils, CUDA 13+ only):
- utils.prefetch_batch(buffers, locations, stream=stream)
- utils.discard_batch(buffers, stream=stream)
- utils.discard_prefetch_batch(buffers, locations, stream=stream)

Removed:
- cuda.core.utils.advise / prefetch / discard / discard_prefetch
  (single-buffer surfaces — replaced by ManagedBuffer
  methods/properties)
- cuda.core._memory._managed_memory_options module and its four empty
  AdviseOptions / PrefetchOptions / DiscardOptions /
  DiscardPrefetchOptions dataclasses (R9 from Leo, R10 from Andy: empty
  placeholders that didn't carry information)
- options=None parameter from every public surface
- The single-buffer fast path inside the now-batched-only free
  functions; they always hit cuMem*BatchAsync now

Internals:
- Public def advise() deleted; _advise_one (a module-level def, since
  the pure-Python ManagedBuffer module imports it) is the new internal
  single-buffer entry point used by ManagedBuffer property setters.
- Three new Python-level wrappers _do_single_prefetch_py /
  _do_single_discard_py / _do_single_discard_prefetch_py used by
  ManagedBuffer instance methods. They coerce the stream and location
  at the Python boundary, then dispatch to the cdef helpers:
  cuMemPrefetchAsync for prefetch, and the batched entry points with
  count=1 for discard and discard-prefetch, which have no single-range
  driver call.
- _coerce_buffer_targets renamed to _coerce_batch_buffers; rejects a
  single Buffer with a TypeError pointing at the ManagedBuffer method.

Tests:
- TestPrefetch / TestDiscard / TestDiscardPrefetch rewritten as
  TestPrefetchBatch / TestDiscardBatch / TestDiscardPrefetchBatch
  (batched-only, since single-buffer behavior is covered by the
  TestManagedBuffer class); TestAdvise deleted outright, since advise
  no longer has a free-function surface
- Single-buffer external-allocation tests use
  ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) to
  wrap a DummyUnifiedMemoryResource buffer
- options-related tests deleted (no options surface to test)
- enum-value advise test deleted (property setters are typed; the
  string-alias / enum-value internal API isn't user-visible)

Release notes updated.

Closes R9, R10, R11.
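A minimal usage sketch of the split (assumes an initialized `device`, a
managed memory resource `mr`, and a CUDA 13 build; the allocation size
is illustrative):

    bufs = [mr.allocate(4096) for _ in range(3)]
    stream = device.create_stream()
    utils.prefetch_batch(bufs, [Host(), device, device], stream=stream)
    bufs[0].read_mostly = True        # single-buffer: ManagedBuffer property
    bufs[0].prefetch(device, stream=stream)
    stream.sync()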
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_buffer.py | 27 +- .../cuda/core/_memory/_managed_memory_ops.pyx | 242 ++++++------- .../core/_memory/_managed_memory_options.py | 43 --- cuda_core/cuda/core/utils.py | 21 +- cuda_core/docs/source/release/1.0.0-notes.rst | 23 +- cuda_core/tests/memory/test_managed_ops.py | 338 +++++------------- 6 files changed, 221 insertions(+), 473 deletions(-) delete mode 100644 cuda_core/cuda/core/_memory/_managed_memory_options.py diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index bacfe3d7c36..a6649fbaa77 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -8,7 +8,12 @@ from cuda.core._device import Device from cuda.core._host import Host from cuda.core._memory._buffer import Buffer -from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch +from cuda.core._memory._managed_memory_ops import ( + _advise_one, + _do_single_discard_prefetch_py, + _do_single_discard_py, + _do_single_prefetch_py, +) from cuda.core._utils.cuda_utils import driver, handle_return if TYPE_CHECKING: @@ -91,11 +96,11 @@ def __repr__(self) -> str: def add(self, location: Device | Host) -> None: """Apply ``set_accessed_by`` advice for ``location``.""" - advise(self._buf, _SET_ACCESSED_BY, location) + _advise_one(self._buf, _SET_ACCESSED_BY, location) def discard(self, location: Device | Host) -> None: """Apply ``unset_accessed_by`` advice for ``location``.""" - advise(self._buf, _UNSET_ACCESSED_BY, location) + _advise_one(self._buf, _UNSET_ACCESSED_BY, location) class ManagedBuffer(Buffer): @@ -156,7 +161,7 @@ def read_mostly(self) -> bool: @read_mostly.setter def read_mostly(self, value: bool) -> None: - advise(self, _SET_READ_MOSTLY if value else _UNSET_READ_MOSTLY) + _advise_one(self, _SET_READ_MOSTLY if value else _UNSET_READ_MOSTLY, None) @property def preferred_location(self) -> Device | Host | None: @@ -182,9 +187,9 @@ def preferred_location(self) -> Device | Host | None: @preferred_location.setter def preferred_location(self, value: Device | Host | None) -> None: if value is None: - advise(self, _UNSET_PREFERRED) + _advise_one(self, _UNSET_PREFERRED, None) else: - advise(self, _SET_PREFERRED, value) + _advise_one(self, _SET_PREFERRED, value) @property def accessed_by(self) -> AccessedBySet: @@ -197,18 +202,18 @@ def accessed_by(self, locations) -> None: current = set(_query_accessed_by(self)) target = set(locations) for loc in current - target: - advise(self, _UNSET_ACCESSED_BY, loc) + _advise_one(self, _UNSET_ACCESSED_BY, loc) for loc in target - current: - advise(self, _SET_ACCESSED_BY, loc) + _advise_one(self, _SET_ACCESSED_BY, loc) def prefetch(self, location: Device | Host, *, stream: Stream | GraphBuilder) -> None: """Prefetch this range to ``location`` on ``stream``.""" - prefetch(self, location, stream=stream) + _do_single_prefetch_py(self, location, stream) def discard(self, *, stream: Stream | GraphBuilder) -> None: """Discard this range's resident pages on ``stream`` (CUDA 13+).""" - discard(self, stream=stream) + _do_single_discard_py(self, stream) def discard_prefetch(self, location: Device | Host, *, stream: Stream | GraphBuilder) -> None: """Discard this range and prefetch to ``location`` on ``stream`` (CUDA 13+).""" - discard_prefetch(self, location, stream=stream) + _do_single_discard_prefetch_py(self, location, stream) diff --git 
a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 1ef96e3f653..cc45273641c 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -17,12 +17,6 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import driver from cuda.core._memory._managed_location import _coerce_location -from cuda.core._memory._managed_memory_options import ( - AdviseOptions, - DiscardOptions, - DiscardPrefetchOptions, - PrefetchOptions, -) cdef dict _MANAGED_ADVICE_ALIASES = { @@ -96,22 +90,30 @@ cdef void _require_managed_buffer(Buffer self, str what): raise ValueError(f"{what} requires a managed-memory allocation") -cdef tuple _coerce_buffer_targets(object targets, str what): +cdef tuple _coerce_batch_buffers(object buffers, str what): + """Coerce ``buffers`` to a tuple[Buffer, ...]; rejects a single Buffer. + + For single-buffer operations, use the corresponding ManagedBuffer + instance method instead. + """ cdef Buffer buf cdef list out - if isinstance(targets, Buffer): - return (targets,) - if isinstance(targets, Sequence): - if not targets: - raise ValueError(f"{what}: empty targets sequence") + if isinstance(buffers, Buffer): + raise TypeError( + f"{what}: pass a sequence of Buffers; for a single buffer use " + f"the ManagedBuffer instance method" + ) + if isinstance(buffers, Sequence): + if not buffers: + raise ValueError(f"{what}: empty buffers sequence") out = [] - for t in targets: + for t in buffers: buf = t out.append(buf) return tuple(out) raise TypeError( - f"{what}: targets must be a Buffer or sequence of Buffer, " - f"got {type(targets).__name__}" + f"{what}: buffers must be a sequence of Buffer, " + f"got {type(buffers).__name__}" ) @@ -159,43 +161,43 @@ ELSE: ) -def discard( - targets, - *, - options=None, - stream, -): - """Discard one or more managed-memory ranges. +def discard_batch(buffers, *, stream): + """Discard a batch of managed-memory ranges. + + Requires CUDA 13+. For a single buffer, use + :meth:`ManagedBuffer.discard` instead. Parameters ---------- - targets : :class:`Buffer` | Sequence[:class:`Buffer`] - One or more managed allocations to discard. Their resident pages - are released without prefetching new contents; subsequent access - is satisfied by lazy migration. - options : :class:`DiscardOptions`, optional - Reserved for future per-call flags. ``None`` (default) and - ``DiscardOptions()`` are equivalent. + buffers : Sequence[:class:`Buffer`] + Two or more managed allocations to discard. Resident pages are + released without prefetching new contents; subsequent access is + satisfied by lazy migration. stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` Stream for the asynchronous discard (keyword-only). Raises ------ NotImplementedError - On a CUDA 12 build of ``cuda.core``. Discard requires CUDA 13+. + On a CUDA 12 build of ``cuda.core``. 
""" - if options is not None and not isinstance(options, DiscardOptions): - raise TypeError( - "discard options must be a DiscardOptions instance or None, " - f"got {type(options).__name__}" - ) - cdef tuple bufs = _coerce_buffer_targets(targets, "discard") + cdef tuple bufs = _coerce_batch_buffers(buffers, "discard_batch") cdef Stream s = Stream_accept(stream) cdef Buffer buf for buf in bufs: - _require_managed_buffer(buf, "discard") + _require_managed_buffer(buf, "discard_batch") + + _do_batch_discard(bufs, s) + +def _do_single_discard_py(Buffer buf, stream): + """Internal: single-buffer discard for ManagedBuffer.discard().""" + _require_managed_buffer(buf, "discard") + cdef Stream s = Stream_accept(stream) + # No single-range cuMemDiscard exists; route through the batched call + # with count=1. + cdef tuple bufs = (buf,) _do_batch_discard(bufs, s) @@ -223,61 +225,24 @@ cdef void _do_batch_discard(tuple bufs, Stream s): ) -def advise( - targets, - advice, - location=None, - *, - options=None, -): - """Apply managed-memory advice to one or more allocation ranges. +def _advise_one(Buffer buf, advice, location): + """Internal: apply managed-memory advice to a single buffer. - Parameters - ---------- - targets : :class:`Buffer` | Sequence[:class:`Buffer`] - One or more managed allocations to advise. - advice : str | :obj:`~driver.CUmem_advise` - Managed-memory advice. String aliases (``"set_read_mostly"``, - ``"unset_read_mostly"``, ``"set_preferred_location"``, - ``"unset_preferred_location"``, ``"set_accessed_by"``, - ``"unset_accessed_by"``) and ``CUmem_advise`` enum values are accepted. - location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...] - Target location(s). Required for advice values that consult a - location; ignored (may be ``None``) for ``set_read_mostly``, - ``unset_read_mostly``, and ``unset_preferred_location``. A sequence - must match ``len(targets)``. - options : :class:`AdviseOptions`, optional - Reserved for future per-call flags. ``None`` (default) and - ``AdviseOptions()`` are equivalent. + Used by :class:`ManagedBuffer` property setters. Not part of the + public API. 
""" - if options is not None and not isinstance(options, AdviseOptions): - raise TypeError( - "advise options must be an AdviseOptions instance or None, " - f"got {type(options).__name__}" - ) cdef str advice_name cdef object advice_value advice_name, advice_value = _normalize_managed_advice(advice) cdef bint allow_none = advice_name in _MANAGED_ADVICE_IGNORE_LOCATION cdef frozenset allowed_kinds = _MANAGED_ADVICE_ALLOWED_LOCTYPES[advice_name] - - cdef tuple bufs = _coerce_buffer_targets(targets, "advise") - cdef Py_ssize_t n = len(bufs) - cdef tuple locs = _broadcast_locations(location, n, allow_none, "advise") - - cdef Buffer buf - cdef object loc - for buf in bufs: - _require_managed_buffer(buf, "advise") - for loc in locs: - if loc is not None and loc.kind not in allowed_kinds: - raise ValueError( - f"advise '{advice_name}' does not support location_type='{loc.kind}'" - ) - - cdef Py_ssize_t i - for i in range(n): - _do_single_advise(bufs[i], advice_value, locs[i], allow_none) + cdef object loc = _coerce_location(location, allow_none=allow_none) + if loc is not None and loc.kind not in allowed_kinds: + raise ValueError( + f"advise '{advice_name}' does not support location_type='{loc.kind}'" + ) + _require_managed_buffer(buf, "advise") + _do_single_advise(buf, advice_value, loc, allow_none) cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint allow_none): @@ -302,52 +267,48 @@ cdef void _do_single_advise(Buffer buf, object advice_value, object loc, bint al HANDLE_RETURN(cydriver.cuMemAdvise(cu_ptr, nbytes, advice_enum, dev_int)) -def prefetch( - targets, - location=None, - *, - options=None, - stream, -): - """Prefetch one or more managed-memory ranges to a target location. +def prefetch_batch(buffers, locations, *, stream): + """Prefetch a batch of managed-memory ranges to target locations. + + Requires CUDA 13+. For a single buffer, use + :meth:`ManagedBuffer.prefetch` instead. Parameters ---------- - targets : :class:`Buffer` | Sequence[:class:`Buffer`] - One or more managed allocations to operate on. - location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...] - Target location(s). A single location applies to all targets; a - sequence must match ``len(targets)``. ``int`` values are coerced - to a location (``-1`` maps to host, ``>=0`` to that device ordinal). - options : :class:`PrefetchOptions`, optional - Reserved for future per-call flags. ``None`` (default) and - ``PrefetchOptions()`` are equivalent. + buffers : Sequence[:class:`Buffer`] + Two or more managed allocations to operate on. + locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...] + Target location(s). A single location applies to all buffers; a + sequence must match ``len(buffers)``. stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` Stream for the asynchronous prefetch (keyword-only). Raises ------ NotImplementedError - If ``len(targets) > 1`` on a CUDA 12 build of ``cuda.core``. + On a CUDA 12 build of ``cuda.core``. 
""" - if options is not None and not isinstance(options, PrefetchOptions): - raise TypeError( - "prefetch options must be a PrefetchOptions instance or None, " - f"got {type(options).__name__}" - ) - cdef tuple bufs = _coerce_buffer_targets(targets, "prefetch") + cdef tuple bufs = _coerce_batch_buffers(buffers, "prefetch_batch") cdef Py_ssize_t n = len(bufs) - cdef tuple locs = _broadcast_locations(location, n, False, "prefetch") + cdef tuple locs = _broadcast_locations(locations, n, False, "prefetch_batch") cdef Stream s = Stream_accept(stream) cdef Buffer buf for buf in bufs: - _require_managed_buffer(buf, "prefetch") + _require_managed_buffer(buf, "prefetch_batch") + + _do_batch_prefetch(bufs, locs, s) - if n == 1: - _do_single_prefetch(bufs[0], locs[0], s) - else: - _do_batch_prefetch(bufs, locs, s) + +def _do_single_prefetch_py(Buffer buf, location, stream): + """Internal: single-buffer prefetch for ManagedBuffer.prefetch(). + + Uses cuMemPrefetchAsync (works on CUDA 12 and 13). + """ + _require_managed_buffer(buf, "prefetch") + cdef object loc = _coerce_location(location, allow_none=False) + cdef Stream s = Stream_accept(stream) + _do_single_prefetch(buf, loc, s) cdef void _do_single_prefetch(Buffer buf, object loc, Stream s): @@ -396,51 +357,50 @@ cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): ) -def discard_prefetch( - targets, - location=None, - *, - options=None, - stream, -): - """Discard one or more managed-memory ranges and prefetch them to a target location. +def discard_prefetch_batch(buffers, locations, *, stream): + """Discard a batch of managed-memory ranges and prefetch them to target locations. + + Requires CUDA 13+. For a single buffer, use + :meth:`ManagedBuffer.discard_prefetch` instead. Parameters ---------- - targets : :class:`Buffer` | Sequence[:class:`Buffer`] - One or more managed allocations to discard and re-prefetch. - location : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...] - Target location(s). A single location applies to all targets; - a sequence must match ``len(targets)``. - options : :class:`DiscardPrefetchOptions`, optional - Reserved for future per-call flags. ``None`` (default) and - ``DiscardPrefetchOptions()`` are equivalent. + buffers : Sequence[:class:`Buffer`] + Two or more managed allocations to discard and re-prefetch. + locations : :class:`~cuda.core.Device` | :class:`~cuda.core.Host` | Sequence[...] + Target location(s). A single location applies to all buffers; + a sequence must match ``len(buffers)``. stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` Stream for the asynchronous operation (keyword-only). Raises ------ NotImplementedError - On a CUDA 12 build of ``cuda.core``. Discard-and-prefetch - requires CUDA 13+. + On a CUDA 12 build of ``cuda.core``. 
""" - if options is not None and not isinstance(options, DiscardPrefetchOptions): - raise TypeError( - "discard_prefetch options must be a DiscardPrefetchOptions " - f"instance or None, got {type(options).__name__}" - ) - cdef tuple bufs = _coerce_buffer_targets(targets, "discard_prefetch") + cdef tuple bufs = _coerce_batch_buffers(buffers, "discard_prefetch_batch") cdef Py_ssize_t n = len(bufs) - cdef tuple locs = _broadcast_locations(location, n, False, "discard_prefetch") + cdef tuple locs = _broadcast_locations(locations, n, False, "discard_prefetch_batch") cdef Stream s = Stream_accept(stream) cdef Buffer buf for buf in bufs: - _require_managed_buffer(buf, "discard_prefetch") + _require_managed_buffer(buf, "discard_prefetch_batch") _do_batch_discard_prefetch(bufs, locs, s) +def _do_single_discard_prefetch_py(Buffer buf, location, stream): + """Internal: single-buffer discard+prefetch for + ManagedBuffer.discard_prefetch().""" + _require_managed_buffer(buf, "discard_prefetch") + cdef object loc = _coerce_location(location, allow_none=False) + cdef Stream s = Stream_accept(stream) + cdef tuple bufs = (buf,) + cdef tuple locs = (loc,) + _do_batch_discard_prefetch(bufs, locs, s) + + cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s): IF CUDA_CORE_BUILD_MAJOR >= 13: cdef Py_ssize_t n = len(bufs) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_options.py b/cuda_core/cuda/core/_memory/_managed_memory_options.py deleted file mode 100644 index 68754f2731b..00000000000 --- a/cuda_core/cuda/core/_memory/_managed_memory_options.py +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from dataclasses import dataclass - - -@dataclass(frozen=True) -class AdviseOptions: - """Per-call options for :func:`cuda.core.utils.advise`. - - Reserved for future advise flags. Currently has no fields; pass - ``AdviseOptions()`` or ``None`` to use driver defaults. - """ - - -@dataclass(frozen=True) -class PrefetchOptions: - """Per-call options for :func:`cuda.core.utils.prefetch`. - - Reserved for future prefetch flags. Currently has no fields; pass - ``PrefetchOptions()`` or ``None`` to use driver defaults. - """ - - -@dataclass(frozen=True) -class DiscardOptions: - """Per-call options for :func:`cuda.core.utils.discard`. - - Reserved for future discard flags. Currently has no fields; pass - ``DiscardOptions()`` or ``None`` to use driver defaults. - """ - - -@dataclass(frozen=True) -class DiscardPrefetchOptions: - """Per-call options for :func:`cuda.core.utils.discard_prefetch`. - - Reserved for future discard-and-prefetch flags. Currently has no - fields; pass ``DiscardPrefetchOptions()`` or ``None`` to use driver - defaults. 
- """ diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py index a252e3d935f..200e5241b50 100644 --- a/cuda_core/cuda/core/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -2,24 +2,17 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core._memory._managed_memory_ops import advise, discard, discard_prefetch, prefetch -from cuda.core._memory._managed_memory_options import ( - AdviseOptions, - DiscardOptions, - DiscardPrefetchOptions, - PrefetchOptions, +from cuda.core._memory._managed_memory_ops import ( + discard_batch, + discard_prefetch_batch, + prefetch_batch, ) from cuda.core._memoryview import StridedMemoryView, args_viewable_as_strided_memory __all__ = [ - "AdviseOptions", - "DiscardOptions", - "DiscardPrefetchOptions", - "PrefetchOptions", "StridedMemoryView", - "advise", "args_viewable_as_strided_memory", - "discard", - "discard_prefetch", - "prefetch", + "discard_batch", + "discard_prefetch_batch", + "prefetch_batch", ] diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index d3d2119196a..9f52ca98074 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -37,17 +37,18 @@ New features Use :meth:`ManagedBuffer.from_handle` to wrap an existing managed-memory pointer. -- Added managed-memory range operations to :mod:`cuda.core.utils`: - :func:`~utils.advise`, :func:`~utils.prefetch`, :func:`~utils.discard`, - and :func:`~utils.discard_prefetch`. Each operation accepts either a - single managed :class:`Buffer` or a sequence; with cuda.bindings 12.8+ - the N>1 case dispatches to the corresponding ``cuMem*BatchAsync`` driver - entry point, addressing the managed-memory portion of #1333. Locations - are expressed via :class:`Device` or :class:`Host`. - Per-call options use frozen dataclasses - (:class:`~utils.AdviseOptions`, :class:`~utils.PrefetchOptions`, - :class:`~utils.DiscardOptions`, :class:`~utils.DiscardPrefetchOptions`) - reserved for future flags. +- Added batched managed-memory range operations to :mod:`cuda.core.utils` + (CUDA 13+): :func:`~utils.prefetch_batch`, :func:`~utils.discard_batch`, + and :func:`~utils.discard_prefetch_batch`. Each takes a sequence of + managed :class:`Buffer` instances and dispatches to the corresponding + ``cuMem*BatchAsync`` driver entry point, addressing the managed-memory + portion of #1333. Single-buffer operations are exposed as instance + methods on :class:`ManagedBuffer` (:meth:`~ManagedBuffer.prefetch`, + :meth:`~ManagedBuffer.discard`, :meth:`~ManagedBuffer.discard_prefetch`) + and as property setters (:attr:`~ManagedBuffer.read_mostly`, + :attr:`~ManagedBuffer.preferred_location`, + :attr:`~ManagedBuffer.accessed_by`). Locations are expressed via + :class:`Device` or :class:`Host`. 
Fixes and enhancements diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index 16f6be92989..176438399c4 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -8,7 +8,7 @@ create_managed_memory_resource_or_skip, skip_if_managed_memory_unsupported, ) -from cuda.core import Device, Host, ManagedBuffer, utils +from cuda.core import Device, Host, ManagedBuffer from cuda.core._utils.cuda_utils import handle_return try: @@ -68,7 +68,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - utils.prefetch(buffer, Host(), stream=stream) + buffer.prefetch(Host(), stream=stream) stream.sync() last_location = _get_int_mem_range_attr( buffer, @@ -76,7 +76,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): ) assert last_location == _HOST_LOCATION_ID - utils.prefetch(buffer, device, stream=stream) + buffer.prefetch(device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( buffer, @@ -92,9 +92,10 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): _skip_if_managed_location_ops_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + buffer = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) - utils.advise(buffer, "set_read_mostly") + buffer.read_mostly = True assert ( _get_int_mem_range_attr( buffer, @@ -103,14 +104,14 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): == _READ_MOSTLY_ENABLED ) - utils.advise(buffer, "set_preferred_location", Host()) + buffer.preferred_location = Host() preferred_location = _get_int_mem_range_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, ) assert preferred_location == _HOST_LOCATION_ID - buffer.close() + plain.close() def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda): @@ -118,10 +119,11 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda _skip_if_managed_location_ops_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + buffer = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) stream = device.create_stream() - utils.prefetch(buffer, device, stream=stream) + buffer.prefetch(device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( @@ -130,7 +132,7 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda ) assert last_location == device.device_id - buffer.close() + plain.close() def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_cuda): @@ -143,10 +145,10 @@ def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_ buffer = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - utils.prefetch(buffer, Host(), stream=stream) + buffer.prefetch(Host(), stream=stream) stream.sync() - utils.discard_prefetch(buffer, device, stream=stream) + buffer.discard_prefetch(device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( @@ -163,13 +165,14 @@ def 
test_managed_memory_discard_prefetch_supports_external_managed_allocations(i _skip_if_managed_discard_prefetch_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + buffer = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) stream = device.create_stream() - utils.prefetch(buffer, Host(), stream=stream) + buffer.prefetch(Host(), stream=stream) stream.sync() - utils.discard_prefetch(buffer, device, stream=stream) + buffer.discard_prefetch(device, stream=stream) stream.sync() last_location = _get_int_mem_range_attr( @@ -178,24 +181,28 @@ def test_managed_memory_discard_prefetch_supports_external_managed_allocations(i ) assert last_location == device.device_id - buffer.close() + plain.close() def test_managed_memory_operations_reject_non_managed_allocations(init_cuda): + """Wrapping a non-managed pointer in ManagedBuffer raises at op time.""" device = Device() device.set_current() - buffer = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + plain = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + # Wrapping a device-only pointer as ManagedBuffer is allowed at construction + # (no driver query yet); the runtime managed-ness check fires at op time. + buffer = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) stream = device.create_stream() with pytest.raises(ValueError, match="managed-memory allocation"): - utils.advise(buffer, "set_read_mostly") + buffer.read_mostly = True with pytest.raises(ValueError, match="managed-memory allocation"): - utils.prefetch(buffer, device, stream=stream) + buffer.prefetch(device, stream=stream) with pytest.raises(ValueError, match="managed-memory allocation"): - utils.discard_prefetch(buffer, device, stream=stream) + buffer.discard_prefetch(device, stream=stream) - buffer.close() + plain.close() def test_managed_memory_operation_validation(init_cuda): @@ -208,10 +215,10 @@ def test_managed_memory_operation_validation(init_cuda): stream = device.create_stream() with pytest.raises(ValueError, match="location is required"): - utils.prefetch(buffer, stream=stream) + buffer.prefetch(None, stream=stream) with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - utils.advise(buffer, "set_accessed_by", Host(numa_id=_INVALID_HOST_DEVICE_ORDINAL)) + buffer.accessed_by.add(Host(numa_id=_INVALID_HOST_DEVICE_ORDINAL)) buffer.close() @@ -222,69 +229,31 @@ def test_managed_memory_advise_location_validation(init_cuda): _skip_if_managed_location_ops_unsupported(device) device.set_current() - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + buffer = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) - # set_read_mostly works without a location (location is ignored) - utils.advise(buffer, "set_read_mostly") + # read_mostly works without a location + buffer.read_mostly = True - # set_preferred_location requires a location; Device works - utils.advise(buffer, "set_preferred_location", device) + # preferred_location accepts Device + buffer.preferred_location = device - # set_preferred_location with host location - utils.advise(buffer, "set_preferred_location", Host()) + # preferred_location accepts Host() + buffer.preferred_location = Host() - # set_accessed_by with host_numa raises 
ValueError (INVALID per CUDA docs) + # accessed_by rejects host_numa with pytest.raises(ValueError, match="does not support location_type='host_numa'"): - utils.advise(buffer, "set_accessed_by", Host(numa_id=0)) + buffer.accessed_by.add(Host(numa_id=0)) - # set_accessed_by with host_numa_current also raises ValueError + # accessed_by rejects host_numa_current with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"): - utils.advise(buffer, "set_accessed_by", Host.numa_current()) - - # Both Host and Device locations are accepted - utils.advise(buffer, "set_preferred_location", Host()) - utils.advise(buffer, "set_preferred_location", Device(0)) - - buffer.close() - + buffer.accessed_by.add(Host.numa_current()) -def test_managed_memory_advise_accepts_enum_value(init_cuda): - """advise() accepts CUmem_advise enum values directly, not just string aliases.""" - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + # Both Host and Device are accepted + buffer.preferred_location = Host() + buffer.preferred_location = Device(0) - advice_enum = driver.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY - utils.advise(buffer, advice_enum) - - assert ( - _get_int_mem_range_attr( - buffer, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - ) - == _READ_MOSTLY_ENABLED - ) - - buffer.close() - - -def test_managed_memory_advise_invalid_advice_values(init_cuda): - """advise() rejects invalid advice strings and wrong types.""" - device = Device() - _skip_if_managed_allocation_unsupported(device) - device.set_current() - - buffer = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - - with pytest.raises(ValueError, match="advice must be one of"): - utils.advise(buffer, "not_a_real_advice") - - with pytest.raises(TypeError, match="advice must be"): - utils.advise(buffer, 42) - - buffer.close() + plain.close() class TestHost: @@ -381,28 +350,11 @@ def test_bad_type(self): _coerce_location("device") -class TestPrefetch: - def test_single_with_host_location(self, init_cuda): - from cuda.core.utils import prefetch +class TestPrefetchBatch: + """Tests for utils.prefetch_batch (batched-only free function).""" - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - prefetch(buf, Host(), stream=stream) - stream.sync() - last = _get_int_mem_range_attr( - buf, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last == _HOST_LOCATION_ID - buf.close() - - def test_batched_same_location(self, init_cuda): - from cuda.core.utils import prefetch + def test_same_location(self, init_cuda): + from cuda.core.utils import prefetch_batch device = Device() skip_if_managed_memory_unsupported(device) @@ -413,7 +365,7 @@ def test_batched_same_location(self, init_cuda): bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] stream = device.create_stream() - prefetch(bufs, device, stream=stream) + prefetch_batch(bufs, device, stream=stream) stream.sync() for buf in bufs: @@ -424,8 +376,8 @@ def test_batched_same_location(self, init_cuda): assert last == device.device_id buf.close() - def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.utils import prefetch + def test_per_buffer_location(self, init_cuda): 
+ from cuda.core.utils import prefetch_batch device = Device() skip_if_managed_memory_unsupported(device) @@ -436,7 +388,7 @@ def test_batched_per_buffer_location(self, init_cuda): bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] stream = device.create_stream() - prefetch(bufs, [Host(), device], stream=stream) + prefetch_batch(bufs, [Host(), device], stream=stream) stream.sync() last0 = _get_int_mem_range_attr( @@ -453,7 +405,7 @@ def test_batched_per_buffer_location(self, init_cuda): buf.close() def test_length_mismatch(self, init_cuda): - from cuda.core.utils import prefetch + from cuda.core.utils import prefetch_batch device = Device() skip_if_managed_memory_unsupported(device) @@ -463,23 +415,12 @@ def test_length_mismatch(self, init_cuda): stream = device.create_stream() with pytest.raises(ValueError, match="length"): - prefetch(bufs, [Host()], stream=stream) + prefetch_batch(bufs, [Host()], stream=stream) for buf in bufs: buf.close() - def test_rejects_non_managed(self, init_cuda): - from cuda.core.utils import prefetch - - device = Device() - device.set_current() - buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - with pytest.raises(ValueError, match="managed-memory"): - prefetch(buf, Host(), stream=stream) - buf.close() - - def test_location_required(self, init_cuda): - from cuda.core.utils import prefetch + def test_rejects_single_buffer(self, init_cuda): + from cuda.core.utils import prefetch_batch device = Device() skip_if_managed_memory_unsupported(device) @@ -487,44 +428,16 @@ def test_location_required(self, init_cuda): mr = create_managed_memory_resource_or_skip() buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(ValueError, match="location is required"): - prefetch(buf, None, stream=stream) + with pytest.raises(TypeError, match="sequence of Buffers"): + prefetch_batch(buf, Host(), stream=stream) buf.close() - def test_options_must_be_options_dataclass_or_none(self, init_cuda): - from cuda.core.utils import prefetch - - device = Device() - skip_if_managed_memory_unsupported(device) - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - with pytest.raises(TypeError, match="must be an? 
.*Options instance or None"): - prefetch(buf, Host(), options={}, stream=stream) - buf.close() - - -class TestDiscard: - def test_single_buffer(self, init_cuda): - from cuda.core.utils import discard, prefetch - device = Device() - skip_if_managed_memory_unsupported(device) - if not hasattr(driver, "cuMemDiscardBatchAsync"): - pytest.skip("cuMemDiscardBatchAsync unavailable") - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - prefetch(buf, device, stream=stream) - stream.sync() - discard(buf, stream=stream) - stream.sync() - buf.close() +class TestDiscardBatch: + """Tests for utils.discard_batch (batched-only free function).""" - def test_batched(self, init_cuda): - from cuda.core.utils import discard, prefetch + def test_basic(self, init_cuda): + from cuda.core.utils import discard_batch, prefetch_batch device = Device() skip_if_managed_memory_unsupported(device) @@ -534,26 +447,15 @@ def test_batched(self, init_cuda): mr = create_managed_memory_resource_or_skip() bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] stream = device.create_stream() - prefetch(bufs, device, stream=stream) + prefetch_batch(bufs, device, stream=stream) stream.sync() - discard(bufs, stream=stream) + discard_batch(bufs, stream=stream) stream.sync() for buf in bufs: buf.close() - def test_rejects_non_managed(self, init_cuda): - from cuda.core.utils import discard - - device = Device() - device.set_current() - buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - with pytest.raises(ValueError, match="managed-memory"): - discard(buf, stream=stream) - buf.close() - - def test_options_must_be_options_dataclass_or_none(self, init_cuda): - from cuda.core.utils import discard + def test_rejects_single_buffer(self, init_cuda): + from cuda.core.utils import discard_batch device = Device() skip_if_managed_memory_unsupported(device) @@ -561,38 +463,16 @@ def test_options_must_be_options_dataclass_or_none(self, init_cuda): mr = create_managed_memory_resource_or_skip() buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(TypeError, match="must be an? 
.*Options instance or None"): - discard(buf, options={}, stream=stream) + with pytest.raises(TypeError, match="sequence of Buffers"): + discard_batch(buf, stream=stream) buf.close() -class TestDiscardPrefetch: - def test_single_buffer(self, init_cuda): - from cuda.core.utils import discard_prefetch, prefetch +class TestDiscardPrefetchBatch: + """Tests for utils.discard_prefetch_batch (batched-only free function).""" - device = Device() - skip_if_managed_memory_unsupported(device) - if not hasattr(driver, "cuMemDiscardAndPrefetchBatchAsync"): - pytest.skip("cuMemDiscardAndPrefetchBatchAsync unavailable") - device.set_current() - mr = create_managed_memory_resource_or_skip() - buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) - stream = device.create_stream() - - prefetch(buf, Host(), stream=stream) - stream.sync() - discard_prefetch(buf, device, stream=stream) - stream.sync() - - last = _get_int_mem_range_attr( - buf, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, - ) - assert last == device.device_id - buf.close() - - def test_batched_same_location(self, init_cuda): - from cuda.core.utils import discard_prefetch, prefetch + def test_same_location(self, init_cuda): + from cuda.core.utils import discard_prefetch_batch, prefetch_batch device = Device() skip_if_managed_memory_unsupported(device) @@ -602,9 +482,9 @@ def test_batched_same_location(self, init_cuda): mr = create_managed_memory_resource_or_skip() bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] stream = device.create_stream() - prefetch(bufs, Host(), stream=stream) + prefetch_batch(bufs, Host(), stream=stream) stream.sync() - discard_prefetch(bufs, device, stream=stream) + discard_prefetch_batch(bufs, device, stream=stream) stream.sync() for buf in bufs: last = _get_int_mem_range_attr( @@ -615,7 +495,7 @@ def test_batched_same_location(self, init_cuda): buf.close() def test_length_mismatch(self, init_cuda): - from cuda.core.utils import discard_prefetch + from cuda.core.utils import discard_prefetch_batch device = Device() skip_if_managed_memory_unsupported(device) @@ -624,69 +504,21 @@ def test_length_mismatch(self, init_cuda): bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] stream = device.create_stream() with pytest.raises(ValueError, match="length"): - discard_prefetch(bufs, [Host()], stream=stream) + discard_prefetch_batch(bufs, [Host()], stream=stream) for buf in bufs: buf.close() - def test_rejects_non_managed(self, init_cuda): - from cuda.core.utils import discard_prefetch + def test_rejects_single_buffer(self, init_cuda): + from cuda.core.utils import discard_prefetch_batch device = Device() + skip_if_managed_memory_unsupported(device) device.set_current() - buf = DummyDeviceMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + mr = create_managed_memory_resource_or_skip() + buf = mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) stream = device.create_stream() - with pytest.raises(ValueError, match="managed-memory"): - discard_prefetch(buf, Host(), stream=stream) - buf.close() - - -class TestAdvise: - def test_batched_same_advice(self, init_cuda): - from cuda.core.utils import advise - - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - mr = DummyUnifiedMemoryResource(device) - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] - advise(bufs, "set_read_mostly") - # Query all attributes BEFORE closing any buffer. 
On CUDA 12, freeing - # a managed allocation can clear read-mostly advice on neighboring - # ranges; close-then-query in a single loop falsely flags the later - # iterations as having lost the advice. - results = [ - _get_int_mem_range_attr( - buf, - driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - ) - for buf in bufs - ] - for buf in bufs: - buf.close() - for r in results: - assert r == _READ_MOSTLY_ENABLED - - def test_batched_per_buffer_location(self, init_cuda): - from cuda.core.utils import advise - - device = Device() - _skip_if_managed_location_ops_unsupported(device) - device.set_current() - mr = DummyUnifiedMemoryResource(device) - bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] - advise(bufs, "set_preferred_location", [Host(), device]) - for buf in bufs: - buf.close() - - def test_options_must_be_options_dataclass_or_none(self, init_cuda): - from cuda.core.utils import advise - - device = Device() - _skip_if_managed_allocation_unsupported(device) - device.set_current() - buf = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) - with pytest.raises(TypeError, match="must be an? .*Options instance or None"): - advise(buf, "set_read_mostly", options={}) + with pytest.raises(TypeError, match="sequence of Buffers"): + discard_prefetch_batch(buf, Host(), stream=stream) buf.close() From 36012fd3ae8fefca20d9ccb2cdbd853a6ff84ed2 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 1 May 2026 09:34:40 -0700 Subject: [PATCH 52/68] refactor(cuda.core): build advise reverse-lookup eagerly at module load (N4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Leo's review on PR #1775 (_managed_memory_ops.pyx:23), drop the lazy-init plumbing for the enum→alias reverse lookup table. The forward table _MANAGED_ADVICE_ALIASES has six entries; building the inverse at module load via a dict comprehension is the same data without the mutable-global pattern, the `if None` check, or the `global` declaration inside the function body. Forward lookup table (_MANAGED_ADVICE_ALIASES) is preserved as the source of truth — explicit alias→CUDA-name mapping, grep-friendly, no implicit naming-convention coupling. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index cc45273641c..b3f53c66f59 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -47,8 +47,12 @@ cdef dict _MANAGED_ADVICE_ALLOWED_LOCTYPES = { "unset_accessed_by": _DEVICE_HOST_ONLY, } -# Lazily cached: maps driver.CUmem_advise enum value → string alias. -cdef dict _ADVICE_ENUM_TO_ALIAS = None +# Reverse lookup: enum value → alias. Built once at module load. 
+cdef dict _ADVICE_ENUM_TO_ALIAS = { + getattr(driver.CUmem_advise, attr_name): alias + for alias, attr_name in _MANAGED_ADVICE_ALIASES.items() + if hasattr(driver.CUmem_advise, attr_name) +} cdef tuple _normalize_managed_advice(object advice): @@ -65,13 +69,6 @@ cdef tuple _normalize_managed_advice(object advice): return alias, getattr(driver.CUmem_advise, attr_name) if isinstance(advice, driver.CUmem_advise): - global _ADVICE_ENUM_TO_ALIAS - if _ADVICE_ENUM_TO_ALIAS is None: - _ADVICE_ENUM_TO_ALIAS = {} - for alias, attr_name in _MANAGED_ADVICE_ALIASES.items(): - enum_val = getattr(driver.CUmem_advise, attr_name, None) - if enum_val is not None: - _ADVICE_ENUM_TO_ALIAS[enum_val] = alias alias = _ADVICE_ENUM_TO_ALIAS.get(advice) if alias is None: raise ValueError(f"Unsupported advice value: {advice!r}") From 067fb15a11313d37930ac1758c3d49f437efc387 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 1 May 2026 09:39:40 -0700 Subject: [PATCH 53/68] refactor(cuda.core): factor shared body of _do_batch_{prefetch,discard_prefetch} (N2) Per Leo's review on PR #1775 (_managed_memory_ops.pyx:425), the two batched-with-locations helpers were byte-for-byte identical except for the driver function being called. Both: - declare the same four std::vectors (ptrs, sizes, loc_arr, loc_indices) - resize and fill them in the same loop - release the GIL and call cuMem{Prefetch,DiscardAndPrefetch}BatchAsync with the same argument shape Introduce a function-pointer typedef _BatchPrefetchFn (the two driver calls share signature), parameterize the shared body as _do_batch_prefetch_op, and have the two callers pass the appropriate driver function. Both the typedef and the helper live inside the IF CUDA_CORE_BUILD_MAJOR >= 13 block since they reference cu13-only types. Net: -28 lines duplication, +25 for the shared helper. No behavior change; tests unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 46 ++++++++----------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index b3f53c66f59..8e66f706862 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -322,8 +322,18 @@ cdef void _do_single_prefetch(Buffer buf, object loc, Stream s): HANDLE_RETURN(cydriver.cuMemPrefetchAsync(cu_ptr, nbytes, dev_int, hstream)) -cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): - IF CUDA_CORE_BUILD_MAJOR >= 13: +IF CUDA_CORE_BUILD_MAJOR >= 13: + # Function-pointer type for cuMemPrefetchBatchAsync / + # cuMemDiscardAndPrefetchBatchAsync; both have identical signatures. 
+ ctypedef cydriver.CUresult (*_BatchPrefetchFn)( + cydriver.CUdeviceptr*, size_t*, size_t, + cydriver.CUmemLocation*, size_t*, size_t, + unsigned long long, cydriver.CUstream, + ) except ?cydriver.CUDA_ERROR_NOT_FOUND nogil + + + cdef void _do_batch_prefetch_op(tuple bufs, tuple locs, Stream s, _BatchPrefetchFn fn): + """Shared body for batched prefetch / discard-and-prefetch.""" cdef Py_ssize_t n = len(bufs) cdef cydriver.CUstream hstream = as_cu(s._h_stream) cdef vector[cydriver.CUdeviceptr] ptrs @@ -343,11 +353,16 @@ cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): loc_arr[i] = _to_cumemlocation(locs[i]) loc_indices[i] = i with nogil: - HANDLE_RETURN(cydriver.cuMemPrefetchBatchAsync( + HANDLE_RETURN(fn( ptrs.data(), sizes.data(), n, loc_arr.data(), loc_indices.data(), n, 0, hstream, )) + + +cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): + IF CUDA_CORE_BUILD_MAJOR >= 13: + _do_batch_prefetch_op(bufs, locs, s, cydriver.cuMemPrefetchBatchAsync) ELSE: raise NotImplementedError( "batched prefetch requires a CUDA 13 build of cuda.core" @@ -400,30 +415,7 @@ def _do_single_discard_prefetch_py(Buffer buf, location, stream): cdef void _do_batch_discard_prefetch(tuple bufs, tuple locs, Stream s): IF CUDA_CORE_BUILD_MAJOR >= 13: - cdef Py_ssize_t n = len(bufs) - cdef cydriver.CUstream hstream = as_cu(s._h_stream) - cdef vector[cydriver.CUdeviceptr] ptrs - cdef vector[size_t] sizes - cdef vector[cydriver.CUmemLocation] loc_arr - cdef vector[size_t] loc_indices - ptrs.resize(n) - sizes.resize(n) - loc_arr.resize(n) - loc_indices.resize(n) - cdef Buffer buf - cdef Py_ssize_t i - for i in range(n): - buf = bufs[i] - ptrs[i] = as_cu(buf._h_ptr) - sizes[i] = buf._size - loc_arr[i] = _to_cumemlocation(locs[i]) - loc_indices[i] = i - with nogil: - HANDLE_RETURN(cydriver.cuMemDiscardAndPrefetchBatchAsync( - ptrs.data(), sizes.data(), n, - loc_arr.data(), loc_indices.data(), n, - 0, hstream, - )) + _do_batch_prefetch_op(bufs, locs, s, cydriver.cuMemDiscardAndPrefetchBatchAsync) ELSE: raise NotImplementedError( "discard_prefetch requires a CUDA 13 build of cuda.core" From a9cd713f34bc8ffef67f882bc0b751a7984121c6 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 1 May 2026 09:51:17 -0700 Subject: [PATCH 54/68] test(cuda.core): reuse production _get_int_attr in managed-memory tests (N6) Per Leo's review on PR #1775 (test_managed_ops.py:28), the test file's _get_mem_range_attr / _get_int_mem_range_attr / the local _MEM_RANGE_ATTRIBUTE_VALUE_SIZE constant are functionally identical to the production _get_int_attr in _managed_buffer.py. Drop the duplicates and import the production helper. 14 call sites updated. No behavior change. 
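
For reference, the shared query has this shape (reconstructed from the
removed test helpers above; the production _get_int_attr body in
_managed_buffer.py may differ in detail, so treat this as a sketch):

    from cuda.bindings import driver
    from cuda.core._utils.cuda_utils import handle_return

    _INT_ATTR_VALUE_SIZE = 4  # the attribute value is a 32-bit int

    def _get_int_attr(buffer, attribute):
        # Query one integer-valued range attribute over the whole buffer.
        return handle_return(
            driver.cuMemRangeGetAttribute(
                _INT_ATTR_VALUE_SIZE, attribute, buffer.handle, buffer.size
            )
        )
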
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/tests/memory/test_managed_ops.py | 37 ++++++++-------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index 176438399c4..e237ab0fc21 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -9,7 +9,7 @@ skip_if_managed_memory_unsupported, ) from cuda.core import Device, Host, ManagedBuffer -from cuda.core._utils.cuda_utils import handle_return +from cuda.core._memory._managed_buffer import _get_int_attr try: from cuda.bindings import driver @@ -18,20 +18,11 @@ _MANAGED_TEST_ALLOCATION_SIZE = 4096 -_MEM_RANGE_ATTRIBUTE_VALUE_SIZE = 4 _READ_MOSTLY_ENABLED = 1 _HOST_LOCATION_ID = -1 _INVALID_HOST_DEVICE_ORDINAL = 0 -def _get_mem_range_attr(buffer, attribute, data_size): - return handle_return(driver.cuMemRangeGetAttribute(data_size, attribute, buffer.handle, buffer.size)) - - -def _get_int_mem_range_attr(buffer, attribute): - return _get_mem_range_attr(buffer, attribute, _MEM_RANGE_ATTRIBUTE_VALUE_SIZE) - - def _skip_if_managed_allocation_unsupported(device): try: if not device.properties.managed_memory: @@ -70,7 +61,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): buffer.prefetch(Host(), stream=stream) stream.sync() - last_location = _get_int_mem_range_attr( + last_location = _get_int_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) @@ -78,7 +69,7 @@ def test_managed_memory_prefetch_supports_managed_pool_allocations(init_cuda): buffer.prefetch(device, stream=stream) stream.sync() - last_location = _get_int_mem_range_attr( + last_location = _get_int_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) @@ -97,7 +88,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): buffer.read_mostly = True assert ( - _get_int_mem_range_attr( + _get_int_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, ) @@ -105,7 +96,7 @@ def test_managed_memory_advise_supports_external_managed_allocations(init_cuda): ) buffer.preferred_location = Host() - preferred_location = _get_int_mem_range_attr( + preferred_location = _get_int_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, ) @@ -126,7 +117,7 @@ def test_managed_memory_prefetch_supports_external_managed_allocations(init_cuda buffer.prefetch(device, stream=stream) stream.sync() - last_location = _get_int_mem_range_attr( + last_location = _get_int_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) @@ -151,7 +142,7 @@ def test_managed_memory_discard_prefetch_supports_managed_pool_allocations(init_ buffer.discard_prefetch(device, stream=stream) stream.sync() - last_location = _get_int_mem_range_attr( + last_location = _get_int_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) @@ -175,7 +166,7 @@ def test_managed_memory_discard_prefetch_supports_external_managed_allocations(i buffer.discard_prefetch(device, stream=stream) stream.sync() - last_location = _get_int_mem_range_attr( + last_location = _get_int_attr( buffer, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) @@ -369,7 +360,7 @@ def test_same_location(self, init_cuda): stream.sync() for buf in bufs: - last = _get_int_mem_range_attr( + last = _get_int_attr( buf, 
driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) @@ -391,11 +382,11 @@ def test_per_buffer_location(self, init_cuda): prefetch_batch(bufs, [Host(), device], stream=stream) stream.sync() - last0 = _get_int_mem_range_attr( + last0 = _get_int_attr( bufs[0], driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) - last1 = _get_int_mem_range_attr( + last1 = _get_int_attr( bufs[1], driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) @@ -487,7 +478,7 @@ def test_same_location(self, init_cuda): discard_prefetch_batch(bufs, device, stream=stream) stream.sync() for buf in bufs: - last = _get_int_mem_range_attr( + last = _get_int_attr( buf, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) @@ -633,7 +624,7 @@ def test_instance_prefetch(self, init_cuda): try: buf.prefetch(device, stream=stream) stream.sync() - last = _get_int_mem_range_attr( + last = _get_int_attr( buf, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) @@ -670,7 +661,7 @@ def test_instance_discard_prefetch(self, init_cuda): stream.sync() buf.discard_prefetch(device, stream=stream) stream.sync() - last = _get_int_mem_range_attr( + last = _get_int_attr( buf, driver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, ) From d75a7bd49c3e65bbeeae45104b1f26d306c1963f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 1 May 2026 10:04:41 -0700 Subject: [PATCH 55/68] feat(cuda.core): cu12 fallback for prefetch_batch (N3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Leo's review on PR #1775 (_managed_memory_ops.pyx:228), raising NotImplementedError on cu12 forces users to write their own loop. The CUDA driver semantics for cuMemPrefetchBatchAsync are equivalent to per-range cuMemPrefetchAsync calls — just more efficient when batched at the driver level. On cu12 builds (where cuMemPrefetchBatchAsync is not exposed), fall back to a Python-level loop calling cuMemPrefetchAsync per buffer. The single-range path (_do_single_prefetch) already works on cu12 via the IF/ELSE split inside it. Note this fallback applies only to prefetch_batch — discard_batch and discard_prefetch_batch keep the cu12 NotImplementedError because the driver has no single-range cuMemDiscard{,AndPrefetch}Async to fall back to. Test skips for cuMemPrefetchBatchAsync unavailability dropped from TestPrefetchBatch.test_same_location and test_per_buffer_location; the fallback path now runs on cu12 builds too. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_memory_ops.pyx | 19 ++++++++++++------- cuda_core/tests/memory/test_managed_ops.py | 4 ---- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 8e66f706862..ec714746cd3 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -280,10 +280,11 @@ def prefetch_batch(buffers, locations, *, stream): stream : :class:`~_stream.Stream` | :class:`~graph.GraphBuilder` Stream for the asynchronous prefetch (keyword-only). - Raises - ------ - NotImplementedError - On a CUDA 12 build of ``cuda.core``. + Notes + ----- + On a CUDA 12 build, falls back to a Python-level loop calling + ``cuMemPrefetchAsync`` per buffer (no batched driver entry point on + CUDA 12). CUDA 13 builds use ``cuMemPrefetchBatchAsync`` directly. 
""" cdef tuple bufs = _coerce_batch_buffers(buffers, "prefetch_batch") cdef Py_ssize_t n = len(bufs) @@ -364,9 +365,13 @@ cdef void _do_batch_prefetch(tuple bufs, tuple locs, Stream s): IF CUDA_CORE_BUILD_MAJOR >= 13: _do_batch_prefetch_op(bufs, locs, s, cydriver.cuMemPrefetchBatchAsync) ELSE: - raise NotImplementedError( - "batched prefetch requires a CUDA 13 build of cuda.core" - ) + # cu12 has no cuMemPrefetchBatchAsync; loop per-range. + cdef Buffer buf + cdef Py_ssize_t i + cdef Py_ssize_t n = len(bufs) + for i in range(n): + buf = bufs[i] + _do_single_prefetch(buf, locs[i], s) def discard_prefetch_batch(buffers, locations, *, stream): diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index e237ab0fc21..ee501914df7 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -349,8 +349,6 @@ def test_same_location(self, init_cuda): device = Device() skip_if_managed_memory_unsupported(device) - if not hasattr(driver, "cuMemPrefetchBatchAsync"): - pytest.skip("cuMemPrefetchBatchAsync unavailable") device.set_current() mr = create_managed_memory_resource_or_skip() bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(3)] @@ -372,8 +370,6 @@ def test_per_buffer_location(self, init_cuda): device = Device() skip_if_managed_memory_unsupported(device) - if not hasattr(driver, "cuMemPrefetchBatchAsync"): - pytest.skip("cuMemPrefetchBatchAsync unavailable") device.set_current() mr = create_managed_memory_resource_or_skip() bufs = [mr.allocate(_MANAGED_TEST_ALLOCATION_SIZE) for _ in range(2)] From 0af5bd4e6ae5a1d768135d2c5364d2de84b23425 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 1 May 2026 10:12:40 -0700 Subject: [PATCH 56/68] test(cuda.core): cover AccessedBySet read methods (N7) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Leo's review on PR #1775 (test_managed_ops.py:1), add a test for the read side of AccessedBySet: __iter__, __len__, __eq__, __repr__. These are part of the public set-like API (alongside __contains__, add(), discard(), and the setter, which are already covered) but were untested. The cu12 batch fallback path (Leo's other coverage point) is now exercised by TestPrefetchBatch.test_same_location and test_per_buffer_location running on cu12 CI — the cuMemPrefetchBatchAsync skip was dropped in d75a7bd49c when the fallback landed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- cuda_core/tests/memory/test_managed_ops.py | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index ee501914df7..ba686c7724a 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -595,6 +595,33 @@ def test_accessed_by_add_discard(self, init_cuda): finally: plain.close() + def test_accessed_by_read_methods(self, init_cuda): + """Cover __iter__, __len__, __eq__, __repr__ on AccessedBySet.""" + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + try: + buf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) + + # Empty initially + assert len(buf.accessed_by) == 0 + assert list(buf.accessed_by) == [] + assert buf.accessed_by == set() + assert "AccessedBySet" in repr(buf.accessed_by) + + # After add + buf.accessed_by.add(device) + assert len(buf.accessed_by) == 1 + assert list(buf.accessed_by) == [device] + assert buf.accessed_by == {device} + assert buf.accessed_by != frozenset() + + # __eq__ vs another AccessedBySet on the same buffer + assert buf.accessed_by == buf.accessed_by + finally: + plain.close() + def test_accessed_by_set_assignment(self, init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) From b0d1a216e3932468d3801da5d83449465b3f8faf Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 1 May 2026 10:27:14 -0700 Subject: [PATCH 57/68] feat(cuda.core): cu13 NUMA round-trip for ManagedBuffer.preferred_location (N8) Per the self-promised reply on PR #1775's R7 thread, fulfill the Host(numa_id=N) round-trip on CUDA 13 builds. The blocker before was that cuda.bindings's Python-level cuMemRangeGetAttribute wrapper rejects the new CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE / _ID attributes via its allowlist. The workaround: call cydriver.cuMemRangeGetAttribute directly from a new Cython helper _read_preferred_location_v2, bypassing the Python wrapper. The helper queries TYPE then ID, then decodes the (kind, id) pair into Device | Host | Host(numa_id=N) | Host.numa_current() | None. ManagedBuffer.preferred_location getter dispatches to the v2 path on binding_version() >= (13, 0, 0); falls back to the legacy single-int attribute on cu12 (no NUMA info available). Test: - TestManagedBuffer.test_preferred_location_roundtrip already exercises the cu13 v2 path for Device(...) and Host() (no NUMA), which now passes through _read_preferred_location_v2. - New test_preferred_location_roundtrip_host_numa exercises Host(numa_id=0) round-trip; skips on cu12, and also skips on cu13 hardware/drivers where set_preferred_location with HOST_NUMA is not preserved (e.g. single-NUMA test machines). ManagedBuffer class docstring updated to reflect the cu12-only limitation note. 
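
Condensed, the round-trip this enables (requires a CUDA 13 build and a
driver/machine that preserves HOST_NUMA preferred locations; `buf` is a
ManagedBuffer):

    buf.preferred_location = Host(numa_id=0)          # setter: full CUmemLocation
    assert buf.preferred_location == Host(numa_id=0)  # getter: v2 TYPE/ID read path
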
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda/core/_memory/_managed_buffer.py | 28 ++++++------- .../cuda/core/_memory/_managed_memory_ops.pyx | 39 +++++++++++++++++++ cuda_core/tests/memory/test_managed_ops.py | 23 +++++++++++ 3 files changed, 77 insertions(+), 13 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index a6649fbaa77..565ad4ac168 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -15,6 +15,7 @@ _do_single_prefetch_py, ) from cuda.core._utils.cuda_utils import driver, handle_return +from cuda.core._utils.version import binding_version if TYPE_CHECKING: from cuda.core._memory._buffer import MemoryResource @@ -119,10 +120,11 @@ class ManagedBuffer(Buffer): Note ---- - The legacy ``cuMemRangeGetAttribute`` query path returns integer - device ordinals, so ``Host(numa_id=...)`` collapses to ``Host()`` - on read-back. Setters preserve full NUMA information when issuing - advice. + On CUDA 13 builds, ``preferred_location`` round-trips full NUMA + information. On CUDA 12 the legacy ``cuMemRangeGetAttribute`` query + path returns integer device ordinals, so ``Host(numa_id=...)`` + collapses to ``Host()`` on read-back. Setters preserve full NUMA + information when issuing advice on both. """ @classmethod @@ -167,16 +169,16 @@ def read_mostly(self, value: bool) -> None: def preferred_location(self) -> Device | Host | None: """Currently applied ``set_preferred_location`` target, or ``None``. - .. note:: - The legacy ``CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION`` carries - only a device ordinal (or ``-1`` for host) and cannot represent - a specific NUMA node. As a result, ``Host(numa_id=N)`` set via - the setter currently round-trips back as ``Host()``. The CUDA 13 - driver added ``..._PREFERRED_LOCATION_TYPE`` / ``..._ID`` for - full ``CUmemLocation`` round-trip, but ``cuda.bindings`` does - not yet expose these via ``cuMemRangeGetAttribute``; once it - does, this getter will be upgraded. + On CUDA 13 builds, fully round-trips ``Host(numa_id=N)``. On CUDA 12 + the legacy attribute carries only a device ordinal (or ``-1`` for + host), so ``Host(numa_id=N)`` set via the setter round-trips back + as ``Host()``. """ + if binding_version() >= (13, 0, 0): + from cuda.core._memory._managed_memory_ops import _read_preferred_location_v2 + + return _read_preferred_location_v2(self) + # CUDA 12 legacy path (no NUMA info available). loc_id = _get_int_attr(self, _ATTR_PREFERRED) if loc_id == -2: return None diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index ec714746cd3..8233783f3ee 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -333,6 +333,45 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: ) except ?cydriver.CUDA_ERROR_NOT_FOUND nogil + def _read_preferred_location_v2(Buffer buf): + """Internal: read preferred_location with full NUMA detail. + + Bypasses cuda.bindings.driver.cuMemRangeGetAttribute (whose + attribute allowlist doesn't yet include the cu13 _TYPE / _ID + attributes) by calling cydriver directly. + + Returns Device | Host | None. 
+ """ + cdef cydriver.CUdeviceptr cu_ptr = as_cu(buf._h_ptr) + cdef size_t nbytes = buf._size + cdef int loc_type = 0 + cdef int loc_id = 0 + with nogil: + HANDLE_RETURN(cydriver.cuMemRangeGetAttribute( + &loc_type, sizeof(int), + cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE, + cu_ptr, nbytes, + )) + HANDLE_RETURN(cydriver.cuMemRangeGetAttribute( + &loc_id, sizeof(int), + cydriver.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID, + cu_ptr, nbytes, + )) + if loc_type == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: + from cuda.core._device import Device + return Device(loc_id) + if loc_type == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST: + from cuda.core._host import Host + return Host() + if loc_type == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA: + from cuda.core._host import Host + return Host(numa_id=loc_id) + if loc_type == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + from cuda.core._host import Host + return Host.numa_current() + return None # CU_MEM_LOCATION_TYPE_INVALID — no preferred location + + cdef void _do_batch_prefetch_op(tuple bufs, tuple locs, Stream s, _BatchPrefetchFn fn): """Shared body for batched prefetch / discard-and-prefetch.""" cdef Py_ssize_t n = len(bufs) diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index ba686c7724a..143e99ea426 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -578,6 +578,29 @@ def test_preferred_location_roundtrip(self, init_cuda): finally: plain.close() + def test_preferred_location_roundtrip_host_numa(self, init_cuda): + """Host(numa_id=N) round-trips correctly on CUDA 13 builds.""" + from cuda.core._utils.version import binding_version + + if binding_version() < (13, 0, 0): + pytest.skip("Host(numa_id=N) round-trip requires CUDA 13 bindings") + device = Device() + _skip_if_managed_location_ops_unsupported(device) + device.set_current() + plain = DummyUnifiedMemoryResource(device).allocate(_MANAGED_TEST_ALLOCATION_SIZE) + try: + buf = ManagedBuffer.from_handle(plain.handle, plain.size, owner=plain) + # An explicit NUMA id round-trips via the cu13 v2 attribute pair. + # NUMA node 0 exists on every multi-NUMA system; on single-NUMA + # systems the driver may collapse to HOST or reject — skip then. + buf.preferred_location = Host(numa_id=0) + got = buf.preferred_location + if got is None or not (isinstance(got, Host) and got.numa_id == 0): + pytest.skip("host_numa preferred_location not supported by this driver / hardware") + assert got == Host(numa_id=0) + finally: + plain.close() + def test_accessed_by_add_discard(self, init_cuda): device = Device() _skip_if_managed_location_ops_unsupported(device) From 4c228eb3b2cc33229ac1238514f4351f6c988a5e Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 1 May 2026 11:34:12 -0700 Subject: [PATCH 58/68] docs(cuda.core): replace stale utils autosummary entries api.rst still listed the single-buffer free functions and *Options dataclasses that were removed under R9/R11 (advise, prefetch, discard, discard_prefetch and their *Options classes). Replace with the actual cuda.core.utils exports: prefetch_batch, discard_batch, discard_prefetch_batch. Drop the now-orphan :template: dataclass.rst line. 
--- cuda_core/docs/source/api.rst | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 20237aaf3a6..f7571f475bf 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -249,17 +249,9 @@ Utility functions :toctree: generated/ args_viewable_as_strided_memory - advise - prefetch - discard - discard_prefetch - - :template: dataclass.rst - - AdviseOptions - PrefetchOptions - DiscardOptions - DiscardPrefetchOptions + prefetch_batch + discard_batch + discard_prefetch_batch :template: autosummary/cyclass.rst From 5743e0585f7d1aae3310b2793ee4665ce420ceca Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 4 May 2026 15:18:49 -0700 Subject: [PATCH 59/68] feat(cuda.core): make Host a singleton class Mirror Device's singleton semantics so Host() is Host() and Host(numa_id=1) is Host(numa_id=1) hold. Host.numa_current() returns its own singleton, distinct from Host(), since it represents a thread-relative location rather than a fixed one. Construction routes through __new__ -> _get_or_create with a double-checked dict + Lock cache keyed on (numa_id, is_numa_current). __eq__ collapses to identity (consistent with the retained __hash__). __reduce__ added so pickled Host instances round-trip back through the singleton cache instead of stranding copies. Resolves PR #1775 review: leofang and Andy-Jost requested Host follow Device as a singleton so users can rely on `is` for identity checks. --- cuda_core/cuda/core/_host.py | 51 ++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/_host.py b/cuda_core/cuda/core/_host.py index b595222fa03..e4fa31f7ab6 100644 --- a/cuda_core/cuda/core/_host.py +++ b/cuda_core/cuda/core/_host.py @@ -3,6 +3,8 @@ from __future__ import annotations +import threading + class Host: """Host (CPU) location for managed-memory operations. @@ -17,15 +19,41 @@ class Host: for managed-memory `prefetch`, `advise`, and `discard_prefetch` targets. Pass either a ``Device`` or a ``Host`` to those operations and to ``ManagedBuffer.preferred_location`` / ``accessed_by``. + + ``Host`` is a singleton class, mirroring :class:`~cuda.core.Device`: + constructor calls with the same arguments return the same instance, + so ``Host() is Host()`` and ``Host(numa_id=1) is Host(numa_id=1)``. + ``Host.numa_current()`` returns its own singleton, distinct from + ``Host()`` because it represents a thread-relative location rather + than a fixed one. """ - __slots__ = ("_is_numa_current", "_numa_id") + __slots__ = ("_is_numa_current", "_numa_id", "__weakref__") - def __init__(self, numa_id: int | None = None) -> None: + # Singleton cache keyed by (numa_id, is_numa_current). 
+ _instances: dict[tuple[int | None, bool], Host] = {} + _instances_lock = threading.Lock() + + def __new__(cls, numa_id: int | None = None) -> Host: if numa_id is not None and (not isinstance(numa_id, int) or numa_id < 0): raise ValueError(f"numa_id must be a non-negative int, got {numa_id!r}") - object.__setattr__(self, "_numa_id", numa_id) - object.__setattr__(self, "_is_numa_current", False) + return cls._get_or_create(numa_id, is_numa_current=False) + + @classmethod + def _get_or_create(cls, numa_id: int | None, is_numa_current: bool) -> Host: + key = (numa_id, is_numa_current) + cache = cls._instances + inst = cache.get(key) + if inst is not None: + return inst + with cls._instances_lock: + inst = cache.get(key) + if inst is None: + inst = object.__new__(cls) + object.__setattr__(inst, "_numa_id", numa_id) + object.__setattr__(inst, "_is_numa_current", is_numa_current) + cache[key] = inst + return inst @property def numa_id(self) -> int | None: @@ -38,9 +66,7 @@ def is_numa_current(self) -> bool: @classmethod def numa_current(cls) -> Host: """Construct a ``Host`` referring to the calling thread's NUMA node.""" - h = cls() - object.__setattr__(h, "_is_numa_current", True) - return h + return cls._get_or_create(None, is_numa_current=True) def __setattr__(self, name: str, value) -> None: raise AttributeError(f"{type(self).__name__} is immutable; cannot set {name!r}") @@ -48,14 +74,23 @@ def __setattr__(self, name: str, value) -> None: def __eq__(self, other: object) -> bool: if not isinstance(other, Host): return NotImplemented - return self._numa_id == other._numa_id and self._is_numa_current == other._is_numa_current + return self is other def __hash__(self) -> int: return hash((Host, self._numa_id, self._is_numa_current)) + def __reduce__(self): + if self._is_numa_current: + return (_reconstruct_numa_current, ()) + return (Host, (self._numa_id,)) + def __repr__(self) -> str: if self.is_numa_current: return "Host.numa_current()" if self.numa_id is None: return "Host()" return f"Host(numa_id={self.numa_id})" + + +def _reconstruct_numa_current() -> Host: + return Host.numa_current() From 71263245cecf904a5bb7c034afdf775f6552cb95 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 4 May 2026 15:28:09 -0700 Subject: [PATCH 60/68] refactor(cuda.core): rename AccessedBySet -> AccessedBySetProxy Align with the graph module's AdjacencySetProxy: rename the class and inherit from collections.abc.MutableSet so the full set interface (remove, pop, clear, |=, &=, -=, ^=, isdisjoint, subset/superset operators, etc.) is filled in automatically from the existing add / discard / __contains__ / __iter__ / __len__ primitives. Add classmethod _from_iterable so binary set operators (&|^) produce plain sets rather than constructing a buffer-less proxy. Tighten add to TypeError on non-Device/Host inputs and discard / __contains__ to silently ignore them, matching MutableSet contracts. The hand-rolled __eq__ (set/frozenset comparison) is dropped: Set ABC's default implementation handles it correctly. Resolves PR #1775 review (Andy-Jost, 2026-05-04): naming consistency with AdjacencySetProxy and full MutableSet conformance. 
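
As a self-contained illustration (a toy class, not project code) of what
the MutableSet ABC derives from the five primitives plus _from_iterable:

    from collections.abc import MutableSet

    class DemoSet(MutableSet):
        """Toy MutableSet: only the five primitives are hand-written."""

        def __init__(self, items=()):
            self._data = set(items)

        @classmethod
        def _from_iterable(cls, it):
            return set(it)  # binary operators (&, |, -, ^) build plain sets

        def __contains__(self, x):
            return x in self._data

        def __iter__(self):
            return iter(self._data)

        def __len__(self):
            return len(self._data)

        def add(self, x):
            self._data.add(x)

        def discard(self, x):
            self._data.discard(x)

    s = DemoSet({1, 2})
    s |= {3}                 # __ior__ comes free, implemented via add()
    s.remove(1)              # remove() comes free, via __contains__/discard()
    assert s == {2, 3}       # __eq__ comes free from the Set ABC
    assert (s & {2}) == {2}  # & builds a plain set through _from_iterable
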
--- .../cuda/core/_memory/_managed_buffer.py | 33 +++++++++++-------- cuda_core/docs/source/api_private.rst | 2 +- cuda_core/tests/memory/test_managed_ops.py | 6 ++-- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index 565ad4ac168..0229de53d43 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -3,6 +3,7 @@ from __future__ import annotations +from collections.abc import MutableSet from typing import TYPE_CHECKING from cuda.core._device import Device @@ -56,7 +57,7 @@ def _query_accessed_by(buf: Buffer) -> list[Device | Host]: return [Host() if v == -1 else Device(v) for v in raw if v != -2] -class AccessedBySet: +class AccessedBySetProxy(MutableSet): """Live driver-backed view of ``set_accessed_by`` advice for a managed buffer. Reads (``__contains__``, ``__iter__``, ``len(...)``) call @@ -76,7 +77,16 @@ class AccessedBySet: def __init__(self, buf: ManagedBuffer): self._buf = buf + # Operators such as &|^ produce a plain set, not another proxy. + @classmethod + def _from_iterable(cls, it): + return set(it) + + # --- abstract methods required by MutableSet --- + def __contains__(self, location) -> bool: + if not isinstance(location, (Device, Host)): + return False return location in _query_accessed_by(self._buf) def __iter__(self): @@ -85,24 +95,21 @@ def __iter__(self): def __len__(self) -> int: return len(_query_accessed_by(self._buf)) - def __eq__(self, other) -> bool: - if isinstance(other, AccessedBySet): - return set(_query_accessed_by(self._buf)) == set(_query_accessed_by(other._buf)) - if isinstance(other, (set, frozenset)): - return set(_query_accessed_by(self._buf)) == other - return NotImplemented - - def __repr__(self) -> str: - return f"AccessedBySet({set(_query_accessed_by(self._buf))!r})" - def add(self, location: Device | Host) -> None: """Apply ``set_accessed_by`` advice for ``location``.""" + if not isinstance(location, (Device, Host)): + raise TypeError(f"expected Device or Host, got {type(location).__name__}") _advise_one(self._buf, _SET_ACCESSED_BY, location) def discard(self, location: Device | Host) -> None: """Apply ``unset_accessed_by`` advice for ``location``.""" + if not isinstance(location, (Device, Host)): + return _advise_one(self._buf, _UNSET_ACCESSED_BY, location) + def __repr__(self) -> str: + return f"AccessedBySetProxy({set(_query_accessed_by(self._buf))!r})" + class ManagedBuffer(Buffer): """Managed (unified) memory buffer with a property-style advice API. 
@@ -194,9 +201,9 @@ def preferred_location(self, value: Device | Host | None) -> None: _advise_one(self, _SET_PREFERRED, value) @property - def accessed_by(self) -> AccessedBySet: + def accessed_by(self) -> AccessedBySetProxy: """Live set-like view of ``set_accessed_by`` locations.""" - return AccessedBySet(self) + return AccessedBySetProxy(self) @accessed_by.setter def accessed_by(self, locations) -> None: diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index cc7408096b5..8448adadf03 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -32,7 +32,7 @@ CUDA runtime _device.DeviceProperties _memory._ipc.IPCAllocationHandle _memory._ipc.IPCBufferDescriptor - _memory._managed_buffer.AccessedBySet + _memory._managed_buffer.AccessedBySetProxy CUDA graphs diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index 143e99ea426..cf8a84b5cb9 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -619,7 +619,7 @@ def test_accessed_by_add_discard(self, init_cuda): plain.close() def test_accessed_by_read_methods(self, init_cuda): - """Cover __iter__, __len__, __eq__, __repr__ on AccessedBySet.""" + """Cover __iter__, __len__, __eq__, __repr__ on AccessedBySetProxy.""" device = Device() _skip_if_managed_location_ops_unsupported(device) device.set_current() @@ -631,7 +631,7 @@ def test_accessed_by_read_methods(self, init_cuda): assert len(buf.accessed_by) == 0 assert list(buf.accessed_by) == [] assert buf.accessed_by == set() - assert "AccessedBySet" in repr(buf.accessed_by) + assert "AccessedBySetProxy" in repr(buf.accessed_by) # After add buf.accessed_by.add(device) @@ -640,7 +640,7 @@ def test_accessed_by_read_methods(self, init_cuda): assert buf.accessed_by == {device} assert buf.accessed_by != frozenset() - # __eq__ vs another AccessedBySet on the same buffer + # __eq__ vs another AccessedBySetProxy on the same buffer assert buf.accessed_by == buf.accessed_by finally: plain.close() From 238cb149460ace86605b3f0b50cca66e02bf0a73 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 4 May 2026 15:31:49 -0700 Subject: [PATCH 61/68] fix(cuda.core): silence ruff lints on Host singleton - Annotate _instances / _instances_lock as ClassVar (RUF012). - Sort __slots__ alphabetically (RUF023, auto-fixed by ruff). --- cuda_core/cuda/core/_host.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/_host.py b/cuda_core/cuda/core/_host.py index e4fa31f7ab6..72309cab439 100644 --- a/cuda_core/cuda/core/_host.py +++ b/cuda_core/cuda/core/_host.py @@ -4,6 +4,7 @@ from __future__ import annotations import threading +from typing import ClassVar class Host: @@ -28,11 +29,11 @@ class Host: than a fixed one. """ - __slots__ = ("_is_numa_current", "_numa_id", "__weakref__") + __slots__ = ("__weakref__", "_is_numa_current", "_numa_id") # Singleton cache keyed by (numa_id, is_numa_current). 
- _instances: dict[tuple[int | None, bool], Host] = {} - _instances_lock = threading.Lock() + _instances: ClassVar[dict[tuple[int | None, bool], Host]] = {} + _instances_lock: ClassVar[threading.Lock] = threading.Lock() def __new__(cls, numa_id: int | None = None) -> Host: if numa_id is not None and (not isinstance(numa_id, int) or numa_id < 0): From d0b6621cc251d1d60e0fd8b114f4e5783847c516 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 5 May 2026 13:32:55 -0700 Subject: [PATCH 62/68] fix(cuda.core): reject bool as Host(numa_id=...) bool is an int subclass, so the previous guard let Host(True) and Host(False) seed the singleton cache under the same keys as Host(1) and Host(0). Whichever call landed first won, leaving repr(Host(1)) potentially showing as Host(numa_id=True). Reject bool explicitly. Addresses rwgk's Low finding on PR #1775. --- cuda_core/cuda/core/_host.py | 4 +++- cuda_core/tests/memory/test_managed_ops.py | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_host.py b/cuda_core/cuda/core/_host.py index 72309cab439..f1697b8288c 100644 --- a/cuda_core/cuda/core/_host.py +++ b/cuda_core/cuda/core/_host.py @@ -36,7 +36,9 @@ class Host: _instances_lock: ClassVar[threading.Lock] = threading.Lock() def __new__(cls, numa_id: int | None = None) -> Host: - if numa_id is not None and (not isinstance(numa_id, int) or numa_id < 0): + if numa_id is not None and ( + isinstance(numa_id, bool) or not isinstance(numa_id, int) or numa_id < 0 + ): raise ValueError(f"numa_id must be a non-negative int, got {numa_id!r}") return cls._get_or_create(numa_id, is_numa_current=False) diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index cf8a84b5cb9..2173889b199 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -267,6 +267,14 @@ def test_invalid_numa_id(self): with pytest.raises(ValueError, match="numa_id must be a non-negative int"): Host(numa_id=-1) + def test_numa_id_rejects_bool(self): + # bool is an int subclass; reject explicitly so Host(True) doesn't + # alias Host(1) (and vice versa) in the singleton cache. + with pytest.raises(ValueError, match="numa_id must be a non-negative int"): + Host(numa_id=True) + with pytest.raises(ValueError, match="numa_id must be a non-negative int"): + Host(numa_id=False) + def test_numa_current_only_via_classmethod(self): # is_numa_current is internal state, only settable via Host.numa_current() with pytest.raises(TypeError): From d0f9c7e085aa69f3a1fe5d782fa418b334a024c8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 5 May 2026 13:33:20 -0700 Subject: [PATCH 63/68] fix(cuda.core): hoist managed-buffer check in _advise_one Move _require_managed_buffer to the first statement of _advise_one so a non-managed buffer is rejected before advice/location parsing, matching the order in _do_single_prefetch_py and _do_single_discard_prefetch_py. This prevents surfacing an advice-validation error when the real problem is the buffer kind. 
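
The principle in miniature (stand-alone toy; names and error text are
illustrative, not the project's):

    def advise_like_op(buf_is_managed: bool, advice: str) -> None:
        if not buf_is_managed:           # fundamental precondition first
            raise TypeError("buffer is not managed")
        if advice != "set_read_mostly":  # parameter validation second
            raise ValueError(f"unknown advice: {advice!r}")

    try:
        advise_like_op(buf_is_managed=False, advice="bogus")
    except TypeError as exc:
        print(exc)  # names the buffer-kind problem, not the typo'd advice
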
--- cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 8233783f3ee..5e994828244 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -228,6 +228,7 @@ def _advise_one(Buffer buf, advice, location): Used by :class:`ManagedBuffer` property setters. Not part of the public API. """ + _require_managed_buffer(buf, "advise") cdef str advice_name cdef object advice_value advice_name, advice_value = _normalize_managed_advice(advice) @@ -238,7 +239,6 @@ def _advise_one(Buffer buf, advice, location): raise ValueError( f"advise '{advice_name}' does not support location_type='{loc.kind}'" ) - _require_managed_buffer(buf, "advise") _do_single_advise(buf, advice_value, loc, allow_none) From 191f29d7974be524c0658cd23792edeae04f4657 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 5 May 2026 13:33:30 -0700 Subject: [PATCH 64/68] fix(cuda.core): clarify CUDA 12 NUMA-host error message Rephrase the RuntimeError raised from _to_legacy_device when a caller passes Host(numa_id=...) or Host.numa_current() on a CUDA 12 build. The new message names the unsupported APIs and points the user at Host() as the working alternative, instead of leaking the internal location_type discriminator. --- cuda_core/cuda/core/_memory/_managed_memory_ops.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx index 5e994828244..e429150c877 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_ops.pyx @@ -154,7 +154,8 @@ ELSE: if kind == "host": return -1 raise RuntimeError( - f"location_type={kind!r} requires a CUDA 13 build of cuda.core" + "use Host() instead — NUMA-aware host locations " + "(Host(numa_id=...), Host.numa_current()) require a CUDA 13 build of cuda.core" ) From bcc056b8d16ad96212b55dce952c0a3437a9d4e2 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 5 May 2026 13:34:15 -0700 Subject: [PATCH 65/68] fix(cuda.core): reject Host(numa_id=...) up-front on CUDA 12 The CUDA 12 cuMemPrefetchAsync / cuMemAdvise ABI takes a plain device ordinal and cannot represent a specific host NUMA node. Previously _coerce_location accepted Host(numa_id=...) and Host.numa_current() on a CUDA 12 build and let the operation fail late inside the Cython layer with RuntimeError, which the public APIs surfaced as a confusing error from deep in the stack. Reject NUMA-host kinds at the call boundary in _coerce_location with a TypeError that names the unsupported APIs and points at Host() as the working alternative. Update the ManagedBuffer docstring to match the new contract, and broaden two host_numa-rejection test asserts to accept either the CUDA 13 kind-allowed ValueError or the CUDA 12 boundary TypeError. Addresses rwgk's Medium finding on PR #1775. 
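
The resulting CUDA 12 contract, sketched from the caller's side (`buf`
is a ManagedBuffer, `stream` a Stream):

    buf.prefetch(Host(), stream=stream)  # fine: maps to the legacy host ordinal
    try:
        buf.prefetch(Host(numa_id=2), stream=stream)
    except TypeError:
        pass  # rejected at the boundary, before any driver call
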
--- .../cuda/core/_memory/_managed_buffer.py | 8 ++-- .../cuda/core/_memory/_managed_location.py | 38 ++++++++++++++++--- cuda_core/tests/memory/test_managed_ops.py | 21 +++++++--- 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index 0229de53d43..51b8bea3f45 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -128,10 +128,10 @@ class ManagedBuffer(Buffer): Note ---- On CUDA 13 builds, ``preferred_location`` round-trips full NUMA - information. On CUDA 12 the legacy ``cuMemRangeGetAttribute`` query - path returns integer device ordinals, so ``Host(numa_id=...)`` - collapses to ``Host()`` on read-back. Setters preserve full NUMA - information when issuing advice on both. + information. On CUDA 12 builds, ``Host(numa_id=...)`` and + ``Host.numa_current()`` are rejected with ``TypeError`` at the call + boundary — only ``Device(...)`` and the generic ``Host()`` are + accepted. Use ``Host()`` to target the host on CUDA 12. """ @classmethod diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index 4dae76f7479..be871e38e81 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -23,26 +23,52 @@ class _LocSpec: id: int = 0 +def _reject_numa_host_on_cuda12(spec: _LocSpec) -> None: + """Reject NUMA-host kinds on CUDA 12 builds at the public boundary. + + The CUDA 12 ``cuMemPrefetchAsync`` / ``cuMemAdvise`` ABI takes a + plain device ordinal (``-1`` for host), so it cannot represent a + specific host NUMA node. Rather than letting the operation fail + deep inside the Cython layer with ``RuntimeError``, raise a + ``TypeError`` at the call boundary with actionable wording. + """ + from cuda.core._utils.version import binding_version + + if binding_version() >= (13, 0, 0): + return + if spec.kind in ("host_numa", "host_numa_current"): + raise TypeError( + "Host(numa_id=...) / Host.numa_current() require a CUDA 13 " + "build of cuda.core; use Host() on CUDA 12" + ) + + def _coerce_location(value, *, allow_none: bool = False) -> _LocSpec | None: - """Coerce :class:`Device` / :class:`Host` / int / ``None`` to ``_LocSpec``. + """Coerce :class:`Device` / :class:`Host` / ``None`` to ``_LocSpec``. - Maps int ``-1`` to host and other non-negative ints to that device - ordinal. ``Host()``, ``Host(numa_id=N)``, and ``Host.numa_current()`` - map to the corresponding NUMA-aware kinds. + ``Host()``, ``Host(numa_id=N)``, and ``Host.numa_current()`` map to + the corresponding NUMA-aware kinds. On a CUDA 12 build of + ``cuda.core``, NUMA-host inputs are rejected with ``TypeError`` + because the legacy ABI cannot represent them. """ # Local imports to avoid import cycles (Device pulls in CUDA init). 
     from cuda.core._device import Device
     from cuda.core._host import Host
 
     if isinstance(value, _LocSpec):
+        _reject_numa_host_on_cuda12(value)
         return value
     if isinstance(value, Device):
         return _LocSpec(kind="device", id=value.device_id)
     if isinstance(value, Host):
         if value.is_numa_current:
-            return _LocSpec(kind="host_numa_current")
+            spec = _LocSpec(kind="host_numa_current")
+            _reject_numa_host_on_cuda12(spec)
+            return spec
         if value.numa_id is not None:
-            return _LocSpec(kind="host_numa", id=value.numa_id)
+            spec = _LocSpec(kind="host_numa", id=value.numa_id)
+            _reject_numa_host_on_cuda12(spec)
+            return spec
         return _LocSpec(kind="host")
     if value is None:
         if allow_none:
diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py
index 2173889b199..b0246992331 100644
--- a/cuda_core/tests/memory/test_managed_ops.py
+++ b/cuda_core/tests/memory/test_managed_ops.py
@@ -208,7 +208,12 @@ def test_managed_memory_operation_validation(init_cuda):
     with pytest.raises(ValueError, match="location is required"):
         buffer.prefetch(None, stream=stream)
 
-    with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
+    # CUDA 13: kind-allowed check fires (ValueError). CUDA 12: NUMA-host is
+    # rejected at the boundary first (TypeError).
+    with pytest.raises(
+        (ValueError, TypeError),
+        match="does not support location_type='host_numa'|require a CUDA 13 build",
+    ):
         buffer.accessed_by.add(Host(numa_id=_INVALID_HOST_DEVICE_ORDINAL))
 
     buffer.close()
@@ -232,12 +237,18 @@ def test_managed_memory_advise_location_validation(init_cuda):
     # preferred_location accepts Host()
     buffer.preferred_location = Host()
 
-    # accessed_by rejects host_numa
-    with pytest.raises(ValueError, match="does not support location_type='host_numa'"):
+    # accessed_by rejects host_numa (CUDA 13: kind check; CUDA 12: boundary)
+    with pytest.raises(
+        (ValueError, TypeError),
+        match="does not support location_type='host_numa'|require a CUDA 13 build",
+    ):
         buffer.accessed_by.add(Host(numa_id=0))
 
-    # accessed_by rejects host_numa_current
-    with pytest.raises(ValueError, match="does not support location_type='host_numa_current'"):
+    # accessed_by rejects host_numa_current (same reasoning)
+    with pytest.raises(
+        (ValueError, TypeError),
+        match="does not support location_type='host_numa_current'|require a CUDA 13 build",
+    ):
         buffer.accessed_by.add(Host.numa_current())
 
     # Both Host and Device are accepted

From 1b663672e3a12464e65b349bb475edf00b9b339e Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Tue, 5 May 2026 13:34:30 -0700
Subject: [PATCH 66/68] fix(cuda.core): make ManagedBuffer.accessed_by setter atomic

The previous setter computed (current - target) and (target - current)
and called _advise_one in two loops. set(locations) did reject
unhashable elements before any advice was issued, but an element that
is hashable yet invalid (a non-Device/Host object, or a NUMA-host
location on a CUDA 12 build) only failed inside _advise_one, partway
through the diff loops, so an invalid RHS could leave accessed_by
partially mutated.

Reproduce: starting from {Device(0)}, assigning {Host(numa_id=0)} on
CUDA 12 raises and leaves accessed_by == set().

Validate every target up-front (per-element isinstance(Device|Host))
and only then issue the diff loops, so a bad RHS raises before any
driver state changes.

Addresses rwgk's High finding on PR #1775.
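
The fix is an instance of the validate-then-mutate pattern; in miniature
(a stand-alone toy where plain sets and int/str stand in for driver
advice state and Device/Host):

    def assign_atomic(state: set, locations) -> None:
        target = set()
        for loc in locations:
            if not isinstance(loc, (int, str)):         # stand-in for (Device, Host)
                raise TypeError(f"bad entry: {loc!r}")  # raised before any mutation
            target.add(loc)
        for item in state - target:
            state.discard(item)  # stand-in for unset_accessed_by advice
        for item in target - state:
            state.add(item)      # stand-in for set_accessed_by advice

    s = {0}
    try:
        assign_atomic(s, [1, 2.5])  # 2.5 is invalid in this toy
    except TypeError:
        pass
    assert s == {0}                 # state untouched by the failed assignment
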
--- cuda_core/cuda/core/_memory/_managed_buffer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index 51b8bea3f45..39e72990572 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -207,9 +207,16 @@ def accessed_by(self) -> AccessedBySetProxy: @accessed_by.setter def accessed_by(self, locations) -> None: - # Diff against the current driver state and advise only the deltas. + # Validate every target before issuing any cuMemAdvise so an invalid + # element can't leave accessed_by partially mutated. + target: set[Device | Host] = set() + for loc in locations: + if not isinstance(loc, (Device, Host)): + raise TypeError( + f"accessed_by entries must be Device or Host, got {type(loc).__name__}" + ) + target.add(loc) current = set(_query_accessed_by(self)) - target = set(locations) for loc in current - target: _advise_one(self, _UNSET_ACCESSED_BY, loc) for loc in target - current: From 5efbe4e09cc311269206f67314b8f9ba96a80cee Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 5 May 2026 13:40:52 -0700 Subject: [PATCH 67/68] style(cuda.core): apply ruff format Collapses multi-line string concats and conditions back to single lines under the project's line-length limit. No behavior change. --- cuda_core/cuda/core/_host.py | 4 +--- cuda_core/cuda/core/_memory/_managed_buffer.py | 4 +--- cuda_core/cuda/core/_memory/_managed_location.py | 3 +-- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/_host.py b/cuda_core/cuda/core/_host.py index f1697b8288c..47e8e222ede 100644 --- a/cuda_core/cuda/core/_host.py +++ b/cuda_core/cuda/core/_host.py @@ -36,9 +36,7 @@ class Host: _instances_lock: ClassVar[threading.Lock] = threading.Lock() def __new__(cls, numa_id: int | None = None) -> Host: - if numa_id is not None and ( - isinstance(numa_id, bool) or not isinstance(numa_id, int) or numa_id < 0 - ): + if numa_id is not None and (isinstance(numa_id, bool) or not isinstance(numa_id, int) or numa_id < 0): raise ValueError(f"numa_id must be a non-negative int, got {numa_id!r}") return cls._get_or_create(numa_id, is_numa_current=False) diff --git a/cuda_core/cuda/core/_memory/_managed_buffer.py b/cuda_core/cuda/core/_memory/_managed_buffer.py index 39e72990572..3326d6b774f 100644 --- a/cuda_core/cuda/core/_memory/_managed_buffer.py +++ b/cuda_core/cuda/core/_memory/_managed_buffer.py @@ -212,9 +212,7 @@ def accessed_by(self, locations) -> None: target: set[Device | Host] = set() for loc in locations: if not isinstance(loc, (Device, Host)): - raise TypeError( - f"accessed_by entries must be Device or Host, got {type(loc).__name__}" - ) + raise TypeError(f"accessed_by entries must be Device or Host, got {type(loc).__name__}") target.add(loc) current = set(_query_accessed_by(self)) for loc in current - target: diff --git a/cuda_core/cuda/core/_memory/_managed_location.py b/cuda_core/cuda/core/_memory/_managed_location.py index be871e38e81..13fbf9ea7ba 100644 --- a/cuda_core/cuda/core/_memory/_managed_location.py +++ b/cuda_core/cuda/core/_memory/_managed_location.py @@ -38,8 +38,7 @@ def _reject_numa_host_on_cuda12(spec: _LocSpec) -> None: return if spec.kind in ("host_numa", "host_numa_current"): raise TypeError( - "Host(numa_id=...) / Host.numa_current() require a CUDA 13 " - "build of cuda.core; use Host() on CUDA 12" + "Host(numa_id=...) 
/ Host.numa_current() require a CUDA 13 build of cuda.core; use Host() on CUDA 12" ) From 8c353763d729ea903bcb4e8d5f76e088af69fd63 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 5 May 2026 14:17:19 -0700 Subject: [PATCH 68/68] Skip NUMA-aware Host coerce tests on CUDA 12 builds Host(numa_id=N) and Host.numa_current() require CUDA 13 bindings; the TestLocationCoerce passthroughs were missing the binding_version guard already used by test_preferred_location_roundtrip_host_numa. --- cuda_core/tests/memory/test_managed_ops.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cuda_core/tests/memory/test_managed_ops.py b/cuda_core/tests/memory/test_managed_ops.py index b0246992331..4449ed51960 100644 --- a/cuda_core/tests/memory/test_managed_ops.py +++ b/cuda_core/tests/memory/test_managed_ops.py @@ -324,14 +324,20 @@ def test_host_passthrough(self): def test_host_numa_passthrough(self): from cuda.core._memory._managed_location import _coerce_location + from cuda.core._utils.version import binding_version + if binding_version() < (13, 0, 0): + pytest.skip("Host(numa_id=N) requires CUDA 13 bindings") spec = _coerce_location(Host(numa_id=3)) assert spec.kind == "host_numa" assert spec.id == 3 def test_host_numa_current_passthrough(self): from cuda.core._memory._managed_location import _coerce_location + from cuda.core._utils.version import binding_version + if binding_version() < (13, 0, 0): + pytest.skip("Host.numa_current() requires CUDA 13 bindings") spec = _coerce_location(Host.numa_current()) assert spec.kind == "host_numa_current"