diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd index c293d72750..a7f3bfd958 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -7,7 +7,9 @@ from cuda.core._memory._ipc cimport IPCDataForMR cdef class DeviceMemoryResource(_MemPool): - pass + cdef: + int _dev_id + object _peer_accessible_by cpdef DMR_mempool_get_access(DeviceMemoryResource, int) diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx index 78a49d3e44..1299f1bd57 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -1,17 +1,24 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions +from cuda.core._memory._memory_pool cimport ( + _MemPool, MP_init_create_pool, MP_raise_release_threshold, +) from cuda.core._memory cimport _ipc from cuda.core._memory._ipc cimport IPCAllocationHandle +from cuda.core._resource_handles cimport ( + as_cu, + get_device_mempool, +) from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, ) +from cpython.mem cimport PyMem_Malloc, PyMem_Free from dataclasses import dataclass import multiprocessing @@ -19,7 +26,6 @@ import platform # no-cython-lint import uuid from cuda.core._utils.cuda_utils import check_multiprocessing_start_method -from cuda.core._resource_handles cimport as_cu __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] @@ -122,27 +128,12 @@ cdef class DeviceMemoryResource(_MemPool): associated MMR. """ - def __init__(self, device_id: Device | int, options=None): - from .._device import Device - cdef int dev_id = Device(device_id).device_id - cdef DeviceMemoryResourceOptions opts = check_or_create_options( - DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", - keep_none=True - ) - cdef _MemPoolOptions opts_base = _MemPoolOptions() - - cdef bint ipc_enabled = False - if opts: - ipc_enabled = opts.ipc_enabled - if ipc_enabled and not _ipc.is_supported(): - raise RuntimeError("IPC is not available on {platform.system()}") - opts_base._max_size = opts.max_size - opts_base._use_current = False - opts_base._ipc_enabled = ipc_enabled - opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + def __cinit__(self, *args, **kwargs): + self._dev_id = cydriver.CU_DEVICE_INVALID + self._peer_accessible_by = () - super().__init__(dev_id, opts_base) + def __init__(self, device_id: Device 
| int, options=None): + _DMR_init(self, device_id, options) def __reduce__(self): return DeviceMemoryResource.from_registry, (self.uuid,) @@ -215,6 +206,37 @@ cdef class DeviceMemoryResource(_MemPool): raise RuntimeError("Memory resource is not IPC-enabled") return self._ipc_data._alloc_handle + @property + def device_id(self) -> int: + """The associated device ordinal.""" + return self._dev_id + + @property + def peer_accessible_by(self): + """ + Get or set the devices that can access allocations from this memory + pool. Access can be modified at any time and affects all allocations + from this memory pool. + + Returns a tuple of sorted device IDs that currently have peer access to + allocations from this memory pool. + + When setting, accepts a sequence of Device objects or device IDs. + Setting to an empty sequence revokes all peer access. + + Examples + -------- + >>> dmr = DeviceMemoryResource(0) + >>> dmr.peer_accessible_by = [1] # Grant access to device 1 + >>> assert dmr.peer_accessible_by == (1,) + >>> dmr.peer_accessible_by = [] # Revoke access + """ + return self._peer_accessible_by + + @peer_accessible_by.setter + def peer_accessible_by(self, devices): + _DMR_set_peer_accessible_by(self, devices) + @property def is_device_accessible(self) -> bool: """Return True. 
This memory resource provides device-accessible buffers.""" @@ -226,6 +248,82 @@ cdef class DeviceMemoryResource(_MemPool): return False +cdef inline _DMR_set_peer_accessible_by(DeviceMemoryResource self, devices): + from .._device import Device + + cdef set[int] target_ids = {Device(dev).device_id for dev in devices} + target_ids.discard(self._dev_id) + this_dev = Device(self._dev_id) + cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)] + if bad: + raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}") + cdef set[int] cur_ids = set(self._peer_accessible_by) + cdef set[int] to_add = target_ids - cur_ids + cdef set[int] to_rm = cur_ids - target_ids + cdef size_t count = len(to_add) + len(to_rm) + cdef cydriver.CUmemAccessDesc* access_desc = NULL + cdef size_t i = 0 + + if count > 0: + access_desc = PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc)) + if access_desc == NULL: + raise MemoryError("Failed to allocate memory for access descriptors") + + try: + for dev_id in to_add: + access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_desc[i].location.id = dev_id + i += 1 + + for dev_id in to_rm: + access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE + access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_desc[i].location.id = dev_id + i += 1 + + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count)) + finally: + if access_desc != NULL: + PyMem_Free(access_desc) + + self._peer_accessible_by = tuple(target_ids) + + +cdef inline _DMR_init(DeviceMemoryResource self, device_id, options): + from .._device import Device + cdef int dev_id = Device(device_id).device_id + cdef DeviceMemoryResourceOptions opts = check_or_create_options( + DeviceMemoryResourceOptions, 
options, "DeviceMemoryResource options", + keep_none=True + ) + cdef bint ipc_enabled = False + cdef size_t max_size = 0 + + self._dev_id = dev_id + + if opts is not None: + ipc_enabled = opts.ipc_enabled + if ipc_enabled and not _ipc.is_supported(): + raise RuntimeError(f"IPC is not available on {platform.system()}") + max_size = opts.max_size + + if opts is None: + self._h_pool = get_device_mempool(dev_id) + self._mempool_owned = False + MP_raise_release_threshold(self) + else: + MP_init_create_pool( + self, + cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + dev_id, + cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + ipc_enabled, + max_size, + ) + + # Note: this is referenced in instructions to debug nvbug 5698116. cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id): """ diff --git a/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd index 46e00cd4cb..8dd0bbbeb1 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 @@ -6,4 +6,6 @@ from cuda.core._memory._memory_pool cimport _MemPool cdef class ManagedMemoryResource(_MemPool): - pass + cdef: + str _pref_loc_type + int _pref_loc_id diff --git a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx index a268520e55..4f24bd8d11 100644 --- a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx @@ -6,7 +6,7 @@ from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions +from cuda.core._memory._memory_pool cimport _MemPool, MP_init_create_pool, MP_init_current_pool from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, check_or_create_options, @@ -26,12 +26,35 @@ cdef class ManagedMemoryResourceOptions: Attributes ---------- preferred_location : int | None, optional - The preferred device location for the managed memory. - Use a device ID (0, 1, 2, ...) for device preference, -1 for CPU/host, - or None to let the driver decide. - (Default to None) + A location identifier (device ordinal or NUMA node ID) whose + meaning depends on ``preferred_location_type``. + (Default to ``None``) + + preferred_location_type : ``"device"`` | ``"host"`` | ``"host_numa"`` | None, optional + Controls how ``preferred_location`` is interpreted. + + When set to ``None`` (the default), legacy behavior is used: + ``preferred_location`` is interpreted as a device ordinal, + ``-1`` for host, or ``None`` for no preference. + + When set explicitly, the type determines both the kind of + preferred location and the valid values for + ``preferred_location``: + + - ``"device"``: prefer a specific GPU. ``preferred_location`` + must be a device ordinal (``>= 0``). + - ``"host"``: prefer host memory (OS-managed NUMA placement). + ``preferred_location`` must be ``None``. 
+ - ``"host_numa"``: prefer a specific host NUMA node. + ``preferred_location`` must be a NUMA node ID (``>= 0``), + or ``None`` to derive the NUMA node from the current CUDA + device's ``host_numa_id`` attribute (requires an active + CUDA context). + + (Default to ``None``) """ preferred_location: int | None = None + preferred_location_type: str | None = None cdef class ManagedMemoryResource(_MemPool): @@ -64,40 +87,29 @@ cdef class ManagedMemoryResource(_MemPool): """ def __init__(self, options=None): - cdef ManagedMemoryResourceOptions opts = check_or_create_options( - ManagedMemoryResourceOptions, options, "ManagedMemoryResource options", - keep_none=True - ) - cdef _MemPoolOptions opts_base = _MemPoolOptions() - - cdef int device_id = -1 - cdef object preferred_location = None - if opts: - preferred_location = opts.preferred_location - if preferred_location is not None: - device_id = preferred_location - opts_base._use_current = False - - opts_base._ipc_enabled = False # IPC not supported for managed memory pools - - IF CUDA_CORE_BUILD_MAJOR >= 13: - # Set location based on preferred_location - if preferred_location is None: - # Let the driver decide - opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE - elif device_id == -1: - # CPU/host preference - opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST - else: - # Device preference - opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - - opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED - - super().__init__(device_id, opts_base) - _check_concurrent_managed_access() - ELSE: - raise RuntimeError("ManagedMemoryResource requires CUDA 13.0 or later") + _MMR_init(self, options) + + @property + def device_id(self) -> int: + """The preferred device ordinal, or -1 if the preferred location is not a device.""" + if self._pref_loc_type == "device": + return self._pref_loc_id + return -1 + + @property + def 
preferred_location(self) -> tuple | None: + """The preferred location for managed memory allocations. + + Returns ``None`` if no preferred location is set (driver decides), + or a tuple ``(type, id)`` where *type* is one of ``"device"``, + ``"host"``, or ``"host_numa"``, and *id* is the device ordinal, + ``None`` (for ``"host"``), or the NUMA node ID, respectively. + """ + if self._pref_loc_type is None: + return None + if self._pref_loc_type == "host": + return ("host", None) + return (self._pref_loc_type, self._pref_loc_id) @property def is_device_accessible(self) -> bool: @@ -110,6 +122,131 @@ cdef class ManagedMemoryResource(_MemPool): return True +IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef tuple _VALID_LOCATION_TYPES = ("device", "host", "host_numa") + + + cdef _resolve_preferred_location(ManagedMemoryResourceOptions opts): + """Resolve preferred location options into driver and stored values. + + Returns a 4-tuple: + (CUmemLocationType, loc_id, pref_loc_type_str, pref_loc_id) + """ + cdef object pref_loc = opts.preferred_location if opts is not None else None + cdef object pref_type = opts.preferred_location_type if opts is not None else None + + if pref_type is not None and pref_type not in _VALID_LOCATION_TYPES: + raise ValueError( + f"preferred_location_type must be one of {_VALID_LOCATION_TYPES!r} " + f"or None, got {pref_type!r}" + ) + + if pref_type is None: + # Legacy behavior + if pref_loc is None: + return ( + cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE, + -1, None, -1, + ) + if pref_loc == -1: + return ( + cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST, + -1, "host", -1, + ) + if pref_loc < 0: + raise ValueError( + f"preferred_location must be a device ordinal (>= 0), -1 for " + f"host, or None for no preference, got {pref_loc}" + ) + return ( + cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + pref_loc, "device", pref_loc, + ) + + if pref_type == "device": + if pref_loc is None or pref_loc < 0: + raise ValueError( + 
f"preferred_location must be a device ordinal (>= 0) when " + f"preferred_location_type is 'device', got {pref_loc!r}" + ) + return ( + cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + pref_loc, "device", pref_loc, + ) + + if pref_type == "host": + if pref_loc is not None: + raise ValueError( + f"preferred_location must be None when " + f"preferred_location_type is 'host', got {pref_loc!r}" + ) + return ( + cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST, + -1, "host", -1, + ) + + # pref_type == "host_numa" + if pref_loc is None: + from .._device import Device + dev = Device() + numa_id = dev.properties.host_numa_id + if numa_id < 0: + raise RuntimeError( + "Cannot determine host NUMA ID for the current CUDA device. " + "The system may not support NUMA, or no CUDA context is " + "active. Set preferred_location to an explicit NUMA node ID " + "or call Device.set_current() first." + ) + return ( + cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA, + numa_id, "host_numa", numa_id, + ) + if pref_loc < 0: + raise ValueError( + f"preferred_location must be a NUMA node ID (>= 0) or None " + f"when preferred_location_type is 'host_numa', got {pref_loc}" + ) + return ( + cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA, + pref_loc, "host_numa", pref_loc, + ) + + +cdef inline _MMR_init(ManagedMemoryResource self, options): + IF CUDA_CORE_BUILD_MAJOR >= 13: + cdef ManagedMemoryResourceOptions opts = check_or_create_options( + ManagedMemoryResourceOptions, options, "ManagedMemoryResource options", + keep_none=True + ) + cdef cydriver.CUmemLocationType loc_type + cdef int loc_id + + loc_type, loc_id, self._pref_loc_type, self._pref_loc_id = ( + _resolve_preferred_location(opts) + ) + + if opts is None: + MP_init_current_pool( + self, + loc_type, + loc_id, + cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED, + ) + else: + MP_init_create_pool( + self, + loc_type, + loc_id, + cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED, + 
False, + 0, + ) + + _check_concurrent_managed_access() + ELSE: + raise RuntimeError("ManagedMemoryResource requires CUDA 13.0 or later") + + cdef bint _concurrent_access_warned = False cdef object _concurrent_access_lock = threading.Lock() diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pxd b/cuda_core/cuda/core/_memory/_memory_pool.pxd index a8838bf9dc..45062826e4 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pxd +++ b/cuda_core/cuda/core/_memory/_memory_pool.pxd @@ -10,15 +10,32 @@ from cuda.core._resource_handles cimport MemoryPoolHandle cdef class _MemPool(MemoryResource): cdef: - int _dev_id MemoryPoolHandle _h_pool bint _mempool_owned IPCDataForMR _ipc_data object _attributes - object _peer_accessible_by object __weakref__ +cdef int MP_init_create_pool( + _MemPool self, + cydriver.CUmemLocationType loc_type, + int loc_id, + cydriver.CUmemAllocationType alloc_type, + bint ipc_enabled, + size_t max_size, +) except? -1 + +cdef int MP_init_current_pool( + _MemPool self, + cydriver.CUmemLocationType loc_type, + int loc_id, + cydriver.CUmemAllocationType alloc_type, +) except? -1 + +cdef int MP_raise_release_threshold(_MemPool self) except? -1 + + cdef class _MemPoolAttributes: cdef: MemoryPoolHandle _h_pool @@ -27,13 +44,3 @@ cdef class _MemPoolAttributes: cdef _MemPoolAttributes _init(MemoryPoolHandle h_pool) cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except? 
-1 - - -cdef class _MemPoolOptions: - - cdef: - bint _ipc_enabled - size_t _max_size - cydriver.CUmemLocationType _location - cydriver.CUmemAllocationType _type - bint _use_current diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index 1e9f5116c1..a37ea17ab3 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -7,7 +7,6 @@ from __future__ import annotations from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t from libc.string cimport memset -from cpython.mem cimport PyMem_Malloc, PyMem_Free from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource @@ -18,7 +17,6 @@ from cuda.core._resource_handles cimport ( DevicePtrHandle, create_mempool_handle, create_mempool_handle_ref, - get_device_mempool, deviceptr_alloc_from_pool, as_cu, as_py, @@ -28,20 +26,6 @@ from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) -import platform # no-cython-lint - -from cuda.core._utils.cuda_utils import driver - - -cdef class _MemPoolOptions: - - def __cinit__(self): - self._ipc_enabled = False - self._max_size = 0 - self._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_INVALID - self._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_INVALID - self._use_current = True - cdef class _MemPoolAttributes: """Provides access to memory pool attributes.""" @@ -126,24 +110,14 @@ cdef class _MemPoolAttributes: cdef class _MemPool(MemoryResource): def __cinit__(self): - self._dev_id = cydriver.CU_DEVICE_INVALID + # Note: subclasses use MP_init_create_pool or MP_init_current_pool to initialize. 
self._mempool_owned = False self._ipc_data = None self._attributes = None - self._peer_accessible_by = () - - def __init__(self, int device_id, _MemPoolOptions opts): - if opts._use_current: - _MP_init_current(self, device_id, opts) - else: - _MP_init_create(self, device_id, opts) - - def __dealloc__(self): - _MP_close(self) def close(self): """ - Close the device memory resource and destroy the associated memory pool + Close the memory resource and destroy the associated memory pool if owned. """ _MP_close(self) @@ -194,11 +168,6 @@ cdef class _MemPool(MemoryResource): self._attributes = _MemPoolAttributes._init(self._h_pool) return self._attributes - @property - def device_id(self) -> int: - """The associated device ordinal.""" - return self._dev_id - @property def handle(self) -> object: """Handle to the underlying memory pool.""" @@ -209,73 +178,6 @@ cdef class _MemPool(MemoryResource): """Whether the memory resource handle is owned. If False, ``close`` has no effect.""" return self._mempool_owned - @property - def peer_accessible_by(self): - """ - Get or set the devices that can access allocations from this memory - pool. Access can be modified at any time and affects all allocations - from this memory pool. - - Returns a tuple of sorted device IDs that currently have peer access to - allocations from this memory pool. - - When setting, accepts a sequence of Device objects or device IDs. - Setting to an empty sequence revokes all peer access. 
- - Examples - -------- - >>> dmr = DeviceMemoryResource(0) - >>> dmr.peer_accessible_by = [1] # Grant access to device 1 - >>> assert dmr.peer_accessible_by == (1,) - >>> dmr.peer_accessible_by = [] # Revoke access - """ - return self._peer_accessible_by - - @peer_accessible_by.setter - def peer_accessible_by(self, devices): - """Set which devices can access this memory pool.""" - from .._device import Device - - # Convert all devices to device IDs - cdef set[int] target_ids = {Device(dev).device_id for dev in devices} - target_ids.discard(self._dev_id) # exclude this device from peer access list - this_dev = Device(self._dev_id) - cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)] - if bad: - raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}") - cdef set[int] cur_ids = set(self._peer_accessible_by) - cdef set[int] to_add = target_ids - cur_ids - cdef set[int] to_rm = cur_ids - target_ids - cdef size_t count = len(to_add) + len(to_rm) # transaction size - cdef cydriver.CUmemAccessDesc* access_desc = NULL - cdef size_t i = 0 - - if count > 0: - access_desc = PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc)) - if access_desc == NULL: - raise MemoryError("Failed to allocate memory for access descriptors") - - try: - for dev_id in to_add: - access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE - access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id - i += 1 - - for dev_id in to_rm: - access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE - access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id - i += 1 - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolSetAccess(as_cu(self._h_pool), access_desc, count)) - finally: - if access_desc != NULL: - PyMem_Free(access_desc) - - self._peer_accessible_by = 
tuple(target_ids) - @property def is_ipc_enabled(self) -> bool: """Whether this memory resource has IPC enabled.""" @@ -298,106 +200,90 @@ cdef class _MemPool(MemoryResource): return getattr(self._ipc_data, 'uuid', None) -# _MemPool Implementation -# ----------------------- +cdef int MP_init_create_pool( + _MemPool self, + cydriver.CUmemLocationType loc_type, + int loc_id, + cydriver.CUmemAllocationType alloc_type, + bint ipc_enabled, + size_t max_size, +) except? -1: + """Initialize a _MemPool by creating a new memory pool with the given + parameters. -cdef int _MP_init_current(_MemPool self, int dev_id, _MemPoolOptions opts) except?-1: - # Get the current memory pool. - cdef cydriver.cuuint64_t current_threshold - cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX - cdef cydriver.CUmemLocation loc - cdef cydriver.CUmemoryPool pool + Sets ``_h_pool`` (owning), ``_mempool_owned``, and ``_ipc_data``. + """ + cdef cydriver.CUmemPoolProps properties + memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) - self._dev_id = dev_id - self._mempool_owned = False + properties.allocType = alloc_type + properties.handleTypes = ( + _ipc.IPC_HANDLE_TYPE if ipc_enabled + else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + ) + properties.location.id = loc_id + properties.location.type = loc_type + properties.maxSize = max_size - if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ - and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: - assert dev_id >= 0 - self._h_pool = get_device_mempool(dev_id) + self._mempool_owned = True + self._h_pool = create_mempool_handle(properties) - # Set a higher release threshold to improve performance when there are - # no active allocations. By default, the release threshold is 0, which - # means memory is immediately released back to the OS when there are no - # active suballocations, causing performance issues. 
- with nogil: - HANDLE_RETURN( - cydriver.cuMemPoolGetAttribute( - as_cu(self._h_pool), - cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - &current_threshold - ) - ) - if current_threshold == 0: - HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( - as_cu(self._h_pool), - cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - &max_threshold - )) - elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ - and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST: - IF CUDA_CORE_BUILD_MAJOR >= 13: - assert dev_id == -1 - loc.id = dev_id - loc.type = opts._location - with nogil: - HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type)) - self._h_pool = create_mempool_handle_ref(pool) - ELSE: - raise RuntimeError("not supported") - elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ - and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA: - IF CUDA_CORE_BUILD_MAJOR >= 13: - assert dev_id == 0 - loc.id = 0 - loc.type = opts._location - with nogil: - HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type)) - self._h_pool = create_mempool_handle_ref(pool) - ELSE: - raise RuntimeError("not supported") - else: - IF CUDA_CORE_BUILD_MAJOR >= 13: - if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED: - # Managed memory pools - loc.id = dev_id - loc.type = opts._location - with nogil: - HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type)) - self._h_pool = create_mempool_handle_ref(pool) - else: - assert False - ELSE: - assert False + if ipc_enabled: + alloc_handle = _ipc.MP_export_mempool(self) + self._ipc_data = _ipc.IPCDataForMR(alloc_handle, False) return 0 -cdef int _MP_init_create(_MemPool self, int dev_id, _MemPoolOptions opts) except?-1: - cdef cydriver.CUmemPoolProps properties - memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) +cdef int MP_init_current_pool( + _MemPool self, + 
cydriver.CUmemLocationType loc_type, + int loc_id, + cydriver.CUmemAllocationType alloc_type, +) except? -1: + """Initialize a _MemPool by getting the driver's current pool for a + location and allocation type. - cdef bint ipc_enabled = opts._ipc_enabled - properties.allocType = opts._type - properties.handleTypes = _ipc.IPC_HANDLE_TYPE if ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - properties.location.id = dev_id - properties.location.type = opts._location - # managed memory does not support maxSize as of CUDA 13.0 + Sets ``_h_pool`` (non-owning) via ``cuMemGetMemPool``. + Requires CUDA 13+. + """ IF CUDA_CORE_BUILD_MAJOR >= 13: - if properties.allocType != cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED: - properties.maxSize = opts._max_size + cdef cydriver.CUmemLocation loc + cdef cydriver.CUmemoryPool pool + loc.id = loc_id + loc.type = loc_type + with nogil: + HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, alloc_type)) + self._h_pool = create_mempool_handle_ref(pool) + self._mempool_owned = False ELSE: - properties.maxSize = opts._max_size - - self._dev_id = dev_id - self._mempool_owned = True + raise RuntimeError("not supported") + return 0 - self._h_pool = create_mempool_handle(properties) - if ipc_enabled: - alloc_handle = _ipc.MP_export_mempool(self) - self._ipc_data = _ipc.IPCDataForMR(alloc_handle, False) +cdef int MP_raise_release_threshold(_MemPool self) except? -1: + """Raise the pool's release threshold to ULLONG_MAX if currently zero. + By default the release threshold is 0, meaning memory is returned to + the OS as soon as there are no active suballocations. Setting it to + ULLONG_MAX avoids repeated OS round-trips. 
+ """ + cdef cydriver.cuuint64_t current_threshold + cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX + with nogil: + HANDLE_RETURN( + cydriver.cuMemPoolGetAttribute( + as_cu(self._h_pool), + cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + ¤t_threshold + ) + ) + if current_threshold == 0: + HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( + as_cu(self._h_pool), + cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + &max_threshold + )) return 0 @@ -438,17 +324,9 @@ cdef inline _MP_close(_MemPool self): if not self._h_pool: return - # This works around nvbug 5698116. When a memory pool handle is recycled - # the new handle inherits the peer access state of the previous handle. - if self._peer_accessible_by: - self.peer_accessible_by = [] - # Reset members in declaration order. - # The RAII deleter handles nvbug 5698116 workaround (clears peer access) - # and calls cuMemPoolDestroy if this is an owning handle. + # The RAII deleter calls cuMemPoolDestroy if this is an owning handle. 
self._h_pool.reset() - self._dev_id = cydriver.CU_DEVICE_INVALID self._mempool_owned = False self._ipc_data = None self._attributes = None - self._peer_accessible_by = () diff --git a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd index a8262d9bd8..fcfcfeb346 100644 --- a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd @@ -7,4 +7,4 @@ from cuda.core._memory._ipc cimport IPCDataForMR cdef class PinnedMemoryResource(_MemPool): - pass + cdef int _numa_id diff --git a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx index b2a9db4594..64ebcc7bc5 100644 --- a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx @@ -1,11 +1,11 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions +from cuda.core._memory._memory_pool cimport _MemPool, MP_init_create_pool, MP_init_current_pool from cuda.core._memory cimport _ipc from cuda.core._memory._ipc cimport IPCAllocationHandle from cuda.core._utils.cuda_utils cimport ( @@ -15,76 +15,11 @@ from cuda.core._utils.cuda_utils cimport ( from dataclasses import dataclass import multiprocessing -import os import platform # no-cython-lint -import subprocess -import threading import uuid -import warnings from cuda.core._utils.cuda_utils import check_multiprocessing_start_method - -# Cache to ensure NUMA warning is only raised once per process -cdef bint _numa_warning_shown = False -cdef object _lock = threading.Lock() - - -def _check_numa_nodes(): - """Check if system has multiple NUMA nodes and warn if so.""" - global _numa_warning_shown - if _numa_warning_shown: - return - - with _lock: - if _numa_warning_shown: - return - - if platform.system() != "Linux": - _numa_warning_shown = True - return - - numa_count = None - - # Try /sys filesystem first (most reliable and doesn't require external tools) - try: - node_path = "/sys/devices/system/node" - if os.path.exists(node_path): - # Count directories named "node[0-9]+" - nodes = [d for d in os.listdir(node_path) if d.startswith("node") and d[4:].isdigit()] - numa_count = len(nodes) - except (OSError, PermissionError): - pass - - # Fallback to lscpu if /sys check didn't work - if numa_count is None: - try: - result = subprocess.run( - ["lscpu"], - capture_output=True, - text=True, - timeout=1 - ) - for line in result.stdout.splitlines(): - if line.startswith("NUMA node(s):"): - numa_count = int(line.split(":")[1].strip()) - break - except (subprocess.SubprocessError, ValueError, FileNotFoundError): - pass - - # Warn if multiple NUMA nodes detected - if numa_count is not None and 
numa_count > 1: - warnings.warn( - f"System has {numa_count} NUMA nodes. IPC-enabled pinned memory " - f"uses location ID 0, which may not work correctly with multiple " - f"NUMA nodes.", - UserWarning, - stacklevel=3 - ) - - _numa_warning_shown = True - - __all__ = ['PinnedMemoryResource', 'PinnedMemoryResourceOptions'] @@ -102,9 +37,22 @@ cdef class PinnedMemoryResourceOptions: max_size : int, optional Maximum pool size. When set to 0, defaults to a system-dependent value. (Default to 0) + + numa_id : int or None, optional + Host NUMA node ID for pool placement. When set to None (the default), + the behavior depends on ``ipc_enabled``: + + - ``ipc_enabled=False``: OS-managed placement (location type HOST). + - ``ipc_enabled=True``: automatically derived from the current CUDA + device's ``host_numa_id`` attribute, requiring an active CUDA + context. + + When set to a non-negative integer, that NUMA node is used explicitly + regardless of ``ipc_enabled`` (location type HOST_NUMA). """ ipc_enabled : bool = False max_size : int = 0 + numa_id : int | None = None cdef class PinnedMemoryResource(_MemPool): @@ -132,41 +80,16 @@ cdef class PinnedMemoryResource(_MemPool): ----- To create an IPC-Enabled memory resource (MR) that is capable of sharing allocations between processes, specify ``ipc_enabled=True`` in the initializer - option. When IPC is enabled, the location type is automatically set to - CU_MEM_LOCATION_TYPE_HOST_NUMA instead of CU_MEM_LOCATION_TYPE_HOST, - with location ID 0. - - Note: IPC support for pinned memory requires a single NUMA node. A warning - is issued if multiple NUMA nodes are detected. + option. When IPC is enabled and ``numa_id`` is not specified, the NUMA node + is automatically derived from the current CUDA device's ``host_numa_id`` + attribute, which requires an active CUDA context. If ``numa_id`` is + explicitly set, that value is used regardless of ``ipc_enabled``. See :class:`DeviceMemoryResource` for more details on IPC usage patterns. 
""" def __init__(self, options=None): - cdef PinnedMemoryResourceOptions opts = check_or_create_options( - PinnedMemoryResourceOptions, options, "PinnedMemoryResource options", - keep_none=True - ) - cdef _MemPoolOptions opts_base = _MemPoolOptions() - - cdef bint ipc_enabled = False - if opts: - ipc_enabled = opts.ipc_enabled - if ipc_enabled and not _ipc.is_supported(): - raise RuntimeError(f"IPC is not available on {platform.system()}") - if ipc_enabled: - # Check for multiple NUMA nodes on Linux - _check_numa_nodes() - opts_base._max_size = opts.max_size - opts_base._use_current = False - opts_base._ipc_enabled = ipc_enabled - if ipc_enabled: - opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA - else: - opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST - opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - - super().__init__(0 if ipc_enabled else -1, opts_base) + _PMR_init(self, options) def __reduce__(self): return PinnedMemoryResource.from_registry, (self.uuid,) @@ -239,6 +162,16 @@ cdef class PinnedMemoryResource(_MemPool): raise RuntimeError("Memory resource is not IPC-enabled") return self._ipc_data._alloc_handle + @property + def device_id(self) -> int: + """Return -1. Pinned memory is host memory and is not associated with a specific device.""" + return -1 + + @property + def numa_id(self) -> int: + """The host NUMA node ID used for pool placement, or -1 for OS-managed placement.""" + return self._numa_id + @property def is_device_accessible(self) -> bool: """Return True. 
This memory resource provides device-accessible buffers.""" @@ -250,6 +183,63 @@ cdef class PinnedMemoryResource(_MemPool): return True +cdef inline _PMR_init(PinnedMemoryResource self, options): + from .._device import Device + + cdef PinnedMemoryResourceOptions opts = check_or_create_options( + PinnedMemoryResourceOptions, options, "PinnedMemoryResource options", + keep_none=True + ) + cdef bint ipc_enabled = False + cdef size_t max_size = 0 + cdef cydriver.CUmemLocationType loc_type + cdef int numa_id = -1 + + if opts is not None: + ipc_enabled = opts.ipc_enabled + if ipc_enabled and not _ipc.is_supported(): + raise RuntimeError(f"IPC is not available on {platform.system()}") + max_size = opts.max_size + + if opts.numa_id is not None: + numa_id = opts.numa_id + if numa_id < 0: + raise ValueError(f"numa_id must be >= 0, got {numa_id}") + elif ipc_enabled: + dev = Device() + numa_id = dev.properties.host_numa_id + if numa_id < 0: + raise RuntimeError( + "Cannot determine host NUMA ID for IPC-enabled pinned " + "memory pool. The system may not support NUMA, or no " + "CUDA context is active. 
Set numa_id explicitly or " + "call Device.set_current() first.") + + if numa_id >= 0: + loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA + else: + loc_type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + + self._numa_id = numa_id + + if opts is None: + MP_init_current_pool( + self, + loc_type, + numa_id, + cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + ) + else: + MP_init_create_pool( + self, + loc_type, + numa_id, + cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED, + ipc_enabled, + max_size, + ) + + def _deep_reduce_pinned_memory_resource(mr): check_multiprocessing_start_method() alloc_handle = mr.get_allocation_handle() diff --git a/cuda_core/docs/source/release/0.7.x-notes.rst b/cuda_core/docs/source/release/0.7.x-notes.rst index 0fb0817581..1a1810219f 100644 --- a/cuda_core/docs/source/release/0.7.x-notes.rst +++ b/cuda_core/docs/source/release/0.7.x-notes.rst @@ -24,7 +24,24 @@ Breaking Changes New features ------------ -None. +- Added ``preferred_location_type`` option to :class:`ManagedMemoryResourceOptions` + for explicit control over the preferred location kind (``"device"``, + ``"host"``, or ``"host_numa"``). This enables NUMA-aware managed memory + pool placement. The existing ``preferred_location`` parameter retains full + backwards compatibility when ``preferred_location_type`` is not set. + +- Added :attr:`ManagedMemoryResource.preferred_location` property to query the + resolved preferred location of a managed memory pool. Returns ``None`` for no + preference, or a tuple such as ``("device", 0)``, ``("host", None)``, or + ``("host_numa", 3)``. + +- Added ``numa_id`` option to :class:`PinnedMemoryResourceOptions` for explicit + control over host NUMA node placement. When ``ipc_enabled=True`` and + ``numa_id`` is not set, the NUMA node is automatically derived from the + current CUDA device. 
+ +- Added :attr:`PinnedMemoryResource.numa_id` property to query the host NUMA + node ID used for pool placement. Returns ``-1`` for OS-managed placement. New examples @@ -36,6 +53,10 @@ None. Fixes and enhancements ---------------------- +- Fixed IPC-enabled pinned memory pools using a hardcoded NUMA node ID of ``0`` + instead of the NUMA node closest to the active CUDA device. On multi-NUMA + systems where the device is attached to a non-zero host NUMA node, this could + cause pool creation or allocation failures. (:issue:`1603`) - Reduced Python overhead in :class:`Program` and :class:`Linker` by moving compilation and linking operations to the C level and releasing the GIL during backend calls. This benefits workloads that create many programs or linkers, and enables concurrent compilation in diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index b771d04276..71d2f30573 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -57,6 +57,12 @@ def skip_if_managed_memory_unsupported(device): pytest.skip("Device does not support managed memory pool operations") except AttributeError: pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later") + try: + ManagedMemoryResource() + except RuntimeError as e: + if "requires CUDA 13.0" in str(e): + pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later") + raise def create_managed_memory_resource_or_skip(*args, **kwargs): diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index ea2e989e1a..934bbf9578 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -998,6 +998,155 @@ def test_managed_memory_resource_with_options(init_cuda): src_buffer.close() +def test_managed_memory_resource_preferred_location_default(init_cuda): + """preferred_location property returns None when no preference is set.""" + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + mr = 
create_managed_memory_resource_or_skip() + assert mr.preferred_location is None + + +def test_managed_memory_resource_preferred_location_device(init_cuda): + """preferred_location returns ("device", ordinal) for device preference.""" + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + # Legacy style + opts = ManagedMemoryResourceOptions(preferred_location=device.device_id) + mr = create_managed_memory_resource_or_skip(opts) + assert mr.preferred_location == ("device", device.device_id) + + # Explicit style + opts = ManagedMemoryResourceOptions( + preferred_location=device.device_id, + preferred_location_type="device", + ) + mr = create_managed_memory_resource_or_skip(opts) + assert mr.preferred_location == ("device", device.device_id) + + +def test_managed_memory_resource_preferred_location_host(init_cuda): + """preferred_location returns ("host", None) for host preference.""" + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + # Legacy style + opts = ManagedMemoryResourceOptions(preferred_location=-1) + mr = create_managed_memory_resource_or_skip(opts) + assert mr.preferred_location == ("host", None) + + # Explicit style + opts = ManagedMemoryResourceOptions(preferred_location_type="host") + mr = create_managed_memory_resource_or_skip(opts) + assert mr.preferred_location == ("host", None) + + +def test_managed_memory_resource_preferred_location_host_numa(init_cuda): + """preferred_location returns ("host_numa", id) for NUMA preference.""" + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + numa_id = device.properties.host_numa_id + if numa_id < 0: + pytest.skip("System does not support NUMA") + + # Auto-resolved from current device + opts = ManagedMemoryResourceOptions(preferred_location_type="host_numa") + mr = create_managed_memory_resource_or_skip(opts) + assert mr.preferred_location == ("host_numa", numa_id) + + # Explicit NUMA node ID + opts 
= ManagedMemoryResourceOptions( + preferred_location=numa_id, + preferred_location_type="host_numa", + ) + mr = create_managed_memory_resource_or_skip(opts) + assert mr.preferred_location == ("host_numa", numa_id) + + +def test_managed_memory_resource_preferred_location_validation(init_cuda): + """Invalid preferred_location combinations raise errors.""" + device = Device() + skip_if_managed_memory_unsupported(device) + device.set_current() + + # Invalid preferred_location_type + with pytest.raises(ValueError, match="preferred_location_type must be one of"): + ManagedMemoryResource( + ManagedMemoryResourceOptions( + preferred_location_type="invalid", + ) + ) + + # "device" requires a non-negative int + with pytest.raises(ValueError, match="must be a device ordinal"): + ManagedMemoryResource( + ManagedMemoryResourceOptions( + preferred_location_type="device", + ) + ) + with pytest.raises(ValueError, match="must be a device ordinal"): + ManagedMemoryResource( + ManagedMemoryResourceOptions( + preferred_location=-1, + preferred_location_type="device", + ) + ) + + # "host" requires preferred_location=None + with pytest.raises(ValueError, match="must be None"): + ManagedMemoryResource( + ManagedMemoryResourceOptions( + preferred_location=0, + preferred_location_type="host", + ) + ) + + # "host_numa" rejects negative IDs + with pytest.raises(ValueError, match="must be a NUMA node ID"): + ManagedMemoryResource( + ManagedMemoryResourceOptions( + preferred_location=-1, + preferred_location_type="host_numa", + ) + ) + + # Legacy mode rejects invalid negative values + with pytest.raises(ValueError, match="preferred_location must be"): + ManagedMemoryResource( + ManagedMemoryResourceOptions( + preferred_location=-2, + ) + ) + + +def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): + """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" + from unittest.mock import MagicMock, patch + + device = Device() + 
skip_if_managed_memory_unsupported(device) + device.set_current() + + mock_dev = MagicMock() + mock_dev.properties.host_numa_id = -1 + + with ( + patch("cuda.core._device.Device", return_value=mock_dev), + pytest.raises(RuntimeError, match="Cannot determine host NUMA ID"), + ): + ManagedMemoryResource( + ManagedMemoryResourceOptions( + preferred_location_type="host_numa", + ) + ) + + def test_mempool_ipc_errors(mempool_device): """Test error cases when IPC operations are disabled.""" device = mempool_device @@ -1038,7 +1187,8 @@ def test_pinned_mempool_ipc_basic(): assert mr.is_ipc_enabled assert mr.is_device_accessible assert mr.is_host_accessible - assert mr.device_id == 0 # IPC-enabled uses location id 0 + assert mr.device_id == -1 # pinned memory is not device-specific + assert mr.numa_id >= 0 # IPC requires a concrete NUMA node # Test allocation handle export alloc_handle = mr.get_allocation_handle() @@ -1070,7 +1220,8 @@ def test_pinned_mempool_ipc_errors(): options = PinnedMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False) mr = PinnedMemoryResource(options) assert not mr.is_ipc_enabled - assert mr.device_id == -1 # Non-IPC uses location id -1 + assert mr.device_id == -1 + assert mr.numa_id == -1 # Non-IPC uses OS-managed placement buffer = mr.allocate(64) ipc_error_msg = "Memory resource is not IPC-enabled" @@ -1089,6 +1240,74 @@ def test_pinned_mempool_ipc_errors(): mr.close() +def test_pinned_mr_numa_id_default_no_ipc(init_cuda): + """numa_id defaults to -1 (OS-managed) when IPC is disabled.""" + device = Device() + skip_if_pinned_memory_unsupported(device) + + mr = PinnedMemoryResource(PinnedMemoryResourceOptions()) + assert mr.numa_id == -1 + mr.close() + + mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=False)) + assert mr.numa_id == -1 + mr.close() + + +def test_pinned_mr_numa_id_default_with_ipc(init_cuda): + """numa_id is derived from the current device when IPC is enabled.""" + device = Device() + 
skip_if_pinned_memory_unsupported(device) + + if platform.system() == "Windows": + pytest.skip("IPC not implemented for Windows") + if not supports_ipc_mempool(device): + pytest.skip("Driver rejects IPC-enabled mempool creation on this platform") + + expected_numa_id = device.properties.host_numa_id + if expected_numa_id < 0: + pytest.skip("System does not support NUMA") + + mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=True, max_size=POOL_SIZE)) + assert mr.numa_id == expected_numa_id + mr.close() + + +def test_pinned_mr_numa_id_explicit(init_cuda): + """Explicit numa_id is used regardless of ipc_enabled.""" + device = Device() + skip_if_pinned_memory_unsupported(device) + + host_numa_id = device.properties.host_numa_id + if host_numa_id < 0: + pytest.skip("System does not support NUMA") + + mr = PinnedMemoryResource(PinnedMemoryResourceOptions(numa_id=host_numa_id)) + assert mr.numa_id == host_numa_id + mr.close() + + if platform.system() == "Windows": + pytest.skip("IPC not implemented for Windows") + if not supports_ipc_mempool(device): + pytest.skip("Driver rejects IPC-enabled mempool creation on this platform") + + mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=True, numa_id=host_numa_id, max_size=POOL_SIZE)) + assert mr.numa_id == host_numa_id + mr.close() + + +def test_pinned_mr_numa_id_negative_error(init_cuda): + """Negative numa_id raises ValueError.""" + device = Device() + skip_if_pinned_memory_unsupported(device) + + with pytest.raises(ValueError, match="numa_id must be >= 0"): + PinnedMemoryResource(PinnedMemoryResourceOptions(numa_id=-1)) + + with pytest.raises(ValueError, match="numa_id must be >= 0"): + PinnedMemoryResource(PinnedMemoryResourceOptions(numa_id=-42)) + + @pytest.mark.parametrize("ipc_enabled", [True, False]) @pytest.mark.parametrize( "property_name,expected_type", diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index 4da1a3ffb4..7d66ef4763 
100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -5,7 +5,7 @@ from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer import cuda.core -from cuda.core import DeviceMemoryResource +from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core._utils.cuda_utils import CUDAError NBYTES = 1024 @@ -17,7 +17,8 @@ def test_peer_access_basic(mempool_device_x2): zero_on_dev0 = make_scratch_buffer(dev0, 0, NBYTES) one_on_dev0 = make_scratch_buffer(dev0, 1, NBYTES) stream_on_dev0 = dev0.create_stream() - dmr_on_dev1 = DeviceMemoryResource(dev1) + # Use owned pool to ensure clean initial state (no stale peer access). + dmr_on_dev1 = DeviceMemoryResource(dev1, DeviceMemoryResourceOptions()) buf_on_dev1 = dmr_on_dev1.allocate(NBYTES) # No access at first. @@ -52,7 +53,8 @@ def test_peer_access_property_x2(mempool_device_x2): # The peer access list is a sorted tuple and always excludes the self # device. dev0, dev1 = mempool_device_x2 - dmr = DeviceMemoryResource(dev0) + # Use owned pool to ensure clean initial state (no stale peer access). + dmr = DeviceMemoryResource(dev0, DeviceMemoryResourceOptions()) def check(expected): assert isinstance(dmr.peer_accessible_by, tuple) @@ -98,7 +100,9 @@ def test_peer_access_transitions(mempool_device_x3): # Allocate per-device resources. streams = [dev.create_stream() for dev in devs] pgens = [PatternGen(devs[i], NBYTES, streams[i]) for i in range(3)] - dmrs = [DeviceMemoryResource(dev) for dev in devs] + # Use owned pools (with options) to ensure clean initial state. + # Default pools are shared and may have stale peer access from prior tests. + dmrs = [DeviceMemoryResource(dev, DeviceMemoryResourceOptions()) for dev in devs] bufs = [dmr.allocate(NBYTES) for dmr in dmrs] def verify_state(state, pattern_seed):