From e3e1899206b77ef700f66d4fe89cde23f715b63b Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Mon, 23 Feb 2026 14:02:32 -0800
Subject: [PATCH 01/13] initial commit

---
 cuda_core/cuda/core/__init__.py             |  8 +
 cuda_core/cuda/core/_kernel_arg_handler.pyx | 24 +
 cuda_core/cuda/core/_tensor_map.pxd         | 12 +
 cuda_core/cuda/core/_tensor_map.pyx         | 524 ++++++++++++++++++++
 cuda_core/examples/tma_tensor_map.py        | 195 ++++++++
 cuda_core/tests/test_tensor_map.py          | 279 +++++++++++
 6 files changed, 1042 insertions(+)
 create mode 100644 cuda_core/cuda/core/_tensor_map.pxd
 create mode 100644 cuda_core/cuda/core/_tensor_map.pyx
 create mode 100644 cuda_core/examples/tma_tensor_map.py
 create mode 100644 cuda_core/tests/test_tensor_map.py

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index dfba887144..84e4f3d356 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -68,3 +68,11 @@
     Stream,
     StreamOptions,
 )
+from cuda.core._tensor_map import (
+    TensorMapDataType,
+    TensorMapDescriptor,
+    TensorMapInterleave,
+    TensorMapL2Promotion,
+    TensorMapOOBFill,
+    TensorMapSwizzle,
+)
diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx
index 882ca5eaab..e88def13cd 100644
--- a/cuda_core/cuda/core/_kernel_arg_handler.pyx
+++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx
@@ -6,6 +6,7 @@
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from libc.stdint cimport (intptr_t, int8_t, int16_t, int32_t, int64_t,
                           uint8_t, uint16_t, uint32_t, uint64_t,)
+from libc.string cimport memcpy
 from libcpp cimport bool as cpp_bool
 from libcpp.complex cimport complex as cpp_complex
 from libcpp cimport nullptr
@@ -16,6 +17,8 @@
 import ctypes

 import numpy

 from 
cuda.core._memory import Buffer +from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py +from cuda.core._tensor_map cimport TensorMapDescriptor from cuda.core._utils.cuda_utils import driver from cuda.bindings cimport cydriver @@ -97,6 +100,9 @@ cdef object numpy_complex64 = numpy.complex64 cdef object numpy_complex128 = numpy.complex128 +cdef object tensor_map_descriptor_type = _TensorMapDescriptor_py + + # limitation due to cython/cython#534 ctypedef void* voidptr @@ -124,6 +130,18 @@ cdef inline int prepare_arg( return 0 +cdef inline int prepare_tensor_map_arg( + vector.vector[void*]& data, + vector.vector[void*]& data_addresses, + TensorMapDescriptor arg, + const size_t idx) except -1: + cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + data_addresses[idx] = ptr + data[idx] = ptr + return 0 + + cdef inline int prepare_ctypes_arg( vector.vector[void*]& data, vector.vector[void*]& data_addresses, @@ -273,6 +291,9 @@ cdef class ParamHolder: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) continue + elif arg_type is tensor_map_descriptor_type: + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue elif arg_type is bool: prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i) continue @@ -322,6 +343,9 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue + elif isinstance(arg, tensor_map_descriptor_type): + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd new file mode 100644 index 0000000000..3cfd571d89 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -0,0 +1,12 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef class TensorMapDescriptor: + cdef cydriver.CUtensorMap _tensor_map + cdef object _source_ref + + cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx new file mode 100644 index 0000000000..9375acf944 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -0,0 +1,524 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t, uint32_t, uint64_t +from cuda.bindings cimport cydriver +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN + +import enum + +import numpy + +from cuda.core._memoryview import StridedMemoryView + + +try: + from ml_dtypes import bfloat16 as ml_bfloat16 +except ImportError: + ml_bfloat16 = None + + +class TensorMapDataType(enum.IntEnum): + """Data types for tensor map descriptors. + + These correspond to the ``CUtensorMapDataType`` driver enum values. + """ + UINT8 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8 + UINT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16 + UINT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32 + INT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32 + UINT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64 + INT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64 + FLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16 + FLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32 + FLOAT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64 + BFLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 + FLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ + TFLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32 + TFLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ + + +class TensorMapInterleave(enum.IntEnum): + """Interleave layout for tensor map descriptors. 
+ + These correspond to the ``CUtensorMapInterleave`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_INTERLEAVE_NONE + INTERLEAVE_16B = cydriver.CU_TENSOR_MAP_INTERLEAVE_16B + INTERLEAVE_32B = cydriver.CU_TENSOR_MAP_INTERLEAVE_32B + + +class TensorMapSwizzle(enum.IntEnum): + """Swizzle mode for tensor map descriptors. + + These correspond to the ``CUtensorMapSwizzle`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_SWIZZLE_NONE + SWIZZLE_32B = cydriver.CU_TENSOR_MAP_SWIZZLE_32B + SWIZZLE_64B = cydriver.CU_TENSOR_MAP_SWIZZLE_64B + SWIZZLE_128B = cydriver.CU_TENSOR_MAP_SWIZZLE_128B + + +class TensorMapL2Promotion(enum.IntEnum): + """L2 promotion mode for tensor map descriptors. + + These correspond to the ``CUtensorMapL2promotion`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_L2_PROMOTION_NONE + L2_64B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_64B + L2_128B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_128B + L2_256B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_256B + + +class TensorMapOOBFill(enum.IntEnum): + """Out-of-bounds fill mode for tensor map descriptors. + + These correspond to the ``CUtensorMapFloatOOBfill`` driver enum values. 
+ """ + NONE = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE + NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA + + +# Mapping from numpy dtype to TMA data type +_NUMPY_DTYPE_TO_TMA = { + numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, + numpy.dtype(numpy.uint16): TensorMapDataType.UINT16, + numpy.dtype(numpy.uint32): TensorMapDataType.UINT32, + numpy.dtype(numpy.int32): TensorMapDataType.INT32, + numpy.dtype(numpy.uint64): TensorMapDataType.UINT64, + numpy.dtype(numpy.int64): TensorMapDataType.INT64, + numpy.dtype(numpy.float16): TensorMapDataType.FLOAT16, + numpy.dtype(numpy.float32): TensorMapDataType.FLOAT32, + numpy.dtype(numpy.float64): TensorMapDataType.FLOAT64, +} + +if ml_bfloat16 is not None: + _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = TensorMapDataType.BFLOAT16 + + +# Mapping from TMA data type to element size in bytes +_TMA_DATA_TYPE_SIZE = { + TensorMapDataType.UINT8: 1, + TensorMapDataType.UINT16: 2, + TensorMapDataType.UINT32: 4, + TensorMapDataType.INT32: 4, + TensorMapDataType.UINT64: 8, + TensorMapDataType.INT64: 8, + TensorMapDataType.FLOAT16: 2, + TensorMapDataType.FLOAT32: 4, + TensorMapDataType.FLOAT64: 8, + TensorMapDataType.BFLOAT16: 2, + TensorMapDataType.FLOAT32_FTZ: 4, + TensorMapDataType.TFLOAT32: 4, + TensorMapDataType.TFLOAT32_FTZ: 4, +} + + +def _resolve_data_type(view, data_type): + """Resolve the TMA data type from an explicit value or the view's dtype.""" + + if data_type is not None: + if not isinstance(data_type, TensorMapDataType): + raise TypeError( + f"data_type must be a TensorMapDataType, got {type(data_type)}") + return data_type + + dt = view.dtype + if dt is None: + raise ValueError( + "Cannot infer TMA data type from the tensor; " + "please specify data_type explicitly") + + tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt) + if tma_dt is None: + raise ValueError( + f"Unsupported dtype {dt} for TMA; " + f"supported dtypes: {list(_NUMPY_DTYPE_TO_TMA.keys())}. 
" + "You may also specify data_type explicitly.") + + return tma_dt + + +cdef class TensorMapDescriptor: + """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. + + A ``TensorMapDescriptor`` wraps the opaque 128-byte ``CUtensorMap`` struct + used by the hardware TMA unit for efficient bulk data movement between + global and shared memory. + + Instances are created via the class methods :meth:`from_tiled` and + :meth:`from_im2col`, and can be passed directly to + :func:`~cuda.core.launch` as a kernel argument. + """ + + def __init__(self): + raise RuntimeError( + "TensorMapDescriptor cannot be instantiated directly. " + "Use TensorMapDescriptor.from_tiled() or " + "TensorMapDescriptor.from_im2col().") + + cdef void* _get_data_ptr(self): + return &self._tensor_map + + @staticmethod + def from_tiled(tensor, box_dim, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create a tiled TMA descriptor from a tensor object. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + box_dim : tuple of int + The size of each tile dimension (in elements). Must have the + same rank as the tensor and each value must be in [1, 256]. + Specified in the same (row-major) order as the tensor shape. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + Specified in the same (row-major) order as the tensor shape. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. 
+ l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [1, 5], the pointer is not + 16-byte aligned, or dimension/stride constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + # Keep a strong reference to prevent GC + desc._source_ref = tensor + + # Resolve data type + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + # Get tensor metadata + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides # in elements, can be None for C-contiguous + + cdef int rank = len(shape) + if rank < 1 or rank > 5: + raise ValueError( + f"Tensor rank must be between 1 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") + + if len(box_dim) != rank: + raise ValueError( + f"box_dim must have {rank} elements (same as tensor rank), " + f"got {len(box_dim)}") + + for i, bd in enumerate(box_dim): + if bd < 1 or bd > 256: + raise ValueError( + f"box_dim[{i}] must be in [1, 256], got {bd}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + # Compute byte strides from element strides + cdef int elem_size = 
_TMA_DATA_TYPE_SIZE[tma_dt]
+        if strides is not None:
+            byte_strides = tuple(s * elem_size for s in strides)
+        else:
+            # C-contiguous: strides in bytes, row-major
+            byte_strides = []
+            stride = elem_size
+            for i in range(rank - 1, -1, -1):
+                byte_strides.append(stride)
+                stride *= shape[i]
+            byte_strides.reverse()
+
+        # Reverse dimensions for column-major cuTensorMap convention
+        # Python/DLPack: row-major (dim 0 = outermost)
+        # cuTensorMap: column-major (dim 0 = innermost)
+        cdef uint64_t[5] c_global_dim
+        cdef uint64_t[4] c_global_strides  # rank - 1 elements
+        cdef uint32_t[5] c_box_dim
+        cdef uint32_t[5] c_element_strides
+        cdef int i_c
+
+        for i_c in range(rank):
+            # Reverse: Python dim i -> cuTensorMap dim (rank - 1 - i)
+            c_global_dim[i_c] = shape[rank - 1 - i_c]
+            c_box_dim[i_c] = box_dim[rank - 1 - i_c]
+            c_element_strides[i_c] = element_strides[rank - 1 - i_c]
+
+        # globalStrides: rank-1 elements (byte strides for dims 1..N-1 in col-major order)
+        # The innermost stride (dim 0) is implicit = element size
+        for i_c in range(rank - 1):
+            c_global_strides[i_c] = byte_strides[rank - 2 - i_c]
+
+        cdef uint32_t c_rank = rank
+        cdef int c_interleave_int = int(interleave)
+        cdef int c_swizzle_int = int(swizzle)
+        cdef int c_l2_promotion_int = int(l2_promotion)
+        cdef int c_oob_fill_int = int(oob_fill)
+        cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int
+        cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int
+        cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int
+        cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapEncodeTiled(
+                &desc._tensor_map,
+                c_data_type,
+                c_rank,
+                <void*>global_address,  # driver expects void*; intptr_t needs an explicit cast
+                c_global_dim,
+                c_global_strides,
+                c_box_dim,
+                c_element_strides,
+                c_interleave,
+                c_swizzle,
+                c_l2_promotion,
+                c_oob_fill,
+            ))
+
+        return desc
+
+    @staticmethod
+    def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner,
+                    channels_per_pixel, 
pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col TMA descriptor from a tensor object. + + Im2col layout is used for convolution-style data access patterns. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner : tuple of int + Lower corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + pixel_box_upper_corner : tuple of int + Upper corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. 
+ """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + cdef int n_spatial = rank - 2 + if len(pixel_box_lower_corner) != n_spatial: + raise ValueError( + f"pixel_box_lower_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_lower_corner)}") + if len(pixel_box_upper_corner) != n_spatial: + raise ValueError( + f"pixel_box_upper_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_upper_corner)}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + 
cdef int[3] c_pixel_box_lower  # max 3 spatial dims (rank 5 - 2)
+        cdef int[3] c_pixel_box_upper
+        cdef int i_c
+
+        for i_c in range(rank):
+            c_global_dim[i_c] = shape[rank - 1 - i_c]
+            c_element_strides[i_c] = element_strides[rank - 1 - i_c]
+
+        for i_c in range(rank - 1):
+            c_global_strides[i_c] = byte_strides[rank - 2 - i_c]
+
+        # Reverse spatial dimensions for lower/upper corners
+        for i_c in range(n_spatial):
+            c_pixel_box_lower[i_c] = pixel_box_lower_corner[n_spatial - 1 - i_c]
+            c_pixel_box_upper[i_c] = pixel_box_upper_corner[n_spatial - 1 - i_c]
+
+        cdef uint32_t c_rank = rank
+        cdef uint32_t c_channels = channels_per_pixel
+        cdef uint32_t c_pixels = pixels_per_column
+        cdef int c_interleave_int = int(interleave)
+        cdef int c_swizzle_int = int(swizzle)
+        cdef int c_l2_promotion_int = int(l2_promotion)
+        cdef int c_oob_fill_int = int(oob_fill)
+        cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int
+        cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int
+        cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int
+        cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2col(
+                &desc._tensor_map,
+                c_data_type,
+                c_rank,
+                <void*>global_address,  # driver expects void*; intptr_t needs an explicit cast
+                c_global_dim,
+                c_global_strides,
+                c_pixel_box_lower,
+                c_pixel_box_upper,
+                c_channels,
+                c_pixels,
+                c_element_strides,
+                c_interleave,
+                c_swizzle,
+                c_l2_promotion,
+                c_oob_fill,
+            ))
+
+        return desc
+
+    def replace_address(self, tensor):
+        """Replace the global memory address in this tensor map descriptor.
+
+        This is useful when the tensor data has been reallocated but the
+        shape, strides, and other parameters remain the same.
+
+        Parameters
+        ----------
+        tensor : object
+            Any object supporting DLPack or ``__cuda_array_interface__``,
+            or a :obj:`~cuda.core.StridedMemoryView`. Must refer to
+            device-accessible memory with a 16-byte-aligned pointer.
+        """
+        if isinstance(tensor, StridedMemoryView):
+            view = tensor
+        else:
+            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
+
+        if not view.is_device_accessible:
+            raise ValueError("The tensor must be device-accessible")
+
+        cdef intptr_t global_address = view.ptr
+        if global_address % 16 != 0:
+            raise ValueError(
+                f"Global memory address must be 16-byte aligned, "
+                f"got address 0x{global_address:x}")
+
+        self._source_ref = tensor
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress(
+                &self._tensor_map,
+                <void*>global_address,  # driver expects void*; intptr_t needs an explicit cast
+            ))
+
+    def __repr__(self):
+        return f"TensorMapDescriptor()"
diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py
new file mode 100644
index 0000000000..de51570ec4
--- /dev/null
+++ b/cuda_core/examples/tma_tensor_map.py
@@ -0,0 +1,195 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates how to use TMA (Tensor Memory Accelerator) descriptors
+# with cuda.core on Hopper+ GPUs (compute capability >= 9.0).
+#
+# TMA enables efficient bulk data movement between global and shared memory using
+# hardware-managed tensor map descriptors. This example shows:
+#
+# 1. Creating a TMA tiled descriptor from a CuPy device array
+# 2. Passing the descriptor to a kernel via launch()
+# 3. Using TMA to load tiles into shared memory (via inline PTX)
+# 4. 
Updating the descriptor's source address with replace_address() +# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. 
+struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. 
+ asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data on the device +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +# --------------------------------------------------------------------------- +# 2) Create a TMA tiled descriptor +# from_tiled() accepts any DLPack / __cuda_array_interface__ object. +# The dtype (float32) is inferred automatically from the CuPy array. 
+# --------------------------------------------------------------------------- +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +# --------------------------------------------------------------------------- +# 3) Launch the kernel +# The TensorMapDescriptor is passed directly as a kernel argument — the +# 128-byte struct is copied into kernel parameter space automatically. +# --------------------------------------------------------------------------- +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 4) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py new file mode 100644 index 0000000000..0aa5968a85 --- /dev/null +++ b/cuda_core/tests/test_tensor_map.py @@ -0,0 +1,279 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import numpy as np + +from cuda.core import ( + Device, + TensorMapDescriptor, + TensorMapDataType, + TensorMapInterleave, + TensorMapL2Promotion, + TensorMapOOBFill, + TensorMapSwizzle, +) + + +@pytest.fixture +def dev(init_cuda): + return Device() + + +@pytest.fixture +def skip_if_no_tma(dev): + if not dev.properties.tensor_map_access_supported: + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") + + +def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): + """Allocate a device buffer and return it with proper alignment.""" + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) + return buf + + +class TestTensorMapEnums: + """Test that enum wrappers expose the expected values.""" + + def test_data_type_values(self): + assert TensorMapDataType.UINT8 == 0 + assert TensorMapDataType.FLOAT32 == 7 + assert TensorMapDataType.FLOAT64 == 8 + assert TensorMapDataType.BFLOAT16 == 9 + + def test_interleave_values(self): + assert TensorMapInterleave.NONE == 0 + assert TensorMapInterleave.INTERLEAVE_16B == 1 + assert TensorMapInterleave.INTERLEAVE_32B == 2 + + def test_swizzle_values(self): + assert TensorMapSwizzle.NONE == 0 + assert TensorMapSwizzle.SWIZZLE_32B == 1 + assert TensorMapSwizzle.SWIZZLE_64B == 2 + assert TensorMapSwizzle.SWIZZLE_128B == 3 + + def test_l2_promotion_values(self): + assert TensorMapL2Promotion.NONE == 0 + assert TensorMapL2Promotion.L2_64B == 1 + assert TensorMapL2Promotion.L2_128B == 2 + assert TensorMapL2Promotion.L2_256B == 3 + + def test_oob_fill_values(self): + assert TensorMapOOBFill.NONE == 0 + assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + + +class TestTensorMapDescriptorCreation: + """Test TensorMapDescriptor factory methods.""" + + def test_cannot_instantiate_directly(self): + with pytest.raises(RuntimeError, match="cannot be instantiated directly"): + 
TensorMapDescriptor() + + def test_from_tiled_1d(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) # 1024 float32 elements + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + assert repr(desc) == "TensorMapDescriptor()" + + def test_from_tiled_2d(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_3d(self, dev, skip_if_no_tma): + buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(8, 8, 8), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + ) + assert desc is not None + + def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + l2_promotion=TensorMapL2Promotion.L2_128B, + ) + assert desc is not None + + def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, + ) + assert desc is not None + + +class TestTensorMapDescriptorValidation: + """Test validation in TensorMapDescriptor factory methods.""" + + def test_invalid_rank_zero(self, dev, skip_if_no_tma): + buf = dev.allocate(64) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(), + 
data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="box_dim must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_out_of_range(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match=r"box_dim\[0\] must be in \[1, 256\]"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(512,), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="element_strides must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + element_strides=(1, 1), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_invalid_data_type(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(TypeError, match="data_type must be a TensorMapDataType"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=42, + ) + + +class TestTensorMapDtypeMapping: + """Test automatic dtype inference from numpy dtypes.""" + + @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + (np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ]) + def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt + + def test_bfloat16_mapping(self): + try: + from ml_dtypes import bfloat16 + from cuda.core._tensor_map import 
_NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 + except ImportError: + pytest.skip("ml_dtypes not installed") + + +class TestTensorMapReplaceAddress: + """Test replace_address functionality.""" + + def test_replace_address(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + buf2 = dev.allocate(1024 * 4) + desc.replace_address(buf2) + # No exception means success + + def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + # Create a host-only array (not device-accessible) + host_arr = np.zeros(1024, dtype=np.float32) + with pytest.raises(ValueError, match="device-accessible"): + desc.replace_address(host_arr) + + +class TestTensorMapIm2col: + """Test im2col TMA descriptor creation.""" + + def test_from_im2col_3d(self, dev, skip_if_no_tma): + # 3D tensor: batch=1, height=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0,), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(), + pixel_box_upper_corner=(), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + + def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1 * 32 * 64 * 4) + with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): + 
TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From 77a3c8ecab4734eacbf7f36af964b103ca2ecf69 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 13:53:49 -0800 Subject: [PATCH 02/13] tma wide --- cuda_core/cuda/core/__init__.py | 1 - cuda_core/cuda/core/_tensor_map.pxd | 1 + cuda_core/cuda/core/_tensor_map.pyx | 208 +++++++++++++++++++++++++++- cuda_core/pixi.toml | 13 +- cuda_core/tests/test_tensor_map.py | 133 +++++++++++++++++- 5 files changed, 341 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 84e4f3d356..d5f3693721 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,7 +60,6 @@ StridedMemoryView, args_viewable_as_strided_memory, ) -<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 3cfd571d89..fa2254ba70 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -8,5 +8,6 @@ from cuda.bindings cimport cydriver cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map cdef object _source_ref + cdef object _repr_info cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9375acf944..6b072f7404 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -80,6 +80,16 @@ class TensorMapOOBFill(enum.IntEnum): NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA +class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. + + These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. 
+ Supported on compute capability 10.0+. + """ + W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W + W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + + # Mapping from numpy dtype to TMA data type _NUMPY_DTYPE_TO_TMA = { numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, @@ -316,6 +326,13 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "tiled", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc @staticmethod @@ -483,6 +500,176 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "im2col", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + + return desc + + @staticmethod + def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + channels_per_pixel, pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + mode=TensorMapIm2ColWideMode.W, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col-wide TMA descriptor from a tensor object. + + Im2col-wide layout loads elements exclusively along the W (width) + dimension. This variant is supported on compute capability 10.0+ + (Blackwell and later). + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner_width : int + Lower corner of the pixel bounding box along the W dimension. + pixel_box_upper_corner_width : int + Upper corner of the pixel bounding box along the W dimension. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. 
+ data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + mode : TensorMapIm2ColWideMode + Im2col wide mode. Default ``W``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``SWIZZLE_128B``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = 
elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_lower_w = pixel_box_lower_corner_width + cdef int c_upper_w = pixel_box_upper_corner_width + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_mode_int = int(mode) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_lower_w, + c_upper_w, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_mode, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + desc._repr_info = { + "method": "im2col_wide", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc def replace_address(self, tensor): @@ -512,13 +699,28 @@ cdef class TensorMapDescriptor: f"Global memory address must be 16-byte aligned, " f"got address 0x{global_address:x}") - self._source_ref = tensor - with nogil: 
HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( &self._tensor_map, global_address, )) + # Update the source reference only after the driver call succeeds, + # so we don't drop the old tensor (risking a dangling pointer in the + # CUtensorMap struct) if the call fails. + self._source_ref = tensor + def __repr__(self): - return f"TensorMapDescriptor()" + info = self._repr_info + if info is None: + return "TensorMapDescriptor()" + parts = [] + if "method" in info: + parts.append(info["method"]) + if "rank" in info: + parts.append(f"rank={info['rank']}") + if "data_type" in info: + parts.append(f"dtype={info['data_type'].name}") + if "swizzle" in info: + parts.append(f"swizzle={info['swizzle'].name}") + return f"TensorMapDescriptor({', '.join(parts)})" diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index 9dc6ac1ed9..e351fb63b6 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -75,16 +75,9 @@ cuda = "13" [feature.cu13.dependencies] cuda-version = "13.1.*" -[feature.cu12.system-requirements] -cuda = "12" - -[feature.cu12.dependencies] -cuda-version = "12.*" - -# We keep both cu12 and cu13 because cuda.core works with either major version -# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1) -# For cu12 testing, use conda-forge packages: temporarily change path to wildcard -# or skip cu12 locally: pixi run -e cu13 test +# NOTE: cu12 environment is intentionally omitted because the path dependency +# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing, +# use conda-forge packages or CI workflows. 
[environments] default = { features = [ "cu13", diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 0aa5968a85..f57e73ded8 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -9,6 +9,7 @@ Device, TensorMapDescriptor, TensorMapDataType, + TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, TensorMapOOBFill, @@ -36,6 +37,23 @@ def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): return buf +class _DeviceArray: + """Wrap a Buffer with explicit shape via __cuda_array_interface__. + + dev.allocate() returns a 1D byte buffer. For multi-dimensional TMA tests + we need the tensor to report a proper shape/dtype so the TMA encoder sees + the correct rank, dimensions, and strides. + """ + def __init__(self, buf, shape, dtype=np.float32): + self._buf = buf # prevent GC + self.__cuda_array_interface__ = { + "shape": tuple(shape), + "typestr": np.dtype(dtype).str, + "data": (int(buf.handle), False), + "version": 3, + } + + class TestTensorMapEnums: """Test that enum wrappers expose the expected values.""" @@ -66,6 +84,10 @@ def test_oob_fill_values(self): assert TensorMapOOBFill.NONE == 0 assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + def test_im2col_wide_mode_values(self): + assert TensorMapIm2ColWideMode.W == 0 + assert TensorMapIm2ColWideMode.W128 == 1 + class TestTensorMapDescriptorCreation: """Test TensorMapDescriptor factory methods.""" @@ -82,7 +104,7 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) assert desc is not None - assert repr(desc) == "TensorMapDescriptor()" + assert repr(desc) == "TensorMapDescriptor(tiled, rank=1, dtype=FLOAT32, swizzle=NONE)" def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 @@ -102,6 +124,18 @@ def test_from_tiled_3d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_from_tiled_5d(self, dev, skip_if_no_tma): + # 5D: 
exercises all 5 c_global_dim / 4 c_global_strides slots + shape = (2, 4, 4, 4, 8) + n_bytes = 2 * 4 * 4 * 4 * 8 * 4 # float32 + buf = dev.allocate(n_bytes) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_tiled( + tensor, + box_dim=(1, 2, 2, 2, 8), + ) + assert desc is not None + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) desc = TensorMapDescriptor.from_tiled( @@ -277,3 +311,100 @@ def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): pixels_per_column=4, data_type=TensorMapDataType.FLOAT32, ) + + def test_from_im2col_4d(self, dev, skip_if_no_tma): + # NHWC layout: N=1, H=8, W=8, C=64 — 2 spatial dims + # Exercises spatial corner reversal with n_spatial=2: + # Python [H_lower, W_lower] -> driver [W_lower, H_lower] + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4, 4), + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_5d(self, dev, skip_if_no_tma): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 — 3 spatial dims + # Exercises the full spatial corner reversal: + # Python [D, H, W] -> driver [W, H, D] + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0, 0), + pixel_box_upper_corner=(2, 4, 4), + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + +class TestTensorMapIm2colWide: + """Test im2col-wide TMA descriptor creation (compute capability 10.0+).""" + + @pytest.fixture + def skip_if_no_im2col_wide(self, dev): + cc = dev.compute_capability + if cc.major < 10: + pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)") + + def test_from_im2col_wide_3d(self, dev, 
skip_if_no_im2col_wide): + # 3D tensor: batch=1, width=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_wide_4d(self, dev, skip_if_no_im2col_wide): + # NHWC layout: N=1, H=8, W=8, C=64 + # Wide mode only uses scalar W corners, even with higher rank + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 + # Max rank boundary — verifies all 5 dim/stride slots are filled + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + def test_from_im2col_wide_rank_validation(self, dev, skip_if_no_im2col_wide): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col-wide tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From 19c4a0f6ce25b30faffec5dbb0b78dca2496af8f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:15:56 -0800 Subject: [PATCH 03/13] clean up --- cuda_core/cuda/core/_tensor_map.pxd | 2 +- cuda_core/cuda/core/_tensor_map.pyx | 185 ++++++++++----------------- 
cuda_core/examples/tma_tensor_map.py | 2 +- cuda_core/tests/test_tensor_map.py | 2 +- 4 files changed, 72 insertions(+), 119 deletions(-) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index fa2254ba70..b2b165b5aa 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 6b072f7404..9edcccb928 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -150,6 +150,55 @@ def _resolve_data_type(view, data_type): return tma_dt +def _get_validated_view(tensor): + """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + # stream_ptr=-1: no stream synchronization needed because descriptor + # creation only reads tensor metadata, it does not move data. + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + if view.ptr % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{view.ptr:x}") + + return view + + +def _compute_byte_strides(shape, strides, elem_size): + """Compute byte strides from element strides or C-contiguous fallback. + + Returns a tuple of byte strides in row-major order. 
+ """ + if strides is not None: + return tuple(s * elem_size for s in strides) + + # C-contiguous: compute byte strides from shape, innermost first + rank = len(shape) + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + return tuple(byte_strides) + + +def _validate_element_strides(element_strides, rank): + """Validate or default element_strides to all-ones.""" + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + return element_strides + return (1,) * rank + + cdef class TensorMapDescriptor: """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. @@ -171,8 +220,8 @@ cdef class TensorMapDescriptor: cdef void* _get_data_ptr(self): return &self._tensor_map - @staticmethod - def from_tiled(tensor, box_dim, *, + @classmethod + def from_tiled(cls, tensor, box_dim, *, element_strides=None, data_type=None, interleave=TensorMapInterleave.NONE, @@ -216,40 +265,23 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [1, 5], the pointer is not 16-byte aligned, or dimension/stride constraints are violated. 
""" - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + cdef TensorMapDescriptor desc = cls.__new__(cls) - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") - - # Keep a strong reference to prevent GC + view = _get_validated_view(tensor) desc._source_ref = tensor - # Resolve data type tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - # Get tensor metadata cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides # in elements, can be None for C-contiguous cdef int rank = len(shape) if rank < 1 or rank > 5: raise ValueError( f"Tensor rank must be between 1 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") - if len(box_dim) != rank: raise ValueError( f"box_dim must have {rank} elements (same as tensor rank), " @@ -260,25 +292,10 @@ cdef class TensorMapDescriptor: raise ValueError( f"box_dim[{i}] must be in [1, 256], got {bd}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) - # Compute byte strides from element strides cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - # C-contiguous: strides in bytes, row-major - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - 
stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse dimensions for column-major cuTensorMap convention # Python/DLPack: row-major (dim 0 = outermost) @@ -335,8 +352,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner, + @classmethod + def from_im2col(cls, tensor, pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -390,17 +407,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. """ - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -409,18 +418,12 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - cdef int n_spatial = rank - 2 if len(pixel_box_lower_corner) != n_spatial: raise ValueError( @@ -431,23 +434,10 @@ cdef class TensorMapDescriptor: f"pixel_box_upper_corner must have {n_spatial} elements " f"(rank - 2), got {len(pixel_box_upper_corner)}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - 
f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -509,8 +499,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + @classmethod + def from_im2col_wide(cls, tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -565,17 +555,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. 
""" - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -584,35 +566,16 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -685,19 +648,9 @@ cdef class TensorMapDescriptor: or a :obj:`~cuda.core.StridedMemoryView`. Must refer to device-accessible memory with a 16-byte-aligned pointer. 
""" - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + view = _get_validated_view(tensor) cdef intptr_t global_address = view.ptr - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") with nogil: HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index de51570ec4..e93c02dc08 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index f57e73ded8..790307857b 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import pytest From 35a04b97367b5cf39a493d2ab6c325e70d970cb8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:55:00 -0800 Subject: [PATCH 04/13] Add comments to prepare_tensor_map_arg explaining allocation and lifetime Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index e88def13cd..b55a4abc7a 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -135,8 +135,13 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: + # Allocate a temporary buffer for the 128-byte CUtensorMap struct. + # We copy rather than pointing directly at arg._tensor_map for lifetime + # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, + # data_addresses[idx] is the pointer passed to cuLaunchKernel. 
data_addresses[idx] = ptr data[idx] = ptr return 0 From bb19e4f5c03cb65fd0d53a1cd1a58a8159acaee8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:04:26 -0800 Subject: [PATCH 05/13] Address Copilot review feedback - Remove unused _alloc_device_tensor helper from tests - Add test for rank > 5 (6D tensor) to verify upper bound validation - Add NULL check for PyMem_Malloc in prepare_tensor_map_arg Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 2 ++ cuda_core/tests/test_tensor_map.py | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index b55a4abc7a..5619ba7492 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -139,6 +139,8 @@ cdef inline int prepare_tensor_map_arg( # We copy rather than pointing directly at arg._tensor_map for lifetime # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + if ptr is NULL: + raise MemoryError("Failed to allocate memory for CUtensorMap") memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, # data_addresses[idx] is the pointer passed to cuLaunchKernel. 
diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 790307857b..96dd14a419 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -28,14 +28,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") -def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): - """Allocate a device buffer and return it with proper alignment.""" - n_elements = 1 - for s in shape: - n_elements *= s - buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) - return buf - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -179,6 +171,19 @@ def test_invalid_rank_zero(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) + def test_invalid_rank_six(self, dev, skip_if_no_tma): + shape = (2, 2, 2, 2, 2, 2) + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * 4) + arr = _DeviceArray(buf, shape) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + arr, + box_dim=(2,) * 6, + ) + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="box_dim must have 1 elements"): From 23a8900834a1b62a163664d3ab31fcbdf29bba02 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:20:05 -0800 Subject: [PATCH 06/13] Split TMA example into two focused files Move the replace_address() demonstration into its own self-contained example (tma_replace_address.py) so each file covers a single concept. 
Co-Authored-By: Claude Opus 4.6 --- cuda_core/examples/tma_replace_address.py | 189 ++++++++++++++++++++++ cuda_core/examples/tma_tensor_map.py | 20 --- 2 files changed, 189 insertions(+), 20 deletions(-) create mode 100644 cuda_core/examples/tma_replace_address.py diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py new file mode 100644 index 0000000000..7f11cf551c --- /dev/null +++ b/cuda_core/examples/tma_replace_address.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates how to use replace_address() to repoint a TMA +# (Tensor Memory Accelerator) descriptor at a different tensor without +# rebuilding the descriptor from scratch. +# +# The workflow is: +# +# 1. Create a TMA tiled descriptor and launch a kernel to verify it works +# 2. Allocate a second tensor with different content +# 3. Call replace_address() to repoint the same descriptor at the new tensor +# 4. Re-launch the kernel and verify it reads from the new tensor +# +# This is useful when the tensor layout (shape, dtype, tile size) stays the +# same but the underlying data buffer changes, e.g. double-buffering or +# iterating over a sequence of same-shaped tensors. 
+# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. 
+struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. 
+ asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data and verify the initial TMA copy +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 2) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch 
without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index e93c02dc08..63c0ac7b5c 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -13,7 +13,6 @@ # 1. Creating a TMA tiled descriptor from a CuPy device array # 2. Passing the descriptor to a kernel via launch() # 3. Using TMA to load tiles into shared memory (via inline PTX) -# 4. Updating the descriptor's source address with replace_address() # # Requirements: # - Hopper or later GPU (compute capability >= 9.0) @@ -174,22 +173,3 @@ assert cp.array_equal(output, a), "TMA copy produced incorrect results" print(f"TMA copy verified: {N} elements across {n_tiles} tiles") - -# --------------------------------------------------------------------------- -# 4) Demonstrate replace_address() -# Create a second tensor with different content, point the *same* -# descriptor at it, and re-launch without rebuilding the descriptor. 
-# --------------------------------------------------------------------------- -b = cp.full(N, fill_value=42.0, dtype=cp.float32) -dev.sync() - -tensor_map.replace_address(b) - -output2 = cp.zeros(N, dtype=cp.float32) -dev.sync() - -launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) -dev.sync() - -assert cp.array_equal(output2, b), "replace_address produced incorrect results" -print("replace_address verified: descriptor reused with new source tensor") From 0a1b7202b9bbaee509c8a6b44cc47edb9d51d03b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 14:14:38 -0800 Subject: [PATCH 07/13] pre-commit --- cuda_core/cuda/core/__init__.py | 1 + cuda_core/examples/tma_replace_address.py | 1 - cuda_core/examples/tma_tensor_map.py | 1 - cuda_core/tests/test_tensor_map.py | 37 ++++++++++++----------- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index d5f3693721..84e4f3d356 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,6 +60,7 @@ StridedMemoryView, args_viewable_as_strided_memory, ) +<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 7f11cf551c..21fb91c254 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -30,7 +30,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 63c0ac7b5c..c85c5d96bd 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -25,7 +25,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git 
a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 96dd14a419..1d6e8195b6 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,14 +1,12 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import pytest - import numpy as np - +import pytest from cuda.core import ( Device, - TensorMapDescriptor, TensorMapDataType, + TensorMapDescriptor, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -28,7 +26,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -36,6 +33,7 @@ class _DeviceArray: we need the tensor to report a proper shape/dtype so the TMA encoder sees the correct rank, dimensions, and strides. """ + def __init__(self, buf, shape, dtype=np.float32): self._buf = buf # prevent GC self.__cuda_array_interface__ = { @@ -225,25 +223,30 @@ def test_invalid_data_type(self, dev, skip_if_no_tma): class TestTensorMapDtypeMapping: """Test automatic dtype inference from numpy dtypes.""" - @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ - (np.uint8, TensorMapDataType.UINT8), - (np.uint16, TensorMapDataType.UINT16), - (np.uint32, TensorMapDataType.UINT32), - (np.int32, TensorMapDataType.INT32), - (np.uint64, TensorMapDataType.UINT64), - (np.int64, TensorMapDataType.INT64), - (np.float16, TensorMapDataType.FLOAT16), - (np.float32, TensorMapDataType.FLOAT32), - (np.float64, TensorMapDataType.FLOAT64), - ]) + @pytest.mark.parametrize( + "np_dtype,expected_tma_dt", + [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + 
(np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ], + ) def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt def test_bfloat16_mapping(self): try: - from ml_dtypes import bfloat16 from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + from ml_dtypes import bfloat16 + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 except ImportError: pytest.skip("ml_dtypes not installed") From 44fbdcf80f7c917bbd9ef790da0c7001cd2b747f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:07:54 -0800 Subject: [PATCH 08/13] adding stride meta data to gpu allocated memory --- cuda_core/tests/test_tensor_map.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 1d6e8195b6..3e030dd29d 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -98,8 +98,9 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, ) @@ -107,8 +108,9 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma): def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + tensor = _DeviceArray(buf, (16, 16, 16)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(8, 8, 8), data_type=TensorMapDataType.FLOAT32, ) @@ -128,8 +130,9 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = 
TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, swizzle=TensorMapSwizzle.SWIZZLE_128B, @@ -138,8 +141,9 @@ def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, l2_promotion=TensorMapL2Promotion.L2_128B, @@ -148,8 +152,9 @@ def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, @@ -162,9 +167,10 @@ class TestTensorMapDescriptorValidation: def test_invalid_rank_zero(self, dev, skip_if_no_tma): buf = dev.allocate(64) + tensor = _DeviceArray(buf, ()) # 0-dim tensor with pytest.raises(ValueError, match="rank must be between 1 and 5"): TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(), data_type=TensorMapDataType.FLOAT32, ) @@ -286,8 +292,9 @@ class TestTensorMapIm2col: def test_from_im2col_3d(self, dev, skip_if_no_tma): # 3D tensor: batch=1, height=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col( - buf, + tensor, pixel_box_lower_corner=(0,), pixel_box_upper_corner=(4,), channels_per_pixel=64, @@ -310,9 +317,10 @@ def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) # 3D: n_spatial = 1 with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): TensorMapDescriptor.from_im2col( - buf, + tensor, 
pixel_box_lower_corner=(0, 0), pixel_box_upper_corner=(4,), channels_per_pixel=64, From bdf39a24efcb528a2610967ea5f6459c0fee2472 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:59:31 -0800 Subject: [PATCH 09/13] im2col fixes --- cuda_core/tests/test_tensor_map.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 3e030dd29d..8b5e27ce6f 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -373,8 +373,9 @@ def skip_if_no_im2col_wide(self, dev): def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col_wide( - buf, + tensor, pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, From 96a3e84275f8d88e6476b1e71ef04bf6fedba6c8 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 3 Mar 2026 18:41:07 -0500 Subject: [PATCH 10/13] Reuse CCCL TMA descriptor construction for tiled TensorMap and keep validated views alive to avoid DLPack-backed pointer lifetime hazards. Add explicit tiled element-stride coverage and acknowledge the DLPack include-layout compatibility follow-up in NVIDIA/cccl#7871. 
Made-with: Cursor --- .gitignore | 1 + cuda_core/cuda/core/__init__.py | 1 - cuda_core/cuda/core/_cpp/tensor_map.cpp | 149 +++++++++ cuda_core/cuda/core/_cpp/tensor_map_cccl.h | 43 +++ cuda_core/cuda/core/_tensor_map.pxd | 1 + cuda_core/cuda/core/_tensor_map.pyx | 358 ++++++++++++++++----- cuda_core/examples/tma_replace_address.py | 1 + cuda_core/examples/tma_tensor_map.py | 1 + cuda_core/tests/test_tensor_map.py | 27 +- 9 files changed, 497 insertions(+), 85 deletions(-) create mode 100644 cuda_core/cuda/core/_cpp/tensor_map.cpp create mode 100644 cuda_core/cuda/core/_cpp/tensor_map_cccl.h diff --git a/.gitignore b/.gitignore index 7d9bcd38c3..18c919ca5e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ __pycache__/ !*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp +!cuda_core/cuda/core/_cpp/*.cpp cache_driver cache_runtime cache_nvrtc diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 84e4f3d356..d5f3693721 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,7 +60,6 @@ StridedMemoryView, args_viewable_as_strided_memory, ) -<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/cuda/core/_cpp/tensor_map.cpp b/cuda_core/cuda/core/_cpp/tensor_map.cpp new file mode 100644 index 0000000000..af09aa1b2f --- /dev/null +++ b/cuda_core/cuda/core/_cpp/tensor_map.cpp @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "tensor_map_cccl.h" + +#include + +#include +#include + +#if defined(__has_include) +# if __has_include() +# include +# define CUDA_CORE_HAS_CUDA_TMA 1 +# else +# define CUDA_CORE_HAS_CUDA_TMA 0 +# endif +# if __has_include() +# include +# define CUDA_CORE_HAS_DLPACK_H 1 +# else +# define CUDA_CORE_HAS_DLPACK_H 0 +# endif +#else +# define CUDA_CORE_HAS_CUDA_TMA 0 +# define CUDA_CORE_HAS_DLPACK_H 0 +#endif + +static inline void cuda_core_write_err(char* err, size_t cap, const char* msg) noexcept +{ + if (!err || cap == 0) + return; + if (!msg) + { + err[0] = '\0'; + return; + } + size_t n = ::strlen(msg); + if (n >= cap) + n = cap - 1; + ::memcpy(err, msg, n); + err[n] = '\0'; +} + +int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, + const int64_t* strides, + uint8_t dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, + const int* elem_strides, + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) noexcept +{ +#if !(CUDA_CORE_HAS_CUDA_TMA && CUDA_CORE_HAS_DLPACK_H) + (void)out_tensor_map; + (void)data; + (void)device_type; + (void)device_id; + (void)ndim; + (void)shape; + (void)strides; + (void)dtype_code; + (void)dtype_bits; + (void)dtype_lanes; + (void)box_sizes; + (void)elem_strides; + (void)interleave_layout; + (void)swizzle; + (void)l2_fetch_size; + (void)oob_fill; + cuda_core_write_err(err, err_cap, "CCCL and/or not available at build time"); + return 1; +#else + try + { + if (!out_tensor_map) + { + cuda_core_write_err(err, err_cap, "out_tensor_map is NULL"); + return 1; + } + if (!data) + { + cuda_core_write_err(err, err_cap, "tensor data pointer is NULL"); + return 1; + } + if (!shape || !box_sizes || ndim <= 0) + { + cuda_core_write_err(err, err_cap, "invalid rank/shape/box_sizes"); + return 1; + } + + DLTensor t{}; 
+ t.data = data; + t.device = {static_cast(device_type), device_id}; + t.ndim = ndim; + t.dtype.code = dtype_code; + t.dtype.bits = dtype_bits; + t.dtype.lanes = dtype_lanes; + // CCCL promises not to mutate the arrays, but DLPack uses non-const pointers. + t.shape = const_cast(shape); + t.strides = const_cast(strides); + t.byte_offset = 0; + + const auto layout = static_cast(interleave_layout); + const auto swz = static_cast(swizzle); + const auto l2 = static_cast(l2_fetch_size); + const auto oob = static_cast(oob_fill); + + auto box = cuda::std::span(box_sizes, static_cast(ndim)); + + CUtensorMap desc{}; + if (elem_strides) + { + auto es = cuda::std::span(elem_strides, static_cast(ndim)); + desc = cuda::make_tma_descriptor(t, box, es, layout, swz, l2, oob); + } + else + { + desc = cuda::make_tma_descriptor(t, box, layout, swz, l2, oob); + } + + ::memcpy(out_tensor_map, &desc, sizeof(CUtensorMap)); + cuda_core_write_err(err, err_cap, nullptr); + return 0; + } + catch (const std::exception& e) + { + cuda_core_write_err(err, err_cap, e.what()); + return 1; + } + catch (...) + { + cuda_core_write_err(err, err_cap, "unknown error while building TMA descriptor"); + return 1; + } +#endif +} diff --git a/cuda_core/cuda/core/_cpp/tensor_map_cccl.h b/cuda_core/cuda/core/_cpp/tensor_map_cccl.h new file mode 100644 index 0000000000..71be425182 --- /dev/null +++ b/cuda_core/cuda/core/_cpp/tensor_map_cccl.h @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef CUDA_CORE_TENSOR_MAP_CCCL_H_ +#define CUDA_CORE_TENSOR_MAP_CCCL_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Build a tiled CUtensorMap using CCCL's cuda::make_tma_descriptor (from ). +// +// Returns 0 on success; on failure returns non-zero and writes a best-effort +// human-readable message into (err, err_cap) if provided. 
+int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, // length ndim + const int64_t* strides, // length ndim, or NULL for contiguous + uint8_t dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, // length ndim + const int* elem_strides, // length ndim, or NULL for all-ones overload + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) noexcept; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // CUDA_CORE_TENSOR_MAP_CCCL_H_ diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index b2b165b5aa..07e8adebc9 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map cdef object _source_ref + cdef object _view_ref cdef object _repr_info cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9edcccb928..6054add9b6 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -2,9 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport intptr_t, uint32_t, uint64_t +from libc.stdint cimport intptr_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t +from libc.stddef cimport size_t from cuda.bindings cimport cydriver from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._dlpack cimport kDLInt, kDLUInt, kDLFloat, kDLBfloat, _kDLCUDA import enum @@ -12,6 +14,27 @@ import numpy from cuda.core._memoryview import StridedMemoryView +cdef extern from "_cpp/tensor_map_cccl.h": + int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, + const int64_t* strides, + uint8_t 
dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, + const int* elem_strides, + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) nogil + try: from ml_dtypes import bfloat16 as ml_bfloat16 @@ -80,14 +103,25 @@ class TensorMapOOBFill(enum.IntEnum): NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA -class TensorMapIm2ColWideMode(enum.IntEnum): - """Im2col wide mode for tensor map descriptors. +IF CUDA_CORE_BUILD_MAJOR >= 13: + class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. - These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. - Supported on compute capability 10.0+. - """ - W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W - W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. + Supported on compute capability 10.0+. + """ + W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W + W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 +ELSE: + class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. + + This enum is always defined for API stability, but the + :meth:`TensorMapDescriptor.from_im2col_wide` factory requires a CUDA 13+ + build and will raise otherwise. 
+ """ + W = 0 + W128 = 1 # Mapping from numpy dtype to TMA data type @@ -150,6 +184,65 @@ def _resolve_data_type(view, data_type): return tma_dt +cdef inline bint _tma_dtype_to_dlpack( + object tma_dt, + uint8_t* out_code, + uint8_t* out_bits, + uint16_t* out_lanes, +) noexcept: + if tma_dt == TensorMapDataType.UINT8: + out_code[0] = kDLUInt + out_bits[0] = 8 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT16: + out_code[0] = kDLUInt + out_bits[0] = 16 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT32: + out_code[0] = kDLUInt + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT64: + out_code[0] = kDLUInt + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.INT32: + out_code[0] = kDLInt + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.INT64: + out_code[0] = kDLInt + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT16: + out_code[0] = kDLFloat + out_bits[0] = 16 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT32: + out_code[0] = kDLFloat + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT64: + out_code[0] = kDLFloat + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.BFLOAT16: + out_code[0] = kDLBfloat + out_bits[0] = 16 + out_lanes[0] = 1 + return True + return False + + def _get_validated_view(tensor): """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" if isinstance(tensor, StridedMemoryView): @@ -268,7 +361,11 @@ cdef class TensorMapDescriptor: cdef TensorMapDescriptor desc = cls.__new__(cls) view = _get_validated_view(tensor) + # Keep both the original tensor object and the validated view alive. + # For DLPack exporters, the view may hold the owning capsule whose + # deleter can free the backing allocation when released. 
desc._source_ref = tensor + desc._view_ref = view tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -292,8 +389,92 @@ cdef class TensorMapDescriptor: raise ValueError( f"box_dim[{i}] must be in [1, 256], got {bd}") + cdef bint elem_strides_provided = element_strides is not None element_strides = _validate_element_strides(element_strides, rank) + # Reuse CCCL/libcu++'s DLPack -> CUtensorMap conversion when possible. + # This avoids maintaining a second, independent validation/encoding implementation. + cdef uint8_t dl_code + cdef uint8_t dl_bits + cdef uint16_t dl_lanes + cdef int64_t c_shape[5] + cdef int64_t c_strides[5] + cdef int c_box_sizes[5] + cdef int c_elem_strides[5] + cdef const int64_t* c_strides_ptr + cdef const int* c_elem_strides_ptr + cdef char errbuf[512] + cdef int i_cccl + cdef int device_type + cdef int c_device_id + cdef int c_cccl_interleave_int + cdef int c_cccl_swizzle_int + cdef int c_cccl_l2_promotion_int + cdef int c_cccl_oob_fill_int + cdef int rc + if _tma_dtype_to_dlpack(tma_dt, &dl_code, &dl_bits, &dl_lanes): + c_strides_ptr = NULL + c_elem_strides_ptr = NULL + errbuf[0] = 0 + + for i_cccl in range(rank): + c_shape[i_cccl] = shape[i_cccl] + c_box_sizes[i_cccl] = box_dim[i_cccl] + if elem_strides_provided: + c_elem_strides[i_cccl] = element_strides[i_cccl] + + if view.strides is not None: + for i_cccl in range(rank): + c_strides[i_cccl] = view.strides[i_cccl] + c_strides_ptr = &c_strides[0] + + if elem_strides_provided: + c_elem_strides_ptr = &c_elem_strides[0] + + device_type = _kDLCUDA + c_device_id = view.device_id + c_cccl_interleave_int = int(interleave) + c_cccl_swizzle_int = int(swizzle) + c_cccl_l2_promotion_int = int(l2_promotion) + c_cccl_oob_fill_int = int(oob_fill) + + with nogil: + rc = cuda_core_cccl_make_tma_descriptor_tiled( + &desc._tensor_map, + global_address, + device_type, + c_device_id, + rank, + &c_shape[0], + c_strides_ptr, + dl_code, + dl_bits, + dl_lanes, + &c_box_sizes[0], 
+ c_elem_strides_ptr, + c_cccl_interleave_int, + c_cccl_swizzle_int, + c_cccl_l2_promotion_int, + c_cccl_oob_fill_int, + &errbuf[0], + sizeof(errbuf), + ) + + if rc == 0: + desc._repr_info = { + "method": "tiled", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc + + msg = errbuf[:].split(b"\0", 1)[0].decode("utf-8", errors="replace") + # If CCCL isn't available at build time, fall back to the direct + # driver API path to preserve functionality on older toolchains. + if "not available at build time" not in msg: + raise ValueError(f"Failed to build TMA descriptor via CCCL: {msg}") + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] byte_strides = _compute_byte_strides(shape, view.strides, elem_size) @@ -411,6 +592,7 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor + desc._view_ref = view tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -447,6 +629,10 @@ cdef class TensorMapDescriptor: cdef int[3] c_pixel_box_upper cdef int i_c + for i_c in range(3): + c_pixel_box_lower[i_c] = 0 + c_pixel_box_upper[i_c] = 0 + for i_c in range(rank): c_global_dim[i_c] = shape[rank - 1 - i_c] c_element_strides[i_c] = element_strides[rank - 1 - i_c] @@ -555,85 +741,90 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. 
""" - cdef TensorMapDescriptor desc = cls.__new__(cls) - - view = _get_validated_view(tensor) - desc._source_ref = tensor - - tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) - cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - - cdef intptr_t global_address = view.ptr - shape = view.shape - - cdef int rank = len(shape) - if rank < 3 or rank > 5: - raise ValueError( - f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") - - element_strides = _validate_element_strides(element_strides, rank) - - cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - byte_strides = _compute_byte_strides(shape, view.strides, elem_size) - - # Reverse all dimension arrays for column-major convention - cdef uint64_t[5] c_global_dim - cdef uint64_t[4] c_global_strides - cdef uint32_t[5] c_element_strides - cdef int i_c - - for i_c in range(rank): - c_global_dim[i_c] = shape[rank - 1 - i_c] - c_element_strides[i_c] = element_strides[rank - 1 - i_c] + IF CUDA_CORE_BUILD_MAJOR < 13: + raise RuntimeError( + "TensorMapDescriptor.from_im2col_wide requires a CUDA 13+ build") + ELSE: + cdef TensorMapDescriptor desc = cls.__new__(cls) - for i_c in range(rank - 1): - c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + view = _get_validated_view(tensor) + desc._source_ref = tensor + desc._view_ref = view - cdef uint32_t c_rank = rank - cdef int c_lower_w = pixel_box_lower_corner_width - cdef int c_upper_w = pixel_box_upper_corner_width - cdef uint32_t c_channels = channels_per_pixel - cdef uint32_t c_pixels = pixels_per_column - cdef int c_interleave_int = int(interleave) - cdef int c_mode_int = int(mode) - cdef int c_swizzle_int = int(swizzle) - cdef int c_l2_promotion_int = int(l2_promotion) - cdef int c_oob_fill_int = int(oob_fill) - cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int - cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int - cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int - cdef 
cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int - cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - with nogil: - HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( - &desc._tensor_map, - c_data_type, - c_rank, - global_address, - c_global_dim, - c_global_strides, - c_lower_w, - c_upper_w, - c_channels, - c_pixels, - c_element_strides, - c_interleave, - c_mode, - c_swizzle, - c_l2_promotion, - c_oob_fill, - )) + cdef intptr_t global_address = view.ptr + shape = view.shape - desc._repr_info = { - "method": "im2col_wide", - "rank": rank, - "data_type": tma_dt, - "swizzle": swizzle, - } - - return desc + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") + + element_strides = _validate_element_strides(element_strides, rank) + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_lower_w = pixel_box_lower_corner_width + cdef int c_upper_w = pixel_box_upper_corner_width + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_mode_int = int(mode) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef 
cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_lower_w, + c_upper_w, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_mode, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + desc._repr_info = { + "method": "im2col_wide", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + + return desc def replace_address(self, tensor): """Replace the global memory address in this tensor map descriptor. @@ -662,6 +853,7 @@ cdef class TensorMapDescriptor: # so we don't drop the old tensor (risking a dangling pointer in the # CUtensorMap struct) if the call fails. 
self._source_ref = tensor + self._view_ref = view def __repr__(self): info = self._repr_info diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 21fb91c254..7f11cf551c 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -30,6 +30,7 @@ import cupy as cp import numpy as np + from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index c85c5d96bd..63c0ac7b5c 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -25,6 +25,7 @@ import cupy as cp import numpy as np + from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 8b5e27ce6f..4db456d863 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -3,6 +3,7 @@ import numpy as np import pytest + from cuda.core import ( Device, TensorMapDataType, @@ -128,6 +129,29 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_from_tiled_with_element_strides_buffer(self, dev, skip_if_no_tma): + # Use a Buffer input (DLPack path) and explicit element_strides. + buf = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + element_strides=(2,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_with_element_strides_cai(self, dev, skip_if_no_tma): + # Use a CAI-style tensor wrapper and explicit element_strides. 
+ buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + desc = TensorMapDescriptor.from_tiled( + tensor, + box_dim=(32, 32), + element_strides=(2, 1), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) @@ -250,9 +274,10 @@ def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): def test_bfloat16_mapping(self): try: - from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA from ml_dtypes import bfloat16 + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 except ImportError: pytest.skip("ml_dtypes not installed") From 1a6b41608658499486fa5705eaea93551425442b Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 08:00:36 -0500 Subject: [PATCH 11/13] Skip im2col-wide TensorMap tests when runtime support is unavailable. Probe support in the fixture and skip when cuda.core is built without CUDA 13 im2col-wide support or when the driver/GPU reports CUDA_ERROR_INVALID_VALUE, so unsupported RTXPRO6000 lanes don't block unrelated changes. 
Made-with: Cursor --- cuda_core/tests/test_tensor_map.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 4db456d863..2d6dc138c0 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -395,6 +395,31 @@ def skip_if_no_im2col_wide(self, dev): if cc.major < 10: pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)") + # Some environments in CI exercise this test module with a cuda.core + # build that does not include im2col-wide symbols (CUDA < 13 build), + # or with driver/GPU combinations that reject im2col-wide descriptor + # encoding for otherwise valid inputs. Probe once per test invocation + # and skip only for those known unsupported cases. + buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) + try: + TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + except RuntimeError as e: + if "requires a CUDA 13+ build" in str(e): + pytest.skip("Im2col-wide requires cuda.core built with CUDA 13+") + raise + except Exception as e: + if "CUDA_ERROR_INVALID_VALUE" in str(e): + pytest.skip("Im2col-wide unsupported on this driver/GPU combination") + raise + def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) From 892ee605e500b2e9225f7077bd66e61e1fc4eb97 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 09:41:46 -0500 Subject: [PATCH 12/13] Align TensorMap API surface with review feedback and enforce context safety. 
Expose only TensorMapDescriptor in cuda.core, add StridedMemoryView.as_tensor_map(), remove redundant tensor-map fallback packing, and track/check descriptor context/device compatibility before replacement and kernel launch argument packing. Made-with: Cursor --- .gitignore | 1 - cuda_core/cuda/core/__init__.py | 10 +---- cuda_core/cuda/core/_kernel_arg_handler.pyx | 4 +- cuda_core/cuda/core/_memoryview.pyx | 33 ++++++++++++++ cuda_core/cuda/core/_tensor_map.pxd | 4 ++ cuda_core/cuda/core/_tensor_map.pyx | 48 +++++++++++++++++++++ cuda_core/examples/tma_replace_address.py | 4 +- cuda_core/examples/tma_tensor_map.py | 11 ++--- cuda_core/tests/test_tensor_map.py | 15 ++++++- 9 files changed, 107 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 18c919ca5e..7d9bcd38c3 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,6 @@ __pycache__/ !*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp -!cuda_core/cuda/core/_cpp/*.cpp cache_driver cache_runtime cache_nvrtc diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index d5f3693721..c32ff79ecc 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -68,12 +68,4 @@ Stream, StreamOptions, ) -from cuda.core._tensor_map import ( - TensorMapDataType, - TensorMapDescriptor, - TensorMapIm2ColWideMode, - TensorMapInterleave, - TensorMapL2Promotion, - TensorMapOOBFill, - TensorMapSwizzle, -) +from cuda.core._tensor_map import TensorMapDescriptor diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 5619ba7492..28a981fed2 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -135,6 +135,7 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: + arg._check_context_compat() # Allocate a temporary buffer for 
the 128-byte CUtensorMap struct. # We copy rather than pointing directly at arg._tensor_map for lifetime # safety: ParamHolder owns and frees its argument buffers independently. @@ -350,9 +351,6 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue - elif isinstance(arg, tensor_map_descriptor_type): - prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) - continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 0e1df726c0..78002b3ff1 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -316,6 +316,39 @@ cdef class StridedMemoryView: view_buffer_strided(view, self.get_buffer(), layout, dtype, self.readonly) return view + def as_tensor_map( + self, + box_dim, + *, + element_strides=None, + data_type=None, + interleave=None, + swizzle=None, + l2_promotion=None, + oob_fill=None, + ): + """Create a tiled :obj:`TensorMapDescriptor` from this view. + + This is a convenience wrapper around + :meth:`cuda.core._tensor_map.TensorMapDescriptor.from_tiled`. 
+ """ + from cuda.core._tensor_map import TensorMapDescriptor + + kwargs = {} + if element_strides is not None: + kwargs["element_strides"] = element_strides + if data_type is not None: + kwargs["data_type"] = data_type + if interleave is not None: + kwargs["interleave"] = interleave + if swizzle is not None: + kwargs["swizzle"] = swizzle + if l2_promotion is not None: + kwargs["l2_promotion"] = l2_promotion + if oob_fill is not None: + kwargs["oob_fill"] = oob_fill + return TensorMapDescriptor.from_tiled(self, box_dim, **kwargs) + def copy_from( self, other : StridedMemoryView, stream : Stream, allocator = None, diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 07e8adebc9..4c60b7fc70 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -3,12 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from libc.stdint cimport intptr_t cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map + cdef int _device_id + cdef intptr_t _context cdef object _source_ref cdef object _view_ref cdef object _repr_info + cdef int _check_context_compat(self) except -1 cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 6054add9b6..9d83a709cc 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -263,6 +263,22 @@ def _get_validated_view(tensor): return view +cdef inline intptr_t _get_current_context_ptr() except? 
0:
+    cdef cydriver.CUcontext ctx
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    if ctx == NULL:
+        raise RuntimeError("TensorMapDescriptor requires an active CUDA context")
+    return <intptr_t>ctx
+
+
+cdef inline int _get_current_device_id() except -1:
+    cdef cydriver.CUdevice dev
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+    return dev
+
+
 def _compute_byte_strides(shape, strides, elem_size):
     """Compute byte strides from element strides or C-contiguous fallback.
 
@@ -313,6 +329,28 @@ cdef class TensorMapDescriptor:
     cdef void* _get_data_ptr(self):
         return &self._tensor_map
 
+    cdef int _check_context_compat(self) except -1:
+        cdef cydriver.CUcontext current_ctx
+        cdef cydriver.CUdevice current_dev
+        cdef int current_dev_id
+        if self._context == 0 and self._device_id < 0:
+            return 0
+        with nogil:
+            HANDLE_RETURN(cydriver.cuCtxGetCurrent(&current_ctx))
+        if current_ctx == NULL:
+            raise RuntimeError("TensorMapDescriptor requires an active CUDA context")
+        if self._context != 0 and current_ctx != self._context:
+            raise RuntimeError(
+                "TensorMapDescriptor was created in a different CUDA context")
+        with nogil:
+            HANDLE_RETURN(cydriver.cuCtxGetDevice(&current_dev))
+        current_dev_id = current_dev
+        if self._device_id >= 0 and current_dev_id != self._device_id:
+            raise RuntimeError(
+                f"TensorMapDescriptor belongs to device {self._device_id}, "
+                f"but current device is {current_dev_id}")
+        return 0
+
     @classmethod
     def from_tiled(cls, tensor, box_dim, *,
                    element_strides=None,
@@ -366,6 +404,8 @@ cdef class TensorMapDescriptor:
         # deleter can free the backing allocation when released.
desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -593,6 +633,8 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -750,6 +792,8 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -839,7 +883,11 @@ cdef class TensorMapDescriptor: or a :obj:`~cuda.core.StridedMemoryView`. Must refer to device-accessible memory with a 16-byte-aligned pointer. 
""" + self._check_context_compat() view = _get_validated_view(tensor) + if view.device_id != self._device_id: + raise ValueError( + f"replace_address expects tensor on device {self._device_id}, got {view.device_id}") cdef intptr_t global_address = view.ptr diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 7f11cf551c..a734301fd6 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -36,7 +36,7 @@ LaunchConfig, Program, ProgramOptions, - TensorMapDescriptor, + StridedMemoryView, launch, ) @@ -159,7 +159,7 @@ output = cp.zeros(N, dtype=cp.float32) dev.sync() # cupy uses its own stream -tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) +tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) n_tiles = N // TILE_SIZE config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 63c0ac7b5c..2a5ce9ad86 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -31,7 +31,7 @@ LaunchConfig, Program, ProgramOptions, - TensorMapDescriptor, + StridedMemoryView, launch, ) @@ -48,8 +48,6 @@ sys.exit(0) dev.set_current() -arch_str = "".join(f"{i}" for i in arch) - # --------------------------------------------------------------------------- # CUDA kernel that uses TMA to load a 1-D tile into shared memory, then # copies the tile to an output buffer so we can verify correctness. 
@@ -141,7 +139,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), ) mod = prog.compile("cubin") ker = mod.get_kernel("tma_copy") @@ -155,11 +153,10 @@ dev.sync() # cupy uses its own stream # --------------------------------------------------------------------------- -# 2) Create a TMA tiled descriptor -# from_tiled() accepts any DLPack / __cuda_array_interface__ object. +# 2) Create a TMA tiled descriptor from a StridedMemoryView. # The dtype (float32) is inferred automatically from the CuPy array. # --------------------------------------------------------------------------- -tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) +tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) # --------------------------------------------------------------------------- # 3) Launch the kernel diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 2d6dc138c0..dee1f1e2e1 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -6,8 +6,11 @@ from cuda.core import ( Device, - TensorMapDataType, + StridedMemoryView, TensorMapDescriptor, +) +from cuda.core._tensor_map import ( + TensorMapDataType, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -107,6 +110,16 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_strided_memory_view_as_tensor_map(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + desc = view.as_tensor_map( + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 tensor = _DeviceArray(buf, (16, 16, 16)) 
From 5a0e141ebf5b4d7a65818c20d4adc4648d37571f Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 09:54:32 -0500 Subject: [PATCH 13/13] Restore cu12 feature definitions in cuda_core pixi manifest. Bring back the cu12 feature blocks so pixi can parse the manifest and local test commands no longer fail early with a missing feature error. Made-with: Cursor --- cuda_core/pixi.toml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index e351fb63b6..9dc6ac1ed9 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -75,9 +75,16 @@ cuda = "13" [feature.cu13.dependencies] cuda-version = "13.1.*" -# NOTE: cu12 environment is intentionally omitted because the path dependency -# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing, -# use conda-forge packages or CI workflows. +[feature.cu12.system-requirements] +cuda = "12" + +[feature.cu12.dependencies] +cuda-version = "12.*" + +# We keep both cu12 and cu13 because cuda.core works with either major version +# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1) +# For cu12 testing, use conda-forge packages: temporarily change path to wildcard +# or skip cu12 locally: pixi run -e cu13 test [environments] default = { features = [ "cu13",