From e3e1899206b77ef700f66d4fe89cde23f715b63b Mon Sep 17 00:00:00 2001
From: Rob Parolin
Date: Mon, 23 Feb 2026 14:02:32 -0800
Subject: [PATCH 01/13] initial commit

---
 cuda_core/cuda/core/__init__.py             |  8 +
 cuda_core/cuda/core/_kernel_arg_handler.pyx | 24 +
 cuda_core/cuda/core/_tensor_map.pxd         | 12 +
 cuda_core/cuda/core/_tensor_map.pyx         | 524 ++++++++++++++++++++
 cuda_core/examples/tma_tensor_map.py        | 195 ++++++++
 cuda_core/tests/test_tensor_map.py          | 279 +++++++++++
 6 files changed, 1042 insertions(+)
 create mode 100644 cuda_core/cuda/core/_tensor_map.pxd
 create mode 100644 cuda_core/cuda/core/_tensor_map.pyx
 create mode 100644 cuda_core/examples/tma_tensor_map.py
 create mode 100644 cuda_core/tests/test_tensor_map.py

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index dfba887144..84e4f3d356 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -68,3 +68,11 @@
     Stream,
     StreamOptions,
 )
+from cuda.core._tensor_map import (
+    TensorMapDataType,
+    TensorMapDescriptor,
+    TensorMapInterleave,
+    TensorMapL2Promotion,
+    TensorMapOOBFill,
+    TensorMapSwizzle,
+)
diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx
index 882ca5eaab..e88def13cd 100644
--- a/cuda_core/cuda/core/_kernel_arg_handler.pyx
+++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx
@@ -6,6 +6,7 @@
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from libc.stdint cimport (intptr_t, int8_t, int16_t, int32_t, int64_t,
                           uint8_t, uint16_t, uint32_t, uint64_t,)
+from libc.string cimport memcpy
 from libcpp cimport bool as cpp_bool
 from libcpp.complex cimport complex as cpp_complex
 from libcpp cimport nullptr
@@ -16,6 +17,8 @@
 import ctypes

 import numpy

 from 
cuda.core._memory import Buffer +from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py +from cuda.core._tensor_map cimport TensorMapDescriptor from cuda.core._utils.cuda_utils import driver from cuda.bindings cimport cydriver @@ -97,6 +100,9 @@ cdef object numpy_complex64 = numpy.complex64 cdef object numpy_complex128 = numpy.complex128 +cdef object tensor_map_descriptor_type = _TensorMapDescriptor_py + + # limitation due to cython/cython#534 ctypedef void* voidptr @@ -124,6 +130,18 @@ cdef inline int prepare_arg( return 0 +cdef inline int prepare_tensor_map_arg( + vector.vector[void*]& data, + vector.vector[void*]& data_addresses, + TensorMapDescriptor arg, + const size_t idx) except -1: + cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + data_addresses[idx] = ptr + data[idx] = ptr + return 0 + + cdef inline int prepare_ctypes_arg( vector.vector[void*]& data, vector.vector[void*]& data_addresses, @@ -273,6 +291,9 @@ cdef class ParamHolder: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) continue + elif arg_type is tensor_map_descriptor_type: + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue elif arg_type is bool: prepare_arg[cpp_bool](self.data, self.data_addresses, arg, i) continue @@ -322,6 +343,9 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue + elif isinstance(arg, tensor_map_descriptor_type): + prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) + continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd new file mode 100644 index 0000000000..3cfd571d89 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -0,0 +1,12 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef class TensorMapDescriptor: + cdef cydriver.CUtensorMap _tensor_map + cdef object _source_ref + + cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx new file mode 100644 index 0000000000..9375acf944 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -0,0 +1,524 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t, uint32_t, uint64_t +from cuda.bindings cimport cydriver +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN + +import enum + +import numpy + +from cuda.core._memoryview import StridedMemoryView + + +try: + from ml_dtypes import bfloat16 as ml_bfloat16 +except ImportError: + ml_bfloat16 = None + + +class TensorMapDataType(enum.IntEnum): + """Data types for tensor map descriptors. + + These correspond to the ``CUtensorMapDataType`` driver enum values. + """ + UINT8 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8 + UINT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16 + UINT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32 + INT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32 + UINT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64 + INT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64 + FLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16 + FLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32 + FLOAT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64 + BFLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 + FLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ + TFLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32 + TFLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ + + +class TensorMapInterleave(enum.IntEnum): + """Interleave layout for tensor map descriptors. 
+ + These correspond to the ``CUtensorMapInterleave`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_INTERLEAVE_NONE + INTERLEAVE_16B = cydriver.CU_TENSOR_MAP_INTERLEAVE_16B + INTERLEAVE_32B = cydriver.CU_TENSOR_MAP_INTERLEAVE_32B + + +class TensorMapSwizzle(enum.IntEnum): + """Swizzle mode for tensor map descriptors. + + These correspond to the ``CUtensorMapSwizzle`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_SWIZZLE_NONE + SWIZZLE_32B = cydriver.CU_TENSOR_MAP_SWIZZLE_32B + SWIZZLE_64B = cydriver.CU_TENSOR_MAP_SWIZZLE_64B + SWIZZLE_128B = cydriver.CU_TENSOR_MAP_SWIZZLE_128B + + +class TensorMapL2Promotion(enum.IntEnum): + """L2 promotion mode for tensor map descriptors. + + These correspond to the ``CUtensorMapL2promotion`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_L2_PROMOTION_NONE + L2_64B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_64B + L2_128B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_128B + L2_256B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_256B + + +class TensorMapOOBFill(enum.IntEnum): + """Out-of-bounds fill mode for tensor map descriptors. + + These correspond to the ``CUtensorMapFloatOOBfill`` driver enum values. 
+ """ + NONE = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE + NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA + + +# Mapping from numpy dtype to TMA data type +_NUMPY_DTYPE_TO_TMA = { + numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, + numpy.dtype(numpy.uint16): TensorMapDataType.UINT16, + numpy.dtype(numpy.uint32): TensorMapDataType.UINT32, + numpy.dtype(numpy.int32): TensorMapDataType.INT32, + numpy.dtype(numpy.uint64): TensorMapDataType.UINT64, + numpy.dtype(numpy.int64): TensorMapDataType.INT64, + numpy.dtype(numpy.float16): TensorMapDataType.FLOAT16, + numpy.dtype(numpy.float32): TensorMapDataType.FLOAT32, + numpy.dtype(numpy.float64): TensorMapDataType.FLOAT64, +} + +if ml_bfloat16 is not None: + _NUMPY_DTYPE_TO_TMA[numpy.dtype(ml_bfloat16)] = TensorMapDataType.BFLOAT16 + + +# Mapping from TMA data type to element size in bytes +_TMA_DATA_TYPE_SIZE = { + TensorMapDataType.UINT8: 1, + TensorMapDataType.UINT16: 2, + TensorMapDataType.UINT32: 4, + TensorMapDataType.INT32: 4, + TensorMapDataType.UINT64: 8, + TensorMapDataType.INT64: 8, + TensorMapDataType.FLOAT16: 2, + TensorMapDataType.FLOAT32: 4, + TensorMapDataType.FLOAT64: 8, + TensorMapDataType.BFLOAT16: 2, + TensorMapDataType.FLOAT32_FTZ: 4, + TensorMapDataType.TFLOAT32: 4, + TensorMapDataType.TFLOAT32_FTZ: 4, +} + + +def _resolve_data_type(view, data_type): + """Resolve the TMA data type from an explicit value or the view's dtype.""" + + if data_type is not None: + if not isinstance(data_type, TensorMapDataType): + raise TypeError( + f"data_type must be a TensorMapDataType, got {type(data_type)}") + return data_type + + dt = view.dtype + if dt is None: + raise ValueError( + "Cannot infer TMA data type from the tensor; " + "please specify data_type explicitly") + + tma_dt = _NUMPY_DTYPE_TO_TMA.get(dt) + if tma_dt is None: + raise ValueError( + f"Unsupported dtype {dt} for TMA; " + f"supported dtypes: {list(_NUMPY_DTYPE_TO_TMA.keys())}. 
" + "You may also specify data_type explicitly.") + + return tma_dt + + +cdef class TensorMapDescriptor: + """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. + + A ``TensorMapDescriptor`` wraps the opaque 128-byte ``CUtensorMap`` struct + used by the hardware TMA unit for efficient bulk data movement between + global and shared memory. + + Instances are created via the class methods :meth:`from_tiled` and + :meth:`from_im2col`, and can be passed directly to + :func:`~cuda.core.launch` as a kernel argument. + """ + + def __init__(self): + raise RuntimeError( + "TensorMapDescriptor cannot be instantiated directly. " + "Use TensorMapDescriptor.from_tiled() or " + "TensorMapDescriptor.from_im2col().") + + cdef void* _get_data_ptr(self): + return &self._tensor_map + + @staticmethod + def from_tiled(tensor, box_dim, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create a tiled TMA descriptor from a tensor object. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + box_dim : tuple of int + The size of each tile dimension (in elements). Must have the + same rank as the tensor and each value must be in [1, 256]. + Specified in the same (row-major) order as the tensor shape. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + Specified in the same (row-major) order as the tensor shape. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. 
+ l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [1, 5], the pointer is not + 16-byte aligned, or dimension/stride constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + # Keep a strong reference to prevent GC + desc._source_ref = tensor + + # Resolve data type + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + # Get tensor metadata + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides # in elements, can be None for C-contiguous + + cdef int rank = len(shape) + if rank < 1 or rank > 5: + raise ValueError( + f"Tensor rank must be between 1 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") + + if len(box_dim) != rank: + raise ValueError( + f"box_dim must have {rank} elements (same as tensor rank), " + f"got {len(box_dim)}") + + for i, bd in enumerate(box_dim): + if bd < 1 or bd > 256: + raise ValueError( + f"box_dim[{i}] must be in [1, 256], got {bd}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + # Compute byte strides from element strides + cdef int elem_size = 
_TMA_DATA_TYPE_SIZE[tma_dt]
+        if strides is not None:
+            byte_strides = tuple(s * elem_size for s in strides)
+        else:
+            # C-contiguous: strides in bytes, row-major
+            byte_strides = []
+            stride = elem_size
+            for i in range(rank - 1, -1, -1):
+                byte_strides.append(stride)
+                stride *= shape[i]
+            byte_strides.reverse()
+
+        # Reverse dimensions for column-major cuTensorMap convention
+        # Python/DLPack: row-major (dim 0 = outermost)
+        # cuTensorMap: column-major (dim 0 = innermost)
+        cdef uint64_t[5] c_global_dim
+        cdef uint64_t[4] c_global_strides  # rank - 1 elements
+        cdef uint32_t[5] c_box_dim
+        cdef uint32_t[5] c_element_strides
+        cdef int i_c
+
+        for i_c in range(rank):
+            # Reverse: Python dim i -> cuTensorMap dim (rank - 1 - i)
+            c_global_dim[i_c] = shape[rank - 1 - i_c]
+            c_box_dim[i_c] = box_dim[rank - 1 - i_c]
+            c_element_strides[i_c] = element_strides[rank - 1 - i_c]
+
+        # globalStrides: rank-1 elements (byte strides for dims 1..N-1 in col-major order)
+        # The innermost stride (dim 0) is implicit = element size
+        for i_c in range(rank - 1):
+            c_global_strides[i_c] = byte_strides[rank - 2 - i_c]
+
+        cdef uint32_t c_rank = rank
+        cdef int c_interleave_int = int(interleave)
+        cdef int c_swizzle_int = int(swizzle)
+        cdef int c_l2_promotion_int = int(l2_promotion)
+        cdef int c_oob_fill_int = int(oob_fill)
+        cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int
+        cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int
+        cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int
+        cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapEncodeTiled(
+                &desc._tensor_map,
+                c_data_type,
+                c_rank,
+                <void*>global_address,  # driver expects void*; intptr_t needs an explicit cast
+                c_global_dim,
+                c_global_strides,
+                c_box_dim,
+                c_element_strides,
+                c_interleave,
+                c_swizzle,
+                c_l2_promotion,
+                c_oob_fill,
+            ))
+
+        return desc
+
+    @staticmethod
+    def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner,
+                    channels_per_pixel, 
pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + swizzle=TensorMapSwizzle.NONE, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col TMA descriptor from a tensor object. + + Im2col layout is used for convolution-style data access patterns. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner : tuple of int + Lower corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + pixel_box_upper_corner : tuple of int + Upper corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. 
+ """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + cdef int n_spatial = rank - 2 + if len(pixel_box_lower_corner) != n_spatial: + raise ValueError( + f"pixel_box_lower_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_lower_corner)}") + if len(pixel_box_upper_corner) != n_spatial: + raise ValueError( + f"pixel_box_upper_corner must have {n_spatial} elements " + f"(rank - 2), got {len(pixel_box_upper_corner)}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + 
cdef int[3] c_pixel_box_lower  # max 3 spatial dims (rank 5 - 2)
+        cdef int[3] c_pixel_box_upper
+        cdef int i_c
+
+        for i_c in range(rank):
+            c_global_dim[i_c] = shape[rank - 1 - i_c]
+            c_element_strides[i_c] = element_strides[rank - 1 - i_c]
+
+        for i_c in range(rank - 1):
+            c_global_strides[i_c] = byte_strides[rank - 2 - i_c]
+
+        # Reverse spatial dimensions for lower/upper corners
+        for i_c in range(n_spatial):
+            c_pixel_box_lower[i_c] = pixel_box_lower_corner[n_spatial - 1 - i_c]
+            c_pixel_box_upper[i_c] = pixel_box_upper_corner[n_spatial - 1 - i_c]
+
+        cdef uint32_t c_rank = rank
+        cdef uint32_t c_channels = channels_per_pixel
+        cdef uint32_t c_pixels = pixels_per_column
+        cdef int c_interleave_int = int(interleave)
+        cdef int c_swizzle_int = int(swizzle)
+        cdef int c_l2_promotion_int = int(l2_promotion)
+        cdef int c_oob_fill_int = int(oob_fill)
+        cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int
+        cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int
+        cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int
+        cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2col(
+                &desc._tensor_map,
+                c_data_type,
+                c_rank,
+                <void*>global_address,  # driver expects void*; intptr_t needs an explicit cast
+                c_global_dim,
+                c_global_strides,
+                c_pixel_box_lower,
+                c_pixel_box_upper,
+                c_channels,
+                c_pixels,
+                c_element_strides,
+                c_interleave,
+                c_swizzle,
+                c_l2_promotion,
+                c_oob_fill,
+            ))
+
+        return desc
+
+    def replace_address(self, tensor):
+        """Replace the global memory address in this tensor map descriptor.
+
+        This is useful when the tensor data has been reallocated but the
+        shape, strides, and other parameters remain the same.
+
+        Parameters
+        ----------
+        tensor : object
+            Any object supporting DLPack or ``__cuda_array_interface__``,
+            or a :obj:`~cuda.core.StridedMemoryView`. Must refer to
+            device-accessible memory with a 16-byte-aligned pointer.
+        """
+        if isinstance(tensor, StridedMemoryView):
+            view = tensor
+        else:
+            view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1)
+
+        if not view.is_device_accessible:
+            raise ValueError("The tensor must be device-accessible")
+
+        cdef intptr_t global_address = view.ptr
+        if global_address % 16 != 0:
+            raise ValueError(
+                f"Global memory address must be 16-byte aligned, "
+                f"got address 0x{global_address:x}")
+
+        self._source_ref = tensor
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress(
+                &self._tensor_map,
+                <void*>global_address,  # driver expects void*; intptr_t needs an explicit cast
+            ))
+
+    def __repr__(self):
+        return f"TensorMapDescriptor()"
diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py
new file mode 100644
index 0000000000..de51570ec4
--- /dev/null
+++ b/cuda_core/examples/tma_tensor_map.py
@@ -0,0 +1,195 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates how to use TMA (Tensor Memory Accelerator) descriptors
+# with cuda.core on Hopper+ GPUs (compute capability >= 9.0).
+#
+# TMA enables efficient bulk data movement between global and shared memory using
+# hardware-managed tensor map descriptors. This example shows:
+#
+# 1. Creating a TMA tiled descriptor from a CuPy device array
+# 2. Passing the descriptor to a kernel via launch()
+# 3. Using TMA to load tiles into shared memory (via inline PTX)
+# 4. 
Updating the descriptor's source address with replace_address() +# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. 
+struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. 
+ asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data on the device +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +# --------------------------------------------------------------------------- +# 2) Create a TMA tiled descriptor +# from_tiled() accepts any DLPack / __cuda_array_interface__ object. +# The dtype (float32) is inferred automatically from the CuPy array. 
+# --------------------------------------------------------------------------- +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +# --------------------------------------------------------------------------- +# 3) Launch the kernel +# The TensorMapDescriptor is passed directly as a kernel argument — the +# 128-byte struct is copied into kernel parameter space automatically. +# --------------------------------------------------------------------------- +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 4) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py new file mode 100644 index 0000000000..0aa5968a85 --- /dev/null +++ b/cuda_core/tests/test_tensor_map.py @@ -0,0 +1,279 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import numpy as np + +from cuda.core import ( + Device, + TensorMapDescriptor, + TensorMapDataType, + TensorMapInterleave, + TensorMapL2Promotion, + TensorMapOOBFill, + TensorMapSwizzle, +) + + +@pytest.fixture +def dev(init_cuda): + return Device() + + +@pytest.fixture +def skip_if_no_tma(dev): + if not dev.properties.tensor_map_access_supported: + pytest.skip("Device does not support TMA (requires compute capability 9.0+)") + + +def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): + """Allocate a device buffer and return it with proper alignment.""" + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) + return buf + + +class TestTensorMapEnums: + """Test that enum wrappers expose the expected values.""" + + def test_data_type_values(self): + assert TensorMapDataType.UINT8 == 0 + assert TensorMapDataType.FLOAT32 == 7 + assert TensorMapDataType.FLOAT64 == 8 + assert TensorMapDataType.BFLOAT16 == 9 + + def test_interleave_values(self): + assert TensorMapInterleave.NONE == 0 + assert TensorMapInterleave.INTERLEAVE_16B == 1 + assert TensorMapInterleave.INTERLEAVE_32B == 2 + + def test_swizzle_values(self): + assert TensorMapSwizzle.NONE == 0 + assert TensorMapSwizzle.SWIZZLE_32B == 1 + assert TensorMapSwizzle.SWIZZLE_64B == 2 + assert TensorMapSwizzle.SWIZZLE_128B == 3 + + def test_l2_promotion_values(self): + assert TensorMapL2Promotion.NONE == 0 + assert TensorMapL2Promotion.L2_64B == 1 + assert TensorMapL2Promotion.L2_128B == 2 + assert TensorMapL2Promotion.L2_256B == 3 + + def test_oob_fill_values(self): + assert TensorMapOOBFill.NONE == 0 + assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + + +class TestTensorMapDescriptorCreation: + """Test TensorMapDescriptor factory methods.""" + + def test_cannot_instantiate_directly(self): + with pytest.raises(RuntimeError, match="cannot be instantiated directly"): + 
TensorMapDescriptor() + + def test_from_tiled_1d(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) # 1024 float32 elements + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + assert repr(desc) == "TensorMapDescriptor()" + + def test_from_tiled_2d(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_3d(self, dev, skip_if_no_tma): + buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(8, 8, 8), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + ) + assert desc is not None + + def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + l2_promotion=TensorMapL2Promotion.L2_128B, + ) + assert desc is not None + + def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, + ) + assert desc is not None + + +class TestTensorMapDescriptorValidation: + """Test validation in TensorMapDescriptor factory methods.""" + + def test_invalid_rank_zero(self, dev, skip_if_no_tma): + buf = dev.allocate(64) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(), + 
data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="box_dim must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_box_dim_out_of_range(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match=r"box_dim\[0\] must be in \[1, 256\]"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(512,), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_element_strides_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="element_strides must have 1 elements"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + element_strides=(1, 1), + data_type=TensorMapDataType.FLOAT32, + ) + + def test_invalid_data_type(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(TypeError, match="data_type must be a TensorMapDataType"): + TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + data_type=42, + ) + + +class TestTensorMapDtypeMapping: + """Test automatic dtype inference from numpy dtypes.""" + + @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + (np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ]) + def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt + + def test_bfloat16_mapping(self): + try: + from ml_dtypes import bfloat16 + from cuda.core._tensor_map import 
_NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 + except ImportError: + pytest.skip("ml_dtypes not installed") + + +class TestTensorMapReplaceAddress: + """Test replace_address functionality.""" + + def test_replace_address(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + + buf2 = dev.allocate(1024 * 4) + desc.replace_address(buf2) + # No exception means success + + def test_replace_address_requires_device_accessible(self, dev, skip_if_no_tma): + buf1 = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf1, + box_dim=(64,), + data_type=TensorMapDataType.FLOAT32, + ) + # Create a host-only array (not device-accessible) + host_arr = np.zeros(1024, dtype=np.float32) + with pytest.raises(ValueError, match="device-accessible"): + desc.replace_address(host_arr) + + +class TestTensorMapIm2col: + """Test im2col TMA descriptor creation.""" + + def test_from_im2col_3d(self, dev, skip_if_no_tma): + # 3D tensor: batch=1, height=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0,), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(), + pixel_box_upper_corner=(), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + + def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): + buf = dev.allocate(1 * 32 * 64 * 4) + with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): + 
TensorMapDescriptor.from_im2col( + buf, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4,), + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From 77a3c8ecab4734eacbf7f36af964b103ca2ecf69 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 13:53:49 -0800 Subject: [PATCH 02/13] tma wide --- cuda_core/cuda/core/__init__.py | 1 - cuda_core/cuda/core/_tensor_map.pxd | 1 + cuda_core/cuda/core/_tensor_map.pyx | 208 +++++++++++++++++++++++++++- cuda_core/pixi.toml | 13 +- cuda_core/tests/test_tensor_map.py | 133 +++++++++++++++++- 5 files changed, 341 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 84e4f3d356..d5f3693721 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,7 +60,6 @@ StridedMemoryView, args_viewable_as_strided_memory, ) -<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 3cfd571d89..fa2254ba70 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -8,5 +8,6 @@ from cuda.bindings cimport cydriver cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map cdef object _source_ref + cdef object _repr_info cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9375acf944..6b072f7404 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -80,6 +80,16 @@ class TensorMapOOBFill(enum.IntEnum): NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA +class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. + + These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. 
+ Supported on compute capability 10.0+. + """ + W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W + W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + + # Mapping from numpy dtype to TMA data type _NUMPY_DTYPE_TO_TMA = { numpy.dtype(numpy.uint8): TensorMapDataType.UINT8, @@ -316,6 +326,13 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "tiled", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc @staticmethod @@ -483,6 +500,176 @@ cdef class TensorMapDescriptor: c_oob_fill, )) + desc._repr_info = { + "method": "im2col", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + + return desc + + @staticmethod + def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + channels_per_pixel, pixels_per_column, *, + element_strides=None, + data_type=None, + interleave=TensorMapInterleave.NONE, + mode=TensorMapIm2ColWideMode.W, + swizzle=TensorMapSwizzle.SWIZZLE_128B, + l2_promotion=TensorMapL2Promotion.NONE, + oob_fill=TensorMapOOBFill.NONE): + """Create an im2col-wide TMA descriptor from a tensor object. + + Im2col-wide layout loads elements exclusively along the W (width) + dimension. This variant is supported on compute capability 10.0+ + (Blackwell and later). + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + pixel_box_lower_corner_width : int + Lower corner of the pixel bounding box along the W dimension. + pixel_box_upper_corner_width : int + Upper corner of the pixel bounding box along the W dimension. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. 
+ data_type : TensorMapDataType, optional + Explicit data type override. If ``None``, inferred from the + tensor's dtype. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + mode : TensorMapIm2ColWideMode + Im2col wide mode. Default ``W``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``SWIZZLE_128B``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. + """ + cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + + # Obtain a StridedMemoryView from the tensor + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + desc._source_ref = tensor + + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int + + cdef intptr_t global_address = view.ptr + shape = view.shape + strides = view.strides + + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") + + if global_address % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{global_address:x}") + + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + else: + element_strides = (1,) * rank + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + if strides is not None: + byte_strides = tuple(s * elem_size for s in strides) + else: + byte_strides = [] + stride = 
elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_lower_w = pixel_box_lower_corner_width + cdef int c_upper_w = pixel_box_upper_corner_width + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_mode_int = int(mode) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_lower_w, + c_upper_w, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_mode, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + desc._repr_info = { + "method": "im2col_wide", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc def replace_address(self, tensor): @@ -512,13 +699,28 @@ cdef class TensorMapDescriptor: f"Global memory address must be 16-byte aligned, " f"got address 0x{global_address:x}") - self._source_ref = tensor - with nogil: 
HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( &self._tensor_map, global_address, )) + # Update the source reference only after the driver call succeeds, + # so we don't drop the old tensor (risking a dangling pointer in the + # CUtensorMap struct) if the call fails. + self._source_ref = tensor + def __repr__(self): - return f"TensorMapDescriptor()" + info = self._repr_info + if info is None: + return "TensorMapDescriptor()" + parts = [] + if "method" in info: + parts.append(info["method"]) + if "rank" in info: + parts.append(f"rank={info['rank']}") + if "data_type" in info: + parts.append(f"dtype={info['data_type'].name}") + if "swizzle" in info: + parts.append(f"swizzle={info['swizzle'].name}") + return f"TensorMapDescriptor({', '.join(parts)})" diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index 9dc6ac1ed9..e351fb63b6 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -75,16 +75,9 @@ cuda = "13" [feature.cu13.dependencies] cuda-version = "13.1.*" -[feature.cu12.system-requirements] -cuda = "12" - -[feature.cu12.dependencies] -cuda-version = "12.*" - -# We keep both cu12 and cu13 because cuda.core works with either major version -# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1) -# For cu12 testing, use conda-forge packages: temporarily change path to wildcard -# or skip cu12 locally: pixi run -e cu13 test +# NOTE: cu12 environment is intentionally omitted because the path dependency +# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing, +# use conda-forge packages or CI workflows. 
[environments] default = { features = [ "cu13", diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 0aa5968a85..f57e73ded8 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -9,6 +9,7 @@ Device, TensorMapDescriptor, TensorMapDataType, + TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, TensorMapOOBFill, @@ -36,6 +37,23 @@ def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): return buf +class _DeviceArray: + """Wrap a Buffer with explicit shape via __cuda_array_interface__. + + dev.allocate() returns a 1D byte buffer. For multi-dimensional TMA tests + we need the tensor to report a proper shape/dtype so the TMA encoder sees + the correct rank, dimensions, and strides. + """ + def __init__(self, buf, shape, dtype=np.float32): + self._buf = buf # prevent GC + self.__cuda_array_interface__ = { + "shape": tuple(shape), + "typestr": np.dtype(dtype).str, + "data": (int(buf.handle), False), + "version": 3, + } + + class TestTensorMapEnums: """Test that enum wrappers expose the expected values.""" @@ -66,6 +84,10 @@ def test_oob_fill_values(self): assert TensorMapOOBFill.NONE == 0 assert TensorMapOOBFill.NAN_REQUEST_ZERO_FMA == 1 + def test_im2col_wide_mode_values(self): + assert TensorMapIm2ColWideMode.W == 0 + assert TensorMapIm2ColWideMode.W128 == 1 + class TestTensorMapDescriptorCreation: """Test TensorMapDescriptor factory methods.""" @@ -82,7 +104,7 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) assert desc is not None - assert repr(desc) == "TensorMapDescriptor()" + assert repr(desc) == "TensorMapDescriptor(tiled, rank=1, dtype=FLOAT32, swizzle=NONE)" def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 @@ -102,6 +124,18 @@ def test_from_tiled_3d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_from_tiled_5d(self, dev, skip_if_no_tma): + # 5D: 
exercises all 5 c_global_dim / 4 c_global_strides slots + shape = (2, 4, 4, 4, 8) + n_bytes = 2 * 4 * 4 * 4 * 8 * 4 # float32 + buf = dev.allocate(n_bytes) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_tiled( + tensor, + box_dim=(1, 2, 2, 2, 8), + ) + assert desc is not None + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) desc = TensorMapDescriptor.from_tiled( @@ -277,3 +311,100 @@ def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): pixels_per_column=4, data_type=TensorMapDataType.FLOAT32, ) + + def test_from_im2col_4d(self, dev, skip_if_no_tma): + # NHWC layout: N=1, H=8, W=8, C=64 — 2 spatial dims + # Exercises spatial corner reversal with n_spatial=2: + # Python [H_lower, W_lower] -> driver [W_lower, H_lower] + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0), + pixel_box_upper_corner=(4, 4), + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_5d(self, dev, skip_if_no_tma): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 — 3 spatial dims + # Exercises the full spatial corner reversal: + # Python [D, H, W] -> driver [W, H, D] + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col( + tensor, + pixel_box_lower_corner=(0, 0, 0), + pixel_box_upper_corner=(2, 4, 4), + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + +class TestTensorMapIm2colWide: + """Test im2col-wide TMA descriptor creation (compute capability 10.0+).""" + + @pytest.fixture + def skip_if_no_im2col_wide(self, dev): + cc = dev.compute_capability + if cc.major < 10: + pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)") + + def test_from_im2col_wide_3d(self, dev, 
skip_if_no_im2col_wide): + # 3D tensor: batch=1, width=32, channels=64 + buf = dev.allocate(1 * 32 * 64 * 4) + desc = TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_im2col_wide_4d(self, dev, skip_if_no_im2col_wide): + # NHWC layout: N=1, H=8, W=8, C=64 + # Wide mode only uses scalar W corners, even with higher rank + shape = (1, 8, 8, 64) + buf = dev.allocate(1 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=16, + ) + assert desc is not None + + def test_from_im2col_wide_5d(self, dev, skip_if_no_im2col_wide): + # NDHWC layout: N=1, D=4, H=8, W=8, C=64 + # Max rank boundary — verifies all 5 dim/stride slots are filled + shape = (1, 4, 8, 8, 64) + buf = dev.allocate(1 * 4 * 8 * 8 * 64 * 4) + tensor = _DeviceArray(buf, shape) + desc = TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=32, + ) + assert desc is not None + + def test_from_im2col_wide_rank_validation(self, dev, skip_if_no_im2col_wide): + buf = dev.allocate(1024 * 4) + with pytest.raises(ValueError, match="Im2col-wide tensor rank must be between 3 and 5"): + TensorMapDescriptor.from_im2col_wide( + buf, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) From 19c4a0f6ce25b30faffec5dbb0b78dca2496af8f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:15:56 -0800 Subject: [PATCH 03/13] clean up --- cuda_core/cuda/core/_tensor_map.pxd | 2 +- cuda_core/cuda/core/_tensor_map.pyx | 185 ++++++++++----------------- 
cuda_core/examples/tma_tensor_map.py | 2 +- cuda_core/tests/test_tensor_map.py | 2 +- 4 files changed, 72 insertions(+), 119 deletions(-) diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index fa2254ba70..b2b165b5aa 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 6b072f7404..9edcccb928 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -150,6 +150,55 @@ def _resolve_data_type(view, data_type): return tma_dt +def _get_validated_view(tensor): + """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" + if isinstance(tensor, StridedMemoryView): + view = tensor + else: + # stream_ptr=-1: no stream synchronization needed because descriptor + # creation only reads tensor metadata, it does not move data. + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + + if not view.is_device_accessible: + raise ValueError("The tensor must be device-accessible") + + if view.ptr % 16 != 0: + raise ValueError( + f"Global memory address must be 16-byte aligned, " + f"got address 0x{view.ptr:x}") + + return view + + +def _compute_byte_strides(shape, strides, elem_size): + """Compute byte strides from element strides or C-contiguous fallback. + + Returns a tuple of byte strides in row-major order. 
+ """ + if strides is not None: + return tuple(s * elem_size for s in strides) + + # C-contiguous: compute byte strides from shape, innermost first + rank = len(shape) + byte_strides = [] + stride = elem_size + for i in range(rank - 1, -1, -1): + byte_strides.append(stride) + stride *= shape[i] + byte_strides.reverse() + return tuple(byte_strides) + + +def _validate_element_strides(element_strides, rank): + """Validate or default element_strides to all-ones.""" + if element_strides is not None: + if len(element_strides) != rank: + raise ValueError( + f"element_strides must have {rank} elements, got {len(element_strides)}") + return element_strides + return (1,) * rank + + cdef class TensorMapDescriptor: """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. @@ -171,8 +220,8 @@ cdef class TensorMapDescriptor: cdef void* _get_data_ptr(self): return &self._tensor_map - @staticmethod - def from_tiled(tensor, box_dim, *, + @classmethod + def from_tiled(cls, tensor, box_dim, *, element_strides=None, data_type=None, interleave=TensorMapInterleave.NONE, @@ -216,40 +265,23 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [1, 5], the pointer is not 16-byte aligned, or dimension/stride constraints are violated. 
""" - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) + cdef TensorMapDescriptor desc = cls.__new__(cls) - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") - - # Keep a strong reference to prevent GC + view = _get_validated_view(tensor) desc._source_ref = tensor - # Resolve data type tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - # Get tensor metadata cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides # in elements, can be None for C-contiguous cdef int rank = len(shape) if rank < 1 or rank > 5: raise ValueError( f"Tensor rank must be between 1 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x} (misaligned by {global_address % 16} bytes)") - if len(box_dim) != rank: raise ValueError( f"box_dim must have {rank} elements (same as tensor rank), " @@ -260,25 +292,10 @@ cdef class TensorMapDescriptor: raise ValueError( f"box_dim[{i}] must be in [1, 256], got {bd}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) - # Compute byte strides from element strides cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - # C-contiguous: strides in bytes, row-major - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - 
stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse dimensions for column-major cuTensorMap convention # Python/DLPack: row-major (dim 0 = outermost) @@ -335,8 +352,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col(tensor, pixel_box_lower_corner, pixel_box_upper_corner, + @classmethod + def from_im2col(cls, tensor, pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -390,17 +407,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. """ - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -409,18 +418,12 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - cdef int n_spatial = rank - 2 if len(pixel_box_lower_corner) != n_spatial: raise ValueError( @@ -431,23 +434,10 @@ cdef class TensorMapDescriptor: f"pixel_box_upper_corner must have {n_spatial} elements " f"(rank - 2), got {len(pixel_box_upper_corner)}") - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - 
f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -509,8 +499,8 @@ cdef class TensorMapDescriptor: return desc - @staticmethod - def from_im2col_wide(tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, + @classmethod + def from_im2col_wide(cls, tensor, pixel_box_lower_corner_width, pixel_box_upper_corner_width, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, @@ -565,17 +555,9 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. 
""" - cdef TensorMapDescriptor desc = TensorMapDescriptor.__new__(TensorMapDescriptor) - - # Obtain a StridedMemoryView from the tensor - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + cdef TensorMapDescriptor desc = cls.__new__(cls) + view = _get_validated_view(tensor) desc._source_ref = tensor tma_dt = _resolve_data_type(view, data_type) @@ -584,35 +566,16 @@ cdef class TensorMapDescriptor: cdef intptr_t global_address = view.ptr shape = view.shape - strides = view.strides cdef int rank = len(shape) if rank < 3 or rank > 5: raise ValueError( f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") - - if element_strides is not None: - if len(element_strides) != rank: - raise ValueError( - f"element_strides must have {rank} elements, got {len(element_strides)}") - else: - element_strides = (1,) * rank + element_strides = _validate_element_strides(element_strides, rank) cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - if strides is not None: - byte_strides = tuple(s * elem_size for s in strides) - else: - byte_strides = [] - stride = elem_size - for i in range(rank - 1, -1, -1): - byte_strides.append(stride) - stride *= shape[i] - byte_strides.reverse() + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) # Reverse all dimension arrays for column-major convention cdef uint64_t[5] c_global_dim @@ -685,19 +648,9 @@ cdef class TensorMapDescriptor: or a :obj:`~cuda.core.StridedMemoryView`. Must refer to device-accessible memory with a 16-byte-aligned pointer. 
""" - if isinstance(tensor, StridedMemoryView): - view = tensor - else: - view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) - - if not view.is_device_accessible: - raise ValueError("The tensor must be device-accessible") + view = _get_validated_view(tensor) cdef intptr_t global_address = view.ptr - if global_address % 16 != 0: - raise ValueError( - f"Global memory address must be 16-byte aligned, " - f"got address 0x{global_address:x}") with nogil: HANDLE_RETURN(cydriver.cuTensorMapReplaceAddress( diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index de51570ec4..e93c02dc08 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index f57e73ded8..790307857b 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import pytest From 35a04b97367b5cf39a493d2ab6c325e70d970cb8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 15:55:00 -0800 Subject: [PATCH 04/13] Add comments to prepare_tensor_map_arg explaining allocation and lifetime Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index e88def13cd..b55a4abc7a 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -135,8 +135,13 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: + # Allocate a temporary buffer for the 128-byte CUtensorMap struct. + # We copy rather than pointing directly at arg._tensor_map for lifetime + # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) + # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, + # data_addresses[idx] is the pointer passed to cuLaunchKernel. 
data_addresses[idx] = ptr data[idx] = ptr return 0 From bb19e4f5c03cb65fd0d53a1cd1a58a8159acaee8 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:04:26 -0800 Subject: [PATCH 05/13] Address Copilot review feedback - Remove unused _alloc_device_tensor helper from tests - Add test for rank > 5 (6D tensor) to verify upper bound validation - Add NULL check for PyMem_Malloc in prepare_tensor_map_arg Co-Authored-By: Claude Opus 4.6 --- cuda_core/cuda/core/_kernel_arg_handler.pyx | 2 ++ cuda_core/tests/test_tensor_map.py | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index b55a4abc7a..5619ba7492 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -139,6 +139,8 @@ cdef inline int prepare_tensor_map_arg( # We copy rather than pointing directly at arg._tensor_map for lifetime # safety: ParamHolder owns and frees its argument buffers independently. cdef void* ptr = PyMem_Malloc(sizeof(cydriver.CUtensorMap)) + if ptr is NULL: + raise MemoryError("Failed to allocate memory for CUtensorMap") memcpy(ptr, arg._get_data_ptr(), sizeof(cydriver.CUtensorMap)) # data[idx] is tracked so the allocation is freed in ParamHolder.__dealloc__, # data_addresses[idx] is the pointer passed to cuLaunchKernel. 
diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 790307857b..96dd14a419 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -28,14 +28,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") -def _alloc_device_tensor(dev, shape, dtype=np.float32, alignment=256): - """Allocate a device buffer and return it with proper alignment.""" - n_elements = 1 - for s in shape: - n_elements *= s - buf = dev.allocate(n_elements * np.dtype(dtype).itemsize + alignment) - return buf - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -179,6 +171,19 @@ def test_invalid_rank_zero(self, dev, skip_if_no_tma): data_type=TensorMapDataType.FLOAT32, ) + def test_invalid_rank_six(self, dev, skip_if_no_tma): + shape = (2, 2, 2, 2, 2, 2) + n_elements = 1 + for s in shape: + n_elements *= s + buf = dev.allocate(n_elements * 4) + arr = _DeviceArray(buf, shape) + with pytest.raises(ValueError, match="rank must be between 1 and 5"): + TensorMapDescriptor.from_tiled( + arr, + box_dim=(2,) * 6, + ) + def test_box_dim_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1024 * 4) with pytest.raises(ValueError, match="box_dim must have 1 elements"): From 23a8900834a1b62a163664d3ab31fcbdf29bba02 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 24 Feb 2026 17:20:05 -0800 Subject: [PATCH 06/13] Split TMA example into two focused files Move the replace_address() demonstration into its own self-contained example (tma_replace_address.py) so each file covers a single concept. 
Co-Authored-By: Claude Opus 4.6 --- cuda_core/examples/tma_replace_address.py | 189 ++++++++++++++++++++++ cuda_core/examples/tma_tensor_map.py | 20 --- 2 files changed, 189 insertions(+), 20 deletions(-) create mode 100644 cuda_core/examples/tma_replace_address.py diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py new file mode 100644 index 0000000000..7f11cf551c --- /dev/null +++ b/cuda_core/examples/tma_replace_address.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates how to use replace_address() to repoint a TMA +# (Tensor Memory Accelerator) descriptor at a different tensor without +# rebuilding the descriptor from scratch. +# +# The workflow is: +# +# 1. Create a TMA tiled descriptor and launch a kernel to verify it works +# 2. Allocate a second tensor with different content +# 3. Call replace_address() to repoint the same descriptor at the new tensor +# 4. Re-launch the kernel and verify it reads from the new tensor +# +# This is useful when the tensor layout (shape, dtype, tile size) stays the +# same but the underlying data buffer changes, e.g. double-buffering or +# iterating over a sequence of same-shaped tensors. 
+# +# Requirements: +# - Hopper or later GPU (compute capability >= 9.0) +# - CuPy +# - CUDA toolkit headers (CUDA_PATH or CUDA_HOME set) +# +# ################################################################################ + +import sys + +import cupy as cp +import numpy as np + +from cuda.core import ( + Device, + LaunchConfig, + Program, + ProgramOptions, + TensorMapDescriptor, + launch, +) + +# --------------------------------------------------------------------------- +# Check for Hopper+ GPU +# --------------------------------------------------------------------------- +dev = Device() +arch = dev.compute_capability +if arch < (9, 0): + print( + "TMA requires compute capability >= 9.0 (Hopper or later)", + file=sys.stderr, + ) + sys.exit(0) +dev.set_current() + +arch_str = "".join(f"{i}" for i in arch) + +# --------------------------------------------------------------------------- +# CUDA kernel that uses TMA to load a 1-D tile into shared memory, then +# copies the tile to an output buffer so we can verify correctness. +# +# The CUtensorMap struct (128 bytes) is defined inline so the kernel can be +# compiled with NVRTC without pulling in the full driver-API header. +# +# Key points: +# - The tensor map is passed by value with __grid_constant__ so the TMA +# hardware can read it from grid-constant memory. +# - Thread 0 in each block issues the TMA load and manages the mbarrier. +# - All threads wait on the mbarrier, then copy from shared to global. +# --------------------------------------------------------------------------- +TILE_SIZE = 128 # elements per tile (must match the kernel constant) + +code = r""" +// Minimal definition of the 128-byte opaque tensor map struct. 
+struct __align__(64) TensorMap { unsigned long long opaque[16]; }; + +static constexpr int TILE_SIZE = 128; + +extern "C" +__global__ void tma_copy( + const __grid_constant__ TensorMap tensor_map, + float* output, + int N) +{ + __shared__ __align__(128) float smem[TILE_SIZE]; + __shared__ __align__(8) unsigned long long mbar; + + const int tid = threadIdx.x; + const int tile_start = blockIdx.x * TILE_SIZE; + + // ---- Thread 0: set up mbarrier and issue the TMA load ---- + if (tid == 0) + { + // Initialise a single-phase mbarrier (1 arriving thread). + asm volatile( + "mbarrier.init.shared.b64 [%0], 1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Ask TMA to copy TILE_SIZE floats starting at element 'tile_start' + // from the tensor described by 'tensor_map' into shared memory. + asm volatile( + "cp.async.bulk.tensor.1d.shared::cluster.global.tile" + ".mbarrier::complete_tx::bytes" + " [%0], [%1, {%2}], [%3];" + :: "r"((unsigned)__cvta_generic_to_shared(smem)), + "l"(&tensor_map), + "r"(tile_start), + "r"((unsigned)__cvta_generic_to_shared(&mbar))); + + // Tell the mbarrier how many bytes the TMA will deliver. 
+ asm volatile( + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar)), + "r"((unsigned)(TILE_SIZE * sizeof(float)))); + } + + __syncthreads(); + + // ---- Wait for the TMA load to complete ---- + if (tid == 0) + { + asm volatile( + "{ .reg .pred P; \n" + "WAIT: \n" + " mbarrier.try_wait.parity.shared.b64 P, [%0], 0; \n" + " @!P bra WAIT; \n" + "} \n" + :: "r"((unsigned)__cvta_generic_to_shared(&mbar))); + } + + __syncthreads(); + + // ---- Copy the tile from shared memory to the output buffer ---- + if (tid < TILE_SIZE) + { + const int idx = tile_start + tid; + if (idx < N) + output[idx] = smem[tid]; + } +} +""" + +# --------------------------------------------------------------------------- +# Compile the kernel +# --------------------------------------------------------------------------- +prog = Program( + code, + code_type="c++", + options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), +) +mod = prog.compile("cubin") +ker = mod.get_kernel("tma_copy") + +# --------------------------------------------------------------------------- +# 1) Prepare input data and verify the initial TMA copy +# --------------------------------------------------------------------------- +N = 1024 +a = cp.arange(N, dtype=cp.float32) # [0, 1, 2, ..., N-1] +output = cp.zeros(N, dtype=cp.float32) +dev.sync() # cupy uses its own stream + +tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) + +n_tiles = N // TILE_SIZE +config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) +launch(dev.default_stream, config, ker, tensor_map, output.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output, a), "TMA copy produced incorrect results" +print(f"TMA copy verified: {N} elements across {n_tiles} tiles") + +# --------------------------------------------------------------------------- +# 2) Demonstrate replace_address() +# Create a second tensor with different content, point the *same* +# descriptor at it, and re-launch 
without rebuilding the descriptor. +# --------------------------------------------------------------------------- +b = cp.full(N, fill_value=42.0, dtype=cp.float32) +dev.sync() + +tensor_map.replace_address(b) + +output2 = cp.zeros(N, dtype=cp.float32) +dev.sync() + +launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) +dev.sync() + +assert cp.array_equal(output2, b), "replace_address produced incorrect results" +print("replace_address verified: descriptor reused with new source tensor") diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index e93c02dc08..63c0ac7b5c 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -13,7 +13,6 @@ # 1. Creating a TMA tiled descriptor from a CuPy device array # 2. Passing the descriptor to a kernel via launch() # 3. Using TMA to load tiles into shared memory (via inline PTX) -# 4. Updating the descriptor's source address with replace_address() # # Requirements: # - Hopper or later GPU (compute capability >= 9.0) @@ -174,22 +173,3 @@ assert cp.array_equal(output, a), "TMA copy produced incorrect results" print(f"TMA copy verified: {N} elements across {n_tiles} tiles") - -# --------------------------------------------------------------------------- -# 4) Demonstrate replace_address() -# Create a second tensor with different content, point the *same* -# descriptor at it, and re-launch without rebuilding the descriptor. 
-# --------------------------------------------------------------------------- -b = cp.full(N, fill_value=42.0, dtype=cp.float32) -dev.sync() - -tensor_map.replace_address(b) - -output2 = cp.zeros(N, dtype=cp.float32) -dev.sync() - -launch(dev.default_stream, config, ker, tensor_map, output2.data.ptr, np.int32(N)) -dev.sync() - -assert cp.array_equal(output2, b), "replace_address produced incorrect results" -print("replace_address verified: descriptor reused with new source tensor") From 0a1b7202b9bbaee509c8a6b44cc47edb9d51d03b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 14:14:38 -0800 Subject: [PATCH 07/13] pre-commit --- cuda_core/cuda/core/__init__.py | 1 + cuda_core/examples/tma_replace_address.py | 1 - cuda_core/examples/tma_tensor_map.py | 1 - cuda_core/tests/test_tensor_map.py | 37 ++++++++++++----------- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index d5f3693721..84e4f3d356 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,6 +60,7 @@ StridedMemoryView, args_viewable_as_strided_memory, ) +<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 7f11cf551c..21fb91c254 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -30,7 +30,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 63c0ac7b5c..c85c5d96bd 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -25,7 +25,6 @@ import cupy as cp import numpy as np - from cuda.core import ( Device, LaunchConfig, diff --git 
a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 96dd14a419..1d6e8195b6 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -1,14 +1,12 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import pytest - import numpy as np - +import pytest from cuda.core import ( Device, - TensorMapDescriptor, TensorMapDataType, + TensorMapDescriptor, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -28,7 +26,6 @@ def skip_if_no_tma(dev): pytest.skip("Device does not support TMA (requires compute capability 9.0+)") - class _DeviceArray: """Wrap a Buffer with explicit shape via __cuda_array_interface__. @@ -36,6 +33,7 @@ class _DeviceArray: we need the tensor to report a proper shape/dtype so the TMA encoder sees the correct rank, dimensions, and strides. """ + def __init__(self, buf, shape, dtype=np.float32): self._buf = buf # prevent GC self.__cuda_array_interface__ = { @@ -225,25 +223,30 @@ def test_invalid_data_type(self, dev, skip_if_no_tma): class TestTensorMapDtypeMapping: """Test automatic dtype inference from numpy dtypes.""" - @pytest.mark.parametrize("np_dtype,expected_tma_dt", [ - (np.uint8, TensorMapDataType.UINT8), - (np.uint16, TensorMapDataType.UINT16), - (np.uint32, TensorMapDataType.UINT32), - (np.int32, TensorMapDataType.INT32), - (np.uint64, TensorMapDataType.UINT64), - (np.int64, TensorMapDataType.INT64), - (np.float16, TensorMapDataType.FLOAT16), - (np.float32, TensorMapDataType.FLOAT32), - (np.float64, TensorMapDataType.FLOAT64), - ]) + @pytest.mark.parametrize( + "np_dtype,expected_tma_dt", + [ + (np.uint8, TensorMapDataType.UINT8), + (np.uint16, TensorMapDataType.UINT16), + (np.uint32, TensorMapDataType.UINT32), + (np.int32, TensorMapDataType.INT32), + (np.uint64, TensorMapDataType.UINT64), + (np.int64, TensorMapDataType.INT64), + (np.float16, TensorMapDataType.FLOAT16), + 
(np.float32, TensorMapDataType.FLOAT32), + (np.float64, TensorMapDataType.FLOAT64), + ], + ) def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(np_dtype)] == expected_tma_dt def test_bfloat16_mapping(self): try: - from ml_dtypes import bfloat16 from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + from ml_dtypes import bfloat16 + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 except ImportError: pytest.skip("ml_dtypes not installed") From 44fbdcf80f7c917bbd9ef790da0c7001cd2b747f Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:07:54 -0800 Subject: [PATCH 08/13] adding stride meta data to gpu allocated memory --- cuda_core/tests/test_tensor_map.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 1d6e8195b6..3e030dd29d 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -98,8 +98,9 @@ def test_from_tiled_1d(self, dev, skip_if_no_tma): def test_from_tiled_2d(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) # 64x64 float32 + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, ) @@ -107,8 +108,9 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma): def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 + tensor = _DeviceArray(buf, (16, 16, 16)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(8, 8, 8), data_type=TensorMapDataType.FLOAT32, ) @@ -128,8 +130,9 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = 
TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, swizzle=TensorMapSwizzle.SWIZZLE_128B, @@ -138,8 +141,9 @@ def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, l2_promotion=TensorMapL2Promotion.L2_128B, @@ -148,8 +152,9 @@ def test_from_tiled_with_l2_promotion(self, dev, skip_if_no_tma): def test_from_tiled_with_oob_fill(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) desc = TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(32, 32), data_type=TensorMapDataType.FLOAT32, oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA, @@ -162,9 +167,10 @@ class TestTensorMapDescriptorValidation: def test_invalid_rank_zero(self, dev, skip_if_no_tma): buf = dev.allocate(64) + tensor = _DeviceArray(buf, ()) # 0-dim tensor with pytest.raises(ValueError, match="rank must be between 1 and 5"): TensorMapDescriptor.from_tiled( - buf, + tensor, box_dim=(), data_type=TensorMapDataType.FLOAT32, ) @@ -286,8 +292,9 @@ class TestTensorMapIm2col: def test_from_im2col_3d(self, dev, skip_if_no_tma): # 3D tensor: batch=1, height=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col( - buf, + tensor, pixel_box_lower_corner=(0,), pixel_box_upper_corner=(4,), channels_per_pixel=64, @@ -310,9 +317,10 @@ def test_from_im2col_rank_validation(self, dev, skip_if_no_tma): def test_from_im2col_corner_rank_mismatch(self, dev, skip_if_no_tma): buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) # 3D: n_spatial = 1 with pytest.raises(ValueError, match="pixel_box_lower_corner must have 1 elements"): TensorMapDescriptor.from_im2col( - buf, + tensor, 
pixel_box_lower_corner=(0, 0), pixel_box_upper_corner=(4,), channels_per_pixel=64, From bdf39a24efcb528a2610967ea5f6459c0fee2472 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Wed, 25 Feb 2026 15:59:31 -0800 Subject: [PATCH 09/13] im2col fixes --- cuda_core/tests/test_tensor_map.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 3e030dd29d..8b5e27ce6f 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -373,8 +373,9 @@ def skip_if_no_im2col_wide(self, dev): def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) desc = TensorMapDescriptor.from_im2col_wide( - buf, + tensor, pixel_box_lower_corner_width=0, pixel_box_upper_corner_width=4, channels_per_pixel=64, From 96a3e84275f8d88e6476b1e71ef04bf6fedba6c8 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 3 Mar 2026 18:41:07 -0500 Subject: [PATCH 10/13] Reuse CCCL TMA descriptor construction for tiled TensorMap and keep validated views alive to avoid DLPack-backed pointer lifetime hazards. Add explicit tiled element-stride coverage and acknowledge the DLPack include-layout compatibility follow-up in NVIDIA/cccl#7871. 
Made-with: Cursor --- .gitignore | 1 + cuda_core/cuda/core/__init__.py | 1 - cuda_core/cuda/core/_cpp/tensor_map.cpp | 149 +++++++++ cuda_core/cuda/core/_cpp/tensor_map_cccl.h | 43 +++ cuda_core/cuda/core/_tensor_map.pxd | 1 + cuda_core/cuda/core/_tensor_map.pyx | 358 ++++++++++++++++----- cuda_core/examples/tma_replace_address.py | 1 + cuda_core/examples/tma_tensor_map.py | 1 + cuda_core/tests/test_tensor_map.py | 27 +- 9 files changed, 497 insertions(+), 85 deletions(-) create mode 100644 cuda_core/cuda/core/_cpp/tensor_map.cpp create mode 100644 cuda_core/cuda/core/_cpp/tensor_map_cccl.h diff --git a/.gitignore b/.gitignore index 7d9bcd38c3..18c919ca5e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ __pycache__/ !*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp +!cuda_core/cuda/core/_cpp/*.cpp cache_driver cache_runtime cache_nvrtc diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 84e4f3d356..d5f3693721 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -60,7 +60,6 @@ StridedMemoryView, args_viewable_as_strided_memory, ) -<<<<<<< HEAD from cuda.core._module import Kernel, ObjectCode from cuda.core._program import Program, ProgramOptions from cuda.core._stream import ( diff --git a/cuda_core/cuda/core/_cpp/tensor_map.cpp b/cuda_core/cuda/core/_cpp/tensor_map.cpp new file mode 100644 index 0000000000..af09aa1b2f --- /dev/null +++ b/cuda_core/cuda/core/_cpp/tensor_map.cpp @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "tensor_map_cccl.h" + +#include + +#include +#include + +#if defined(__has_include) +# if __has_include() +# include +# define CUDA_CORE_HAS_CUDA_TMA 1 +# else +# define CUDA_CORE_HAS_CUDA_TMA 0 +# endif +# if __has_include() +# include +# define CUDA_CORE_HAS_DLPACK_H 1 +# else +# define CUDA_CORE_HAS_DLPACK_H 0 +# endif +#else +# define CUDA_CORE_HAS_CUDA_TMA 0 +# define CUDA_CORE_HAS_DLPACK_H 0 +#endif + +static inline void cuda_core_write_err(char* err, size_t cap, const char* msg) noexcept +{ + if (!err || cap == 0) + return; + if (!msg) + { + err[0] = '\0'; + return; + } + size_t n = ::strlen(msg); + if (n >= cap) + n = cap - 1; + ::memcpy(err, msg, n); + err[n] = '\0'; +} + +int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, + const int64_t* strides, + uint8_t dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, + const int* elem_strides, + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) noexcept +{ +#if !(CUDA_CORE_HAS_CUDA_TMA && CUDA_CORE_HAS_DLPACK_H) + (void)out_tensor_map; + (void)data; + (void)device_type; + (void)device_id; + (void)ndim; + (void)shape; + (void)strides; + (void)dtype_code; + (void)dtype_bits; + (void)dtype_lanes; + (void)box_sizes; + (void)elem_strides; + (void)interleave_layout; + (void)swizzle; + (void)l2_fetch_size; + (void)oob_fill; + cuda_core_write_err(err, err_cap, "CCCL and/or not available at build time"); + return 1; +#else + try + { + if (!out_tensor_map) + { + cuda_core_write_err(err, err_cap, "out_tensor_map is NULL"); + return 1; + } + if (!data) + { + cuda_core_write_err(err, err_cap, "tensor data pointer is NULL"); + return 1; + } + if (!shape || !box_sizes || ndim <= 0) + { + cuda_core_write_err(err, err_cap, "invalid rank/shape/box_sizes"); + return 1; + } + + DLTensor t{}; 
+ t.data = data; + t.device = {static_cast(device_type), device_id}; + t.ndim = ndim; + t.dtype.code = dtype_code; + t.dtype.bits = dtype_bits; + t.dtype.lanes = dtype_lanes; + // CCCL promises not to mutate the arrays, but DLPack uses non-const pointers. + t.shape = const_cast(shape); + t.strides = const_cast(strides); + t.byte_offset = 0; + + const auto layout = static_cast(interleave_layout); + const auto swz = static_cast(swizzle); + const auto l2 = static_cast(l2_fetch_size); + const auto oob = static_cast(oob_fill); + + auto box = cuda::std::span(box_sizes, static_cast(ndim)); + + CUtensorMap desc{}; + if (elem_strides) + { + auto es = cuda::std::span(elem_strides, static_cast(ndim)); + desc = cuda::make_tma_descriptor(t, box, es, layout, swz, l2, oob); + } + else + { + desc = cuda::make_tma_descriptor(t, box, layout, swz, l2, oob); + } + + ::memcpy(out_tensor_map, &desc, sizeof(CUtensorMap)); + cuda_core_write_err(err, err_cap, nullptr); + return 0; + } + catch (const std::exception& e) + { + cuda_core_write_err(err, err_cap, e.what()); + return 1; + } + catch (...) + { + cuda_core_write_err(err, err_cap, "unknown error while building TMA descriptor"); + return 1; + } +#endif +} diff --git a/cuda_core/cuda/core/_cpp/tensor_map_cccl.h b/cuda_core/cuda/core/_cpp/tensor_map_cccl.h new file mode 100644 index 0000000000..71be425182 --- /dev/null +++ b/cuda_core/cuda/core/_cpp/tensor_map_cccl.h @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef CUDA_CORE_TENSOR_MAP_CCCL_H_ +#define CUDA_CORE_TENSOR_MAP_CCCL_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Build a tiled CUtensorMap using CCCL's cuda::make_tma_descriptor (from ). +// +// Returns 0 on success; on failure returns non-zero and writes a best-effort +// human-readable message into (err, err_cap) if provided. 
+int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, // length ndim + const int64_t* strides, // length ndim, or NULL for contiguous + uint8_t dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, // length ndim + const int* elem_strides, // length ndim, or NULL for all-ones overload + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) noexcept; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // CUDA_CORE_TENSOR_MAP_CCCL_H_ diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index b2b165b5aa..07e8adebc9 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map cdef object _source_ref + cdef object _view_ref cdef object _repr_info cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 9edcccb928..6054add9b6 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -2,9 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport intptr_t, uint32_t, uint64_t +from libc.stdint cimport intptr_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t +from libc.stddef cimport size_t from cuda.bindings cimport cydriver from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._dlpack cimport kDLInt, kDLUInt, kDLFloat, kDLBfloat, _kDLCUDA import enum @@ -12,6 +14,27 @@ import numpy from cuda.core._memoryview import StridedMemoryView +cdef extern from "_cpp/tensor_map_cccl.h": + int cuda_core_cccl_make_tma_descriptor_tiled( + void* out_tensor_map, + void* data, + int device_type, + int device_id, + int ndim, + const int64_t* shape, + const int64_t* strides, + uint8_t 
dtype_code, + uint8_t dtype_bits, + uint16_t dtype_lanes, + const int* box_sizes, + const int* elem_strides, + int interleave_layout, + int swizzle, + int l2_fetch_size, + int oob_fill, + char* err, + size_t err_cap) nogil + try: from ml_dtypes import bfloat16 as ml_bfloat16 @@ -80,14 +103,25 @@ class TensorMapOOBFill(enum.IntEnum): NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA -class TensorMapIm2ColWideMode(enum.IntEnum): - """Im2col wide mode for tensor map descriptors. +IF CUDA_CORE_BUILD_MAJOR >= 13: + class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. - These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. - Supported on compute capability 10.0+. - """ - W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W - W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 + These correspond to the ``CUtensorMapIm2ColWideMode`` driver enum values. + Supported on compute capability 10.0+. + """ + W = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W + W128 = cydriver.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128 +ELSE: + class TensorMapIm2ColWideMode(enum.IntEnum): + """Im2col wide mode for tensor map descriptors. + + This enum is always defined for API stability, but the + :meth:`TensorMapDescriptor.from_im2col_wide` factory requires a CUDA 13+ + build and will raise otherwise. 
+ """ + W = 0 + W128 = 1 # Mapping from numpy dtype to TMA data type @@ -150,6 +184,65 @@ def _resolve_data_type(view, data_type): return tma_dt +cdef inline bint _tma_dtype_to_dlpack( + object tma_dt, + uint8_t* out_code, + uint8_t* out_bits, + uint16_t* out_lanes, +) noexcept: + if tma_dt == TensorMapDataType.UINT8: + out_code[0] = kDLUInt + out_bits[0] = 8 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT16: + out_code[0] = kDLUInt + out_bits[0] = 16 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT32: + out_code[0] = kDLUInt + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.UINT64: + out_code[0] = kDLUInt + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.INT32: + out_code[0] = kDLInt + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.INT64: + out_code[0] = kDLInt + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT16: + out_code[0] = kDLFloat + out_bits[0] = 16 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT32: + out_code[0] = kDLFloat + out_bits[0] = 32 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.FLOAT64: + out_code[0] = kDLFloat + out_bits[0] = 64 + out_lanes[0] = 1 + return True + if tma_dt == TensorMapDataType.BFLOAT16: + out_code[0] = kDLBfloat + out_bits[0] = 16 + out_lanes[0] = 1 + return True + return False + + def _get_validated_view(tensor): """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" if isinstance(tensor, StridedMemoryView): @@ -268,7 +361,11 @@ cdef class TensorMapDescriptor: cdef TensorMapDescriptor desc = cls.__new__(cls) view = _get_validated_view(tensor) + # Keep both the original tensor object and the validated view alive. + # For DLPack exporters, the view may hold the owning capsule whose + # deleter can free the backing allocation when released. 
desc._source_ref = tensor + desc._view_ref = view tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -292,8 +389,92 @@ cdef class TensorMapDescriptor: raise ValueError( f"box_dim[{i}] must be in [1, 256], got {bd}") + cdef bint elem_strides_provided = element_strides is not None element_strides = _validate_element_strides(element_strides, rank) + # Reuse CCCL/libcu++'s DLPack -> CUtensorMap conversion when possible. + # This avoids maintaining a second, independent validation/encoding implementation. + cdef uint8_t dl_code + cdef uint8_t dl_bits + cdef uint16_t dl_lanes + cdef int64_t c_shape[5] + cdef int64_t c_strides[5] + cdef int c_box_sizes[5] + cdef int c_elem_strides[5] + cdef const int64_t* c_strides_ptr + cdef const int* c_elem_strides_ptr + cdef char errbuf[512] + cdef int i_cccl + cdef int device_type + cdef int c_device_id + cdef int c_cccl_interleave_int + cdef int c_cccl_swizzle_int + cdef int c_cccl_l2_promotion_int + cdef int c_cccl_oob_fill_int + cdef int rc + if _tma_dtype_to_dlpack(tma_dt, &dl_code, &dl_bits, &dl_lanes): + c_strides_ptr = NULL + c_elem_strides_ptr = NULL + errbuf[0] = 0 + + for i_cccl in range(rank): + c_shape[i_cccl] = shape[i_cccl] + c_box_sizes[i_cccl] = box_dim[i_cccl] + if elem_strides_provided: + c_elem_strides[i_cccl] = element_strides[i_cccl] + + if view.strides is not None: + for i_cccl in range(rank): + c_strides[i_cccl] = view.strides[i_cccl] + c_strides_ptr = &c_strides[0] + + if elem_strides_provided: + c_elem_strides_ptr = &c_elem_strides[0] + + device_type = _kDLCUDA + c_device_id = view.device_id + c_cccl_interleave_int = int(interleave) + c_cccl_swizzle_int = int(swizzle) + c_cccl_l2_promotion_int = int(l2_promotion) + c_cccl_oob_fill_int = int(oob_fill) + + with nogil: + rc = cuda_core_cccl_make_tma_descriptor_tiled( + &desc._tensor_map, + global_address, + device_type, + c_device_id, + rank, + &c_shape[0], + c_strides_ptr, + dl_code, + dl_bits, + dl_lanes, + &c_box_sizes[0], 
+ c_elem_strides_ptr, + c_cccl_interleave_int, + c_cccl_swizzle_int, + c_cccl_l2_promotion_int, + c_cccl_oob_fill_int, + &errbuf[0], + sizeof(errbuf), + ) + + if rc == 0: + desc._repr_info = { + "method": "tiled", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + return desc + + msg = errbuf[:].split(b"\0", 1)[0].decode("utf-8", errors="replace") + # If CCCL isn't available at build time, fall back to the direct + # driver API path to preserve functionality on older toolchains. + if "not available at build time" not in msg: + raise ValueError(f"Failed to build TMA descriptor via CCCL: {msg}") + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] byte_strides = _compute_byte_strides(shape, view.strides, elem_size) @@ -411,6 +592,7 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor + desc._view_ref = view tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -447,6 +629,10 @@ cdef class TensorMapDescriptor: cdef int[3] c_pixel_box_upper cdef int i_c + for i_c in range(3): + c_pixel_box_lower[i_c] = 0 + c_pixel_box_upper[i_c] = 0 + for i_c in range(rank): c_global_dim[i_c] = shape[rank - 1 - i_c] c_element_strides[i_c] = element_strides[rank - 1 - i_c] @@ -555,85 +741,90 @@ cdef class TensorMapDescriptor: If the tensor rank is outside [3, 5], the pointer is not 16-byte aligned, or other constraints are violated. 
""" - cdef TensorMapDescriptor desc = cls.__new__(cls) - - view = _get_validated_view(tensor) - desc._source_ref = tensor - - tma_dt = _resolve_data_type(view, data_type) - cdef int c_data_type_int = int(tma_dt) - cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - - cdef intptr_t global_address = view.ptr - shape = view.shape - - cdef int rank = len(shape) - if rank < 3 or rank > 5: - raise ValueError( - f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") - - element_strides = _validate_element_strides(element_strides, rank) - - cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] - byte_strides = _compute_byte_strides(shape, view.strides, elem_size) - - # Reverse all dimension arrays for column-major convention - cdef uint64_t[5] c_global_dim - cdef uint64_t[4] c_global_strides - cdef uint32_t[5] c_element_strides - cdef int i_c - - for i_c in range(rank): - c_global_dim[i_c] = shape[rank - 1 - i_c] - c_element_strides[i_c] = element_strides[rank - 1 - i_c] + IF CUDA_CORE_BUILD_MAJOR < 13: + raise RuntimeError( + "TensorMapDescriptor.from_im2col_wide requires a CUDA 13+ build") + ELSE: + cdef TensorMapDescriptor desc = cls.__new__(cls) - for i_c in range(rank - 1): - c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + view = _get_validated_view(tensor) + desc._source_ref = tensor + desc._view_ref = view - cdef uint32_t c_rank = rank - cdef int c_lower_w = pixel_box_lower_corner_width - cdef int c_upper_w = pixel_box_upper_corner_width - cdef uint32_t c_channels = channels_per_pixel - cdef uint32_t c_pixels = pixels_per_column - cdef int c_interleave_int = int(interleave) - cdef int c_mode_int = int(mode) - cdef int c_swizzle_int = int(swizzle) - cdef int c_l2_promotion_int = int(l2_promotion) - cdef int c_oob_fill_int = int(oob_fill) - cdef cydriver.CUtensorMapInterleave c_interleave = c_interleave_int - cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int - cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int - cdef 
cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int - cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + tma_dt = _resolve_data_type(view, data_type) + cdef int c_data_type_int = int(tma_dt) + cdef cydriver.CUtensorMapDataType c_data_type = c_data_type_int - with nogil: - HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( - &desc._tensor_map, - c_data_type, - c_rank, - global_address, - c_global_dim, - c_global_strides, - c_lower_w, - c_upper_w, - c_channels, - c_pixels, - c_element_strides, - c_interleave, - c_mode, - c_swizzle, - c_l2_promotion, - c_oob_fill, - )) + cdef intptr_t global_address = view.ptr + shape = view.shape - desc._repr_info = { - "method": "im2col_wide", - "rank": rank, - "data_type": tma_dt, - "swizzle": swizzle, - } - - return desc + cdef int rank = len(shape) + if rank < 3 or rank > 5: + raise ValueError( + f"Im2col-wide tensor rank must be between 3 and 5, got {rank}") + + element_strides = _validate_element_strides(element_strides, rank) + + cdef int elem_size = _TMA_DATA_TYPE_SIZE[tma_dt] + byte_strides = _compute_byte_strides(shape, view.strides, elem_size) + + # Reverse all dimension arrays for column-major convention + cdef uint64_t[5] c_global_dim + cdef uint64_t[4] c_global_strides + cdef uint32_t[5] c_element_strides + cdef int i_c + + for i_c in range(rank): + c_global_dim[i_c] = shape[rank - 1 - i_c] + c_element_strides[i_c] = element_strides[rank - 1 - i_c] + + for i_c in range(rank - 1): + c_global_strides[i_c] = byte_strides[rank - 2 - i_c] + + cdef uint32_t c_rank = rank + cdef int c_lower_w = pixel_box_lower_corner_width + cdef int c_upper_w = pixel_box_upper_corner_width + cdef uint32_t c_channels = channels_per_pixel + cdef uint32_t c_pixels = pixels_per_column + cdef int c_interleave_int = int(interleave) + cdef int c_mode_int = int(mode) + cdef int c_swizzle_int = int(swizzle) + cdef int c_l2_promotion_int = int(l2_promotion) + cdef int c_oob_fill_int = int(oob_fill) + cdef 
cydriver.CUtensorMapInterleave c_interleave = c_interleave_int + cdef cydriver.CUtensorMapIm2ColWideMode c_mode = c_mode_int + cdef cydriver.CUtensorMapSwizzle c_swizzle = c_swizzle_int + cdef cydriver.CUtensorMapL2promotion c_l2_promotion = c_l2_promotion_int + cdef cydriver.CUtensorMapFloatOOBfill c_oob_fill = c_oob_fill_int + + with nogil: + HANDLE_RETURN(cydriver.cuTensorMapEncodeIm2colWide( + &desc._tensor_map, + c_data_type, + c_rank, + global_address, + c_global_dim, + c_global_strides, + c_lower_w, + c_upper_w, + c_channels, + c_pixels, + c_element_strides, + c_interleave, + c_mode, + c_swizzle, + c_l2_promotion, + c_oob_fill, + )) + + desc._repr_info = { + "method": "im2col_wide", + "rank": rank, + "data_type": tma_dt, + "swizzle": swizzle, + } + + return desc def replace_address(self, tensor): """Replace the global memory address in this tensor map descriptor. @@ -662,6 +853,7 @@ cdef class TensorMapDescriptor: # so we don't drop the old tensor (risking a dangling pointer in the # CUtensorMap struct) if the call fails. 
self._source_ref = tensor + self._view_ref = view def __repr__(self): info = self._repr_info diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 21fb91c254..7f11cf551c 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -30,6 +30,7 @@ import cupy as cp import numpy as np + from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index c85c5d96bd..63c0ac7b5c 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -25,6 +25,7 @@ import cupy as cp import numpy as np + from cuda.core import ( Device, LaunchConfig, diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 8b5e27ce6f..4db456d863 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -3,6 +3,7 @@ import numpy as np import pytest + from cuda.core import ( Device, TensorMapDataType, @@ -128,6 +129,29 @@ def test_from_tiled_5d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_from_tiled_with_element_strides_buffer(self, dev, skip_if_no_tma): + # Use a Buffer input (DLPack path) and explicit element_strides. + buf = dev.allocate(1024 * 4) + desc = TensorMapDescriptor.from_tiled( + buf, + box_dim=(64,), + element_strides=(2,), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + + def test_from_tiled_with_element_strides_cai(self, dev, skip_if_no_tma): + # Use a CAI-style tensor wrapper and explicit element_strides. 
+ buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + desc = TensorMapDescriptor.from_tiled( + tensor, + box_dim=(32, 32), + element_strides=(2, 1), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + def test_from_tiled_with_swizzle(self, dev, skip_if_no_tma): buf = dev.allocate(64 * 64 * 4) tensor = _DeviceArray(buf, (64, 64)) @@ -250,9 +274,10 @@ def test_dtype_mapping(self, np_dtype, expected_tma_dt, dev, skip_if_no_tma): def test_bfloat16_mapping(self): try: - from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA from ml_dtypes import bfloat16 + from cuda.core._tensor_map import _NUMPY_DTYPE_TO_TMA + assert _NUMPY_DTYPE_TO_TMA[np.dtype(bfloat16)] == TensorMapDataType.BFLOAT16 except ImportError: pytest.skip("ml_dtypes not installed") From 1a6b41608658499486fa5705eaea93551425442b Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 08:00:36 -0500 Subject: [PATCH 11/13] Skip im2col-wide TensorMap tests when runtime support is unavailable. Probe support in the fixture and skip when cuda.core is built without CUDA 13 im2col-wide support or when the driver/GPU reports CUDA_ERROR_INVALID_VALUE, so unsupported RTXPRO6000 lanes don't block unrelated changes. 
Made-with: Cursor --- cuda_core/tests/test_tensor_map.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 4db456d863..2d6dc138c0 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -395,6 +395,31 @@ def skip_if_no_im2col_wide(self, dev): if cc.major < 10: pytest.skip("Device does not support im2col-wide (requires compute capability 10.0+)") + # Some environments in CI exercise this test module with a cuda.core + # build that does not include im2col-wide symbols (CUDA < 13 build), + # or with driver/GPU combinations that reject im2col-wide descriptor + # encoding for otherwise valid inputs. Probe once per test invocation + # and skip only for those known unsupported cases. + buf = dev.allocate(1 * 32 * 64 * 4) + tensor = _DeviceArray(buf, (1, 32, 64)) + try: + TensorMapDescriptor.from_im2col_wide( + tensor, + pixel_box_lower_corner_width=0, + pixel_box_upper_corner_width=4, + channels_per_pixel=64, + pixels_per_column=4, + data_type=TensorMapDataType.FLOAT32, + ) + except RuntimeError as e: + if "requires a CUDA 13+ build" in str(e): + pytest.skip("Im2col-wide requires cuda.core built with CUDA 13+") + raise + except Exception as e: + if "CUDA_ERROR_INVALID_VALUE" in str(e): + pytest.skip("Im2col-wide unsupported on this driver/GPU combination") + raise + def test_from_im2col_wide_3d(self, dev, skip_if_no_im2col_wide): # 3D tensor: batch=1, width=32, channels=64 buf = dev.allocate(1 * 32 * 64 * 4) From 892ee605e500b2e9225f7077bd66e61e1fc4eb97 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 09:41:46 -0500 Subject: [PATCH 12/13] Align TensorMap API surface with review feedback and enforce context safety. 
Expose only TensorMapDescriptor in cuda.core, add StridedMemoryView.as_tensor_map(), remove redundant tensor-map fallback packing, and track/check descriptor context/device compatibility before replacement and kernel launch argument packing. Made-with: Cursor --- .gitignore | 1 - cuda_core/cuda/core/__init__.py | 10 +---- cuda_core/cuda/core/_kernel_arg_handler.pyx | 4 +- cuda_core/cuda/core/_memoryview.pyx | 33 ++++++++++++++ cuda_core/cuda/core/_tensor_map.pxd | 4 ++ cuda_core/cuda/core/_tensor_map.pyx | 48 +++++++++++++++++++++ cuda_core/examples/tma_replace_address.py | 4 +- cuda_core/examples/tma_tensor_map.py | 11 ++--- cuda_core/tests/test_tensor_map.py | 15 ++++++- 9 files changed, 107 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 18c919ca5e..7d9bcd38c3 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,6 @@ __pycache__/ !*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp -!cuda_core/cuda/core/_cpp/*.cpp cache_driver cache_runtime cache_nvrtc diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index d5f3693721..c32ff79ecc 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -68,12 +68,4 @@ Stream, StreamOptions, ) -from cuda.core._tensor_map import ( - TensorMapDataType, - TensorMapDescriptor, - TensorMapIm2ColWideMode, - TensorMapInterleave, - TensorMapL2Promotion, - TensorMapOOBFill, - TensorMapSwizzle, -) +from cuda.core._tensor_map import TensorMapDescriptor diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 5619ba7492..28a981fed2 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -135,6 +135,7 @@ cdef inline int prepare_tensor_map_arg( vector.vector[void*]& data_addresses, TensorMapDescriptor arg, const size_t idx) except -1: + arg._check_context_compat() # Allocate a temporary buffer for 
the 128-byte CUtensorMap struct. # We copy rather than pointing directly at arg._tensor_map for lifetime # safety: ParamHolder owns and frees its argument buffers independently. @@ -350,9 +351,6 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue - elif isinstance(arg, tensor_map_descriptor_type): - prepare_tensor_map_arg(self.data, self.data_addresses, arg, i) - continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 0e1df726c0..78002b3ff1 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -316,6 +316,39 @@ cdef class StridedMemoryView: view_buffer_strided(view, self.get_buffer(), layout, dtype, self.readonly) return view + def as_tensor_map( + self, + box_dim, + *, + element_strides=None, + data_type=None, + interleave=None, + swizzle=None, + l2_promotion=None, + oob_fill=None, + ): + """Create a tiled :obj:`TensorMapDescriptor` from this view. + + This is a convenience wrapper around + :meth:`cuda.core._tensor_map.TensorMapDescriptor.from_tiled`. 
+ """ + from cuda.core._tensor_map import TensorMapDescriptor + + kwargs = {} + if element_strides is not None: + kwargs["element_strides"] = element_strides + if data_type is not None: + kwargs["data_type"] = data_type + if interleave is not None: + kwargs["interleave"] = interleave + if swizzle is not None: + kwargs["swizzle"] = swizzle + if l2_promotion is not None: + kwargs["l2_promotion"] = l2_promotion + if oob_fill is not None: + kwargs["oob_fill"] = oob_fill + return TensorMapDescriptor.from_tiled(self, box_dim, **kwargs) + def copy_from( self, other : StridedMemoryView, stream : Stream, allocator = None, diff --git a/cuda_core/cuda/core/_tensor_map.pxd b/cuda_core/cuda/core/_tensor_map.pxd index 07e8adebc9..4c60b7fc70 100644 --- a/cuda_core/cuda/core/_tensor_map.pxd +++ b/cuda_core/cuda/core/_tensor_map.pxd @@ -3,12 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from libc.stdint cimport intptr_t cdef class TensorMapDescriptor: cdef cydriver.CUtensorMap _tensor_map + cdef int _device_id + cdef intptr_t _context cdef object _source_ref cdef object _view_ref cdef object _repr_info + cdef int _check_context_compat(self) except -1 cdef void* _get_data_ptr(self) diff --git a/cuda_core/cuda/core/_tensor_map.pyx b/cuda_core/cuda/core/_tensor_map.pyx index 6054add9b6..9d83a709cc 100644 --- a/cuda_core/cuda/core/_tensor_map.pyx +++ b/cuda_core/cuda/core/_tensor_map.pyx @@ -263,6 +263,22 @@ def _get_validated_view(tensor): return view +cdef inline intptr_t _get_current_context_ptr() except? 
0:
+    cdef cydriver.CUcontext ctx
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    if ctx == NULL:
+        raise RuntimeError("TensorMapDescriptor requires an active CUDA context")
+    return <intptr_t>ctx
+
+
+cdef inline int _get_current_device_id() except -1:
+    cdef cydriver.CUdevice dev
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+    return dev
+
+
 def _compute_byte_strides(shape, strides, elem_size):
     """Compute byte strides from element strides or C-contiguous fallback.
 
@@ -313,6 +329,28 @@ cdef class TensorMapDescriptor:
     cdef void* _get_data_ptr(self):
         return &self._tensor_map
 
+    cdef int _check_context_compat(self) except -1:
+        cdef cydriver.CUcontext current_ctx
+        cdef cydriver.CUdevice current_dev
+        cdef int current_dev_id
+        if self._context == 0 and self._device_id < 0:
+            return 0
+        with nogil:
+            HANDLE_RETURN(cydriver.cuCtxGetCurrent(&current_ctx))
+        if current_ctx == NULL:
+            raise RuntimeError("TensorMapDescriptor requires an active CUDA context")
+        if self._context != 0 and current_ctx != self._context:
+            raise RuntimeError(
+                "TensorMapDescriptor was created in a different CUDA context")
+        with nogil:
+            HANDLE_RETURN(cydriver.cuCtxGetDevice(&current_dev))
+        current_dev_id = current_dev
+        if self._device_id >= 0 and current_dev_id != self._device_id:
+            raise RuntimeError(
+                f"TensorMapDescriptor belongs to device {self._device_id}, "
+                f"but current device is {current_dev_id}")
+        return 0
+
     @classmethod
     def from_tiled(cls, tensor, box_dim, *,
                    element_strides=None,
@@ -366,6 +404,8 @@ cdef class TensorMapDescriptor:
         # deleter can free the backing allocation when released.
desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -593,6 +633,8 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -750,6 +792,8 @@ cdef class TensorMapDescriptor: view = _get_validated_view(tensor) desc._source_ref = tensor desc._view_ref = view + desc._context = _get_current_context_ptr() + desc._device_id = _get_current_device_id() tma_dt = _resolve_data_type(view, data_type) cdef int c_data_type_int = int(tma_dt) @@ -839,7 +883,11 @@ cdef class TensorMapDescriptor: or a :obj:`~cuda.core.StridedMemoryView`. Must refer to device-accessible memory with a 16-byte-aligned pointer. 
""" + self._check_context_compat() view = _get_validated_view(tensor) + if view.device_id != self._device_id: + raise ValueError( + f"replace_address expects tensor on device {self._device_id}, got {view.device_id}") cdef intptr_t global_address = view.ptr diff --git a/cuda_core/examples/tma_replace_address.py b/cuda_core/examples/tma_replace_address.py index 7f11cf551c..a734301fd6 100644 --- a/cuda_core/examples/tma_replace_address.py +++ b/cuda_core/examples/tma_replace_address.py @@ -36,7 +36,7 @@ LaunchConfig, Program, ProgramOptions, - TensorMapDescriptor, + StridedMemoryView, launch, ) @@ -159,7 +159,7 @@ output = cp.zeros(N, dtype=cp.float32) dev.sync() # cupy uses its own stream -tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) +tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) n_tiles = N // TILE_SIZE config = LaunchConfig(grid=n_tiles, block=TILE_SIZE) diff --git a/cuda_core/examples/tma_tensor_map.py b/cuda_core/examples/tma_tensor_map.py index 63c0ac7b5c..2a5ce9ad86 100644 --- a/cuda_core/examples/tma_tensor_map.py +++ b/cuda_core/examples/tma_tensor_map.py @@ -31,7 +31,7 @@ LaunchConfig, Program, ProgramOptions, - TensorMapDescriptor, + StridedMemoryView, launch, ) @@ -48,8 +48,6 @@ sys.exit(0) dev.set_current() -arch_str = "".join(f"{i}" for i in arch) - # --------------------------------------------------------------------------- # CUDA kernel that uses TMA to load a 1-D tile into shared memory, then # copies the tile to an output buffer so we can verify correctness. 
@@ -141,7 +139,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(std="c++17", arch=f"sm_{arch_str}"), + options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"), ) mod = prog.compile("cubin") ker = mod.get_kernel("tma_copy") @@ -155,11 +153,10 @@ dev.sync() # cupy uses its own stream # --------------------------------------------------------------------------- -# 2) Create a TMA tiled descriptor -# from_tiled() accepts any DLPack / __cuda_array_interface__ object. +# 2) Create a TMA tiled descriptor from a StridedMemoryView. # The dtype (float32) is inferred automatically from the CuPy array. # --------------------------------------------------------------------------- -tensor_map = TensorMapDescriptor.from_tiled(a, box_dim=(TILE_SIZE,)) +tensor_map = StridedMemoryView.from_any_interface(a, stream_ptr=-1).as_tensor_map(box_dim=(TILE_SIZE,)) # --------------------------------------------------------------------------- # 3) Launch the kernel diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py index 2d6dc138c0..dee1f1e2e1 100644 --- a/cuda_core/tests/test_tensor_map.py +++ b/cuda_core/tests/test_tensor_map.py @@ -6,8 +6,11 @@ from cuda.core import ( Device, - TensorMapDataType, + StridedMemoryView, TensorMapDescriptor, +) +from cuda.core._tensor_map import ( + TensorMapDataType, TensorMapIm2ColWideMode, TensorMapInterleave, TensorMapL2Promotion, @@ -107,6 +110,16 @@ def test_from_tiled_2d(self, dev, skip_if_no_tma): ) assert desc is not None + def test_strided_memory_view_as_tensor_map(self, dev, skip_if_no_tma): + buf = dev.allocate(64 * 64 * 4) + tensor = _DeviceArray(buf, (64, 64)) + view = StridedMemoryView.from_any_interface(tensor, stream_ptr=-1) + desc = view.as_tensor_map( + box_dim=(32, 32), + data_type=TensorMapDataType.FLOAT32, + ) + assert desc is not None + def test_from_tiled_3d(self, dev, skip_if_no_tma): buf = dev.allocate(16 * 16 * 16 * 4) # 16x16x16 float32 tensor = _DeviceArray(buf, (16, 16, 16)) 
From 5a0e141ebf5b4d7a65818c20d4adc4648d37571f Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 Mar 2026 09:54:32 -0500 Subject: [PATCH 13/13] Restore cu12 feature definitions in cuda_core pixi manifest. Bring back the cu12 feature blocks so pixi can parse the manifest and local test commands no longer fail early with a missing feature error. Made-with: Cursor --- cuda_core/pixi.toml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index e351fb63b6..9dc6ac1ed9 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -75,9 +75,16 @@ cuda = "13" [feature.cu13.dependencies] cuda-version = "13.1.*" -# NOTE: cu12 environment is intentionally omitted because the path dependency -# to ../cuda_bindings (v13.1) makes it unsolvable locally. For cu12 testing, -# use conda-forge packages or CI workflows. +[feature.cu12.system-requirements] +cuda = "12" + +[feature.cu12.dependencies] +cuda-version = "12.*" + +# We keep both cu12 and cu13 because cuda.core works with either major version +# NOTE: Path dependency to ../cuda_bindings only works for cu13 (local bindings is v13.1) +# For cu12 testing, use conda-forge packages: temporarily change path to wildcard +# or skip cu12 locally: pixi run -e cu13 test [environments] default = { features = [ "cu13",