NVIDIA · leofang · Feb 18, 2026 · cpcloud · Mar 4, 2026 · seberg
diff --git a/cuda_core/cuda/core/_utils/dtype_utils.pyx b/cuda_core/cuda/core/_utils/dtype_utils.pyx
@@ -0,0 +1,183 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy
+
+
+cdef Py_ssize_t _get_cuda_scalar_alignment(object descr):
+    """Return the CUDA alignment for a non-structured NumPy dtype.
+
+    The alignment is the element size rounded down to a power of two and
+    capped at 16 bytes.  For a sub-array dtype such as ``("f4", (3,))``
+    the element is ``descr.base``, because a C array is aligned like its
+    element type rather than like its total size.
+    """
+    # ``base`` is the dtype itself unless ``descr`` is a sub-array dtype.
+    cdef Py_ssize_t size = descr.base.itemsize
+    cdef Py_ssize_t alignment = 1
+    # Stop at 16: the helper never reports more than 16-byte alignment
+    # (complex128 is the 16-byte case).
+    while alignment < 16 and alignment * 2 <= size:
+        alignment *= 2
+    return alignment
+
+
+def make_aligned_dtype(dtype, *, int alignment=-1, bint recurse=False):
+    """Create a new structured dtype with sufficient alignment for GPU use.
+
+    Many CUDA kernels require structure members to be naturally aligned
+    (each field's offset is a multiple of its own alignment) and the
+    overall structure size to be a multiple of the largest member
+    alignment.  :func:`make_aligned_dtype` recomputes field offsets and
+    the total ``itemsize`` so that these constraints are satisfied.
+
+    Parameters
+    ----------
+    dtype : numpy.dtype or dtype-like
+        A NumPy dtype or anything accepted by :class:`numpy.dtype`.
+        Typically a structured dtype such as
+        ``numpy.dtype([("x", "f4"), ("y", "f4"), ("z", "f4")])``.
+    alignment : int, optional
+        Desired minimum alignment (in bytes) of the resulting dtype.
+        Must be a positive power of two.  When ``-1`` (the default) the
+        alignment is inferred from the fields.
+
+        If the requested alignment is *smaller* than the minimum
+        alignment inferred from the fields, a :exc:`ValueError` is
+        raised.  Otherwise it is recorded in the dtype's ``metadata``
+        under the key ``"__cuda_alignment__"`` and the ``itemsize`` is
+        padded accordingly.
+    recurse : bool, optional
+        When ``True``, nested structured dtypes (including the element
+        type of sub-array fields) are recursively re-aligned as if
+        ``alignment=-1`` were passed for each sub-structure.  When
+        ``False`` (the default), nested dtypes are taken as-is.
+
+    Returns
+    -------
+    numpy.dtype
+        A new dtype whose field offsets and ``itemsize`` satisfy the
+        constraints above.  ``metadata["__cuda_alignment__"]`` is set
+        when *alignment* is given, and also when the inferred alignment
+        exceeds the ``alignment`` NumPy reports for the result, so that
+        the requirement survives nesting in another dtype.
+
+    Raises
+    ------
+    ValueError
+        If *alignment* is not a positive power of two or is smaller than
+        the inferred minimum; if the input fields overlap or are out of
+        order; if the input ``itemsize`` exceeds the recomputed one; or
+        if a non-structured dtype would need padding.
+
+    Notes
+    -----
+    * By default this function does **not** recurse into nested
+      structures.  You can nest a dtype with a specific alignment by
+      creating it with :func:`make_aligned_dtype` first and then
+      embedding it in an outer dtype.
+    * NumPy promotion (e.g. in :func:`numpy.concatenate`) may
+      "canonicalize" the dtype and drop the struct layout and alignment
+      metadata.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from cuda.core.utils import make_aligned_dtype
+    >>> dt = np.dtype([("x", "f4"), ("y", "i1")])
+    >>> dt.itemsize
+    5
+    >>> aligned = make_aligned_dtype(dt)
+    >>> aligned.itemsize
+    8
+
+    Requesting a specific alignment:
+
+    >>> aligned16 = make_aligned_dtype(dt, alignment=16)
+    >>> aligned16.itemsize
+    16
+    >>> dict(aligned16.metadata)
+    {'__cuda_alignment__': 16}
+    """
+    cdef Py_ssize_t final_alignment = 1
+    cdef Py_ssize_t itemsize
+    cdef Py_ssize_t curr_offset = 0
+    cdef Py_ssize_t min_offset = 0
+    cdef Py_ssize_t subalignment
+    descr = numpy.dtype(dtype, align=True)
+
+    if alignment != -1 and (alignment <= 0 or (alignment & (alignment - 1)) != 0):
+        raise ValueError("Alignment must be a positive power of 2.")
+
+    names, offsets, subdtypes = [], [], []
+    if descr.names is None:
+        # Non-structured dtype: nothing to lay out, keep NumPy's alignment.
+        final_alignment = descr.alignment
+        curr_offset = descr.itemsize
+    else:
+        for name in descr.names:
+            # Field entries are ``(dtype, offset[, title])``.
+            subdtype, offset = descr.fields[name][:2]
+            if offset < min_offset:
+                raise ValueError(
+                    "make_aligned_dtype() only supports well-behaved "
+                    "in-order fields (it ignores field offsets).")
+            min_offset = offset + subdtype.itemsize
+            # A sub-array field is aligned like its element type, so
+            # inspect ``base`` (the dtype itself for non-sub-arrays).
+            base = subdtype.base
+            if base.names is None:
+                subalignment = _get_cuda_scalar_alignment(base)
+            else:
+                if recurse:
+                    base = make_aligned_dtype(base, recurse=True)
+                    if subdtype.subdtype is None:
+                        subdtype = base
+                    else:
+                        subdtype = numpy.dtype((base, subdtype.shape))
+                # An explicit CUDA alignment overrides NumPy's own.
+                subalignment = (base.metadata or {}).get(
+                    "__cuda_alignment__", base.alignment)
+
+            # Pad up to the next offset that satisfies this field.
+            if curr_offset % subalignment != 0:
+                curr_offset += subalignment - (curr_offset % subalignment)
+            final_alignment = max(final_alignment, subalignment)
+
+            names.append(name)
+            subdtypes.append(subdtype)
+            offsets.append(curr_offset)
+            curr_offset += subdtype.itemsize
+
+    if alignment != -1:
+        if alignment < final_alignment:
+            raise ValueError(
+                f"make_aligned_dtype(): given alignment={alignment} "
+                f"is smaller than minimum alignment {final_alignment}"
+            )
+        final_alignment = alignment
+
+    # Round the size up to a multiple of the final alignment.
+    itemsize = (
+        (curr_offset + final_alignment - 1) // final_alignment
+        * final_alignment)
+
+    if descr.names is None:
+        if descr.itemsize != itemsize:
+            raise ValueError(
+                "Alignment larger than itemsize for non-structured dtype.")
+        result = descr
+    else:
+        if descr.itemsize > itemsize:
+            raise ValueError(
+                "Input descriptor had larger itemsize than inferred.")
+        result = numpy.dtype(
+            dict(names=names, offsets=offsets, formats=subdtypes, itemsize=itemsize),
+            align=True)
+
+    # NumPy's ``alignment`` may be smaller than the CUDA requirement; record
+    # it so that an enclosing make_aligned_dtype() call can honor it.
+    if alignment != -1 or final_alignment > result.alignment:
+        result = numpy.dtype(result, metadata={"__cuda_alignment__": final_alignment})
+    return result
diff --git a/cuda_core/cuda/core/utils.py b/cuda_core/cuda/core/utils.py
@@ -1,8 +1,11 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
 from cuda.core._memoryview import (
     StridedMemoryView,  # noqa: F401
     args_viewable_as_strided_memory,  # noqa: F401
 )
+from cuda.core._utils.dtype_utils import (
+    make_aligned_dtype,  # noqa: F401
+)
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
@@ -142,6 +142,7 @@ Utility functions
    :toctree: generated/
 
    args_viewable_as_strided_memory
+   make_aligned_dtype
 
    :template: autosummary/cyclass.rst
 

diff --git a/cuda_core/docs/source/release/0.6.0-notes.rst b/cuda_core/docs/source/release/0.6.0-notes.rst
@@ -27,6 +27,13 @@ New features
 
   This replaces the previous undocumented workaround of using ``Stream.from_handle(0)`` to access the legacy default stream.
 
+- Added :func:`~cuda.core.utils.make_aligned_dtype` utility for creating
+  structured NumPy dtypes with GPU-compatible alignment.  Field offsets and
+  the total ``itemsize`` are recomputed so that each field is naturally aligned
+  and the structure size is a multiple of the largest member alignment.  An
+  explicit ``alignment`` can be requested and is stored in the dtype's metadata
+  under the key ``"__cuda_alignment__"``.  (Resolves :issue:`734`.)
+
 Fixes and enhancements
 -----------------------
 

diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
@@ -17,7 +17,7 @@
 import pytest
 from cuda.core import Device
 from cuda.core._layout import _StridedLayout
-from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory
+from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory, make_aligned_dtype
 from pytest import param
 
 
@@ -524,3 +524,126 @@ def test_from_array_interface_unsupported_strides(init_cuda):
     with pytest.raises(ValueError, match="strides must be divisible by itemsize"):
         # TODO: ideally this would raise on construction
         smv.strides  # noqa: B018
+
+
+# --- Tests for make_aligned_dtype ---
+
+
+class TestMakeAlignedDtype:
+    def test_simple_struct_padding(self):
+        # int8 followed by float32: needs padding after int8
+        dt = np.dtype([("a", "i1"), ("b", "f4")])
+        result = make_aligned_dtype(dt)
+        assert result.fields["b"][1] == 4  # padded to 4-byte alignment
+        assert result.itemsize == 8  # 4 (pad+a) + 4 (b)
+
+    def test_already_aligned(self):
+        dt = np.dtype([("x", "f4"), ("y", "f4")])
+        result = make_aligned_dtype(dt)
+        assert result.fields["x"][1] == 0
+        assert result.fields["y"][1] == 4
+        assert result.itemsize == 8
+
+    def test_mixed_types(self):
+        dt = np.dtype([("a", "i1"), ("b", "f8"), ("c", "i4")])
+        result = make_aligned_dtype(dt)
+        assert result.fields["a"][1] == 0
+        assert result.fields["b"][1] == 8  # aligned to 8
+        assert result.fields["c"][1] == 16  # aligned to 4
+        assert result.itemsize % 8 == 0  # padded to max alignment (8)
+
+    def test_explicit_alignment(self):
+        dt = np.dtype([("x", "f4"), ("y", "i1")])
+        result = make_aligned_dtype(dt, alignment=16)
+        assert result.itemsize == 16
+        assert result.metadata == {"__cuda_alignment__": 16}
+
+    def test_alignment_too_small_raises(self):
+        dt = np.dtype([("x", "f8")])  # min alignment is 8
+        with pytest.raises(ValueError, match="smaller than minimum alignment"):
+            make_aligned_dtype(dt, alignment=2)
+
+    def test_alignment_not_power_of_two_raises(self):
+        dt = np.dtype([("x", "f4")])
+        with pytest.raises(ValueError, match="power of 2"):
+            make_aligned_dtype(dt, alignment=3)
+
+    def test_alignment_zero_raises(self):
+        dt = np.dtype([("x", "f4")])
+        with pytest.raises(ValueError, match="power of 2"):
+            make_aligned_dtype(dt, alignment=0)
+
+    def test_alignment_negative_raises(self):
+        dt = np.dtype([("x", "f4")])
+        with pytest.raises(ValueError, match="power of 2"):
+            make_aligned_dtype(dt, alignment=-2)
+
+    def test_recurse_nested(self):
+        inner = np.dtype([("a", "i1"), ("b", "f4")])
+        outer = np.dtype([("inner", inner), ("c", "f8")])
+        result = make_aligned_dtype(outer, recurse=True)
+        inner_result = result.fields["inner"][0]
+        assert inner_result.fields["b"][1] == 4
+        assert result.itemsize % 8 == 0
+
+    def test_no_recurse_nested(self):
+        inner = np.dtype([("a", "i1"), ("b", "f4")])
+        outer = np.dtype([("inner", inner), ("c", "f8")])
+        result_no_recurse = make_aligned_dtype(outer, recurse=False)
+        result_recurse = make_aligned_dtype(outer, recurse=True)
+        # Without recursion, inner is left as-is
+        inner_no_recurse = result_no_recurse.fields["inner"][0]
+        assert inner_no_recurse.fields["a"][1] == 0
+
+        # The outer dtype should still be aligned properly
+        assert result_no_recurse.itemsize % 8 == 0
+        assert result_recurse.itemsize % 8 == 0
+
+    def test_single_field(self):
+        dt = np.dtype([("x", "f8")])
+        result = make_aligned_dtype(dt)
+        assert result.fields["x"][1] == 0
+        assert result.itemsize == 8
+
+    def test_many_small_fields(self):
+        dt = np.dtype([("a", "i1"), ("b", "i1"), ("c", "i1"), ("d", "f4")])
+        result = make_aligned_dtype(dt)
+        assert result.fields["d"][1] == 4  # aligned to 4
+        assert result.itemsize % 4 == 0
+
+    def test_metadata_preserved_with_alignment(self):
+        dt = np.dtype([("x", "f4")])
+        result = make_aligned_dtype(dt, alignment=8)
+        assert result.metadata["__cuda_alignment__"] == 8
+
+    def test_default_alignment_no_metadata(self):
+        dt = np.dtype([("x", "f4"), ("y", "f4")])
+        result = make_aligned_dtype(dt)
+        assert result.metadata is None
+
+    def test_non_struct_dtype_passthrough(self):
+        # Non-structured dtypes should pass through if alignment matches
+        dt = np.dtype("f4")
+        result = make_aligned_dtype(dt)
+        assert result.itemsize == 4
+
+    def test_non_struct_alignment_too_large(self):
+        dt = np.dtype("f4")
+        with pytest.raises(ValueError, match="Alignment larger than itemsize"):
+            make_aligned_dtype(dt, alignment=8)
+
+    def test_three_floats(self):
+        # Common GPU struct: float3
+        dt = np.dtype([("x", "f4"), ("y", "f4"), ("z", "f4")])
+        result = make_aligned_dtype(dt)
+        assert result.fields["x"][1] == 0
+        assert result.fields["y"][1] == 4
+        assert result.fields["z"][1] == 8
+        assert result.itemsize == 12
+
+    def test_int_then_double(self):
+        dt = np.dtype([("i", "i4"), ("d", "f8")])
+        result = make_aligned_dtype(dt)
+        assert result.fields["i"][1] == 0
+        assert result.fields["d"][1] == 8
+        assert result.itemsize == 16