From 478101cde69535a961c285a200442e2002f6e2dc Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 3 Mar 2026 15:09:08 -0800 Subject: [PATCH 01/23] Convert _graph.py to _graph/ package for explicit graph construction work Rename cuda/core/_graph.py to cuda/core/_graph/__init__.py to create a package that will house the explicit graph construction module alongside the existing stream-capture-based implementation. Ref: #1317 Made-with: Cursor --- cuda_core/cuda/core/{_graph.py => _graph/__init__.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cuda_core/cuda/core/{_graph.py => _graph/__init__.py} (100%) diff --git a/cuda_core/cuda/core/_graph.py b/cuda_core/cuda/core/_graph/__init__.py similarity index 100% rename from cuda_core/cuda/core/_graph.py rename to cuda_core/cuda/core/_graph/__init__.py From ee55795183cc127910e6926485c4f2705b4db596 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 4 Mar 2026 09:09:09 -0800 Subject: [PATCH 02/23] Added GraphHandle to RAII module. --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 25 +++++++++++++++++++ cuda_core/cuda/core/_cpp/resource_handles.hpp | 25 +++++++++++++++++++ cuda_core/cuda/core/_resource_handles.pxd | 7 ++++++ cuda_core/cuda/core/_resource_handles.pyx | 11 ++++++++ 4 files changed, 68 insertions(+) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 033fa603e7..5655d0e4ac 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -56,6 +56,9 @@ decltype(&cuLibraryLoadData) p_cuLibraryLoadData = nullptr; decltype(&cuLibraryUnload) p_cuLibraryUnload = nullptr; decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr; +// Graph +decltype(&cuGraphDestroy) p_cuGraphDestroy = nullptr; + // GL interop pointers decltype(&cuGraphicsUnregisterResource) p_cuGraphicsUnregisterResource = nullptr; @@ -812,6 +815,28 @@ KernelHandle create_kernel_handle_ref(CUkernel kernel, const 
LibraryHandle& h_li return KernelHandle(box, &box->resource); } +// ============================================================================ +// Graph Handles +// ============================================================================ + +namespace { +struct GraphBox { + CUgraph resource; +}; +} // namespace + +GraphHandle create_graph_handle(CUgraph graph) { + auto box = std::shared_ptr( + new GraphBox{graph}, + [](const GraphBox* b) { + GILReleaseGuard gil; + p_cuGraphDestroy(b->resource); + delete b; + } + ); + return GraphHandle(box, &box->resource); +} + // ============================================================================ // Graphics Resource Handles // ============================================================================ diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index d91f999ac6..c5e1132990 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -72,6 +72,9 @@ extern decltype(&cuLibraryLoadData) p_cuLibraryLoadData; extern decltype(&cuLibraryUnload) p_cuLibraryUnload; extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel; +// Graph +extern decltype(&cuGraphDestroy) p_cuGraphDestroy; + // Graphics interop extern decltype(&cuGraphicsUnregisterResource) p_cuGraphicsUnregisterResource; @@ -107,6 +110,7 @@ using EventHandle = std::shared_ptr; using MemoryPoolHandle = std::shared_ptr; using LibraryHandle = std::shared_ptr; using KernelHandle = std::shared_ptr; +using GraphHandle = std::shared_ptr; using GraphicsResourceHandle = std::shared_ptr; using NvrtcProgramHandle = std::shared_ptr; using NvvmProgramHandle = std::shared_ptr; @@ -311,6 +315,15 @@ KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* na // Use for borrowed kernels. The library handle keeps the library alive. 
KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_library); +// ============================================================================ +// Graph handle functions +// ============================================================================ + +// Wrap an externally-created CUgraph with RAII cleanup. +// When the last reference is released, cuGraphDestroy is called automatically. +// The caller must have already created the graph via cuGraphCreate. +GraphHandle create_graph_handle(CUgraph graph); + // ============================================================================ // Graphics resource handle functions // ============================================================================ @@ -380,6 +393,10 @@ inline CUkernel as_cu(const KernelHandle& h) noexcept { return h ? *h : nullptr; } +inline CUgraph as_cu(const GraphHandle& h) noexcept { + return h ? *h : nullptr; +} + inline CUgraphicsResource as_cu(const GraphicsResourceHandle& h) noexcept { return h ? 
*h : nullptr; } @@ -422,6 +439,10 @@ inline std::intptr_t as_intptr(const KernelHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } +inline std::intptr_t as_intptr(const GraphHandle& h) noexcept { + return reinterpret_cast(as_cu(h)); +} + inline std::intptr_t as_intptr(const GraphicsResourceHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } @@ -477,6 +498,10 @@ inline PyObject* as_py(const KernelHandle& h) noexcept { return detail::make_py("cuda.bindings.driver", "CUkernel", as_intptr(h)); } +inline PyObject* as_py(const GraphHandle& h) noexcept { + return detail::make_py("cuda.bindings.driver", "CUgraph", as_intptr(h)); +} + inline PyObject* as_py(const NvrtcProgramHandle& h) noexcept { return detail::make_py("cuda.bindings.nvrtc", "nvrtcProgram", as_intptr(h)); } diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 7a53a4f25f..7ddf0911de 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -25,6 +25,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUdeviceptr] DevicePtrHandle ctypedef shared_ptr[const cydriver.CUlibrary] LibraryHandle ctypedef shared_ptr[const cydriver.CUkernel] KernelHandle + ctypedef shared_ptr[const cydriver.CUgraph] GraphHandle ctypedef shared_ptr[const cydriver.CUgraphicsResource] GraphicsResourceHandle ctypedef shared_ptr[const cynvrtc.nvrtcProgram] NvrtcProgramHandle ctypedef shared_ptr[const cynvvm.nvvmProgram] NvvmProgramHandle @@ -37,6 +38,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUdeviceptr as_cu(DevicePtrHandle h) noexcept nogil cydriver.CUlibrary as_cu(LibraryHandle h) noexcept nogil cydriver.CUkernel as_cu(KernelHandle h) noexcept nogil + cydriver.CUgraph as_cu(GraphHandle h) noexcept nogil cydriver.CUgraphicsResource as_cu(GraphicsResourceHandle h) noexcept nogil cynvrtc.nvrtcProgram as_cu(NvrtcProgramHandle h) 
noexcept nogil cynvvm.nvvmProgram as_cu(NvvmProgramHandle h) noexcept nogil\ @@ -49,6 +51,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": intptr_t as_intptr(DevicePtrHandle h) noexcept nogil intptr_t as_intptr(LibraryHandle h) noexcept nogil intptr_t as_intptr(KernelHandle h) noexcept nogil + intptr_t as_intptr(GraphHandle h) noexcept nogil intptr_t as_intptr(GraphicsResourceHandle h) noexcept nogil intptr_t as_intptr(NvrtcProgramHandle h) noexcept nogil intptr_t as_intptr(NvvmProgramHandle h) noexcept nogil @@ -61,6 +64,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": object as_py(DevicePtrHandle h) object as_py(LibraryHandle h) object as_py(KernelHandle h) + object as_py(GraphHandle h) object as_py(GraphicsResourceHandle h) object as_py(NvrtcProgramHandle h) object as_py(NvvmProgramHandle h) @@ -136,6 +140,9 @@ cdef KernelHandle create_kernel_handle(const LibraryHandle& h_library, const cha cdef KernelHandle create_kernel_handle_ref( cydriver.CUkernel kernel, const LibraryHandle& h_library) except+ nogil +# Graph handles +cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil + # Graphics resource handles cdef GraphicsResourceHandle create_graphics_resource_handle( cydriver.CUgraphicsResource resource) except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 47d0a86d04..a013f1c7cb 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -25,6 +25,7 @@ from ._resource_handles cimport ( DevicePtrHandle, LibraryHandle, KernelHandle, + GraphHandle, GraphicsResourceHandle, NvrtcProgramHandle, NvvmProgramHandle, @@ -124,6 +125,10 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": KernelHandle create_kernel_handle_ref "cuda_core::create_kernel_handle_ref" ( cydriver.CUkernel kernel, const LibraryHandle& h_library) except+ nogil + # Graph handles + GraphHandle 
create_graph_handle "cuda_core::create_graph_handle" ( + cydriver.CUgraph graph) except+ nogil + # Graphics resource handles GraphicsResourceHandle create_graphics_resource_handle "cuda_core::create_graphics_resource_handle" ( cydriver.CUgraphicsResource resource) except+ nogil @@ -207,6 +212,9 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": void* p_cuLibraryUnload "reinterpret_cast(cuda_core::p_cuLibraryUnload)" void* p_cuLibraryGetKernel "reinterpret_cast(cuda_core::p_cuLibraryGetKernel)" + # Graph + void* p_cuGraphDestroy "reinterpret_cast(cuda_core::p_cuGraphDestroy)" + # Graphics interop void* p_cuGraphicsUnregisterResource "reinterpret_cast(cuda_core::p_cuGraphicsUnregisterResource)" @@ -266,6 +274,9 @@ p_cuLibraryLoadData = _get_driver_fn("cuLibraryLoadData") p_cuLibraryUnload = _get_driver_fn("cuLibraryUnload") p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel") +# Graph +p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy") + # Graphics interop p_cuGraphicsUnregisterResource = _get_driver_fn("cuGraphicsUnregisterResource") From 3b3a71549f2962e484679f2f7f61070927d905a3 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 4 Mar 2026 13:26:28 -0800 Subject: [PATCH 03/23] Add GraphDef and Node classes for explicit graph construction Implement explicit CUDA graph construction API as an alternative to stream capture: - GraphDef: wraps CUgraph with instantiate(), debug_dot_print(), nodes(), and edges() methods - Node: fluent interface for building graphs with launch(), alloc(), free(), and join() methods - GraphAllocOptions: dataclass for allocation options (device, memory_type, peer_access) - Add __repr__, __eq__, __hash__ to GraphDef and Node for debugging and use in collections - Add pred/succ properties to Node for graph traversal - Refactor GraphDebugPrintOptions._to_flags() to share logic between GraphBuilder and GraphDef Made-with: Cursor --- cuda_core/cuda/core/_graph/__init__.py | 73 +-- 
cuda_core/cuda/core/_graph/_graphdef.pxd | 30 ++ cuda_core/cuda/core/_graph/_graphdef.pyx | 520 +++++++++++++++++++++ cuda_core/tests/graph/test_explicit.py | 558 +++++++++++++++++++++++ 4 files changed, 1146 insertions(+), 35 deletions(-) create mode 100644 cuda_core/cuda/core/_graph/_graphdef.pxd create mode 100644 cuda_core/cuda/core/_graph/_graphdef.pyx create mode 100644 cuda_core/tests/graph/test_explicit.py diff --git a/cuda_core/cuda/core/_graph/__init__.py b/cuda_core/cuda/core/_graph/__init__.py index 80482c38ac..14b801137a 100644 --- a/cuda_core/cuda/core/_graph/__init__.py +++ b/cuda_core/cuda/core/_graph/__init__.py @@ -91,6 +91,43 @@ class GraphDebugPrintOptions: extra_topo_info: bool = False conditional_node_params: bool = False + def _to_flags(self) -> int: + """Convert options to CUDA driver API flags (internal use).""" + flags = 0 + if self.verbose: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE + if self.runtime_types: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES + if self.kernel_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS + if self.memcpy_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS + if self.memset_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS + if self.host_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS + if self.event_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS + if self.ext_semas_signal_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS + if self.ext_semas_wait_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS + if self.kernel_node_attributes: + flags |= 
driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES + if self.handles: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES + if self.mem_alloc_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS + if self.mem_free_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS + if self.batch_mem_op_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS + if self.extra_topo_info: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO + if self.conditional_node_params: + flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS + return flags + @dataclass class GraphCompleteOptions: @@ -341,41 +378,7 @@ def debug_dot_print(self, path, options: GraphDebugPrintOptions | None = None): """ if not self._building_ended: raise RuntimeError("Graph has not finished building.") - flags = 0 - if options: - if options.verbose: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE - if options.runtime_types: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES - if options.kernel_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS - if options.memcpy_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS - if options.memset_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS - if options.host_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS - if options.event_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS - if options.ext_semas_signal_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS - if options.ext_semas_wait_node_params: - 
flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS - if options.kernel_node_attributes: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES - if options.handles: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES - if options.mem_alloc_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS - if options.mem_free_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS - if options.batch_mem_op_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS - if options.extra_topo_info: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO - if options.conditional_node_params: - flags |= driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS - + flags = options._to_flags() if options else 0 handle_return(driver.cuGraphDebugDotPrint(self._mnff.graph, path, flags)) def split(self, count: int) -> tuple[GraphBuilder, ...]: diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd new file mode 100644 index 0000000000..03ba795f4b --- /dev/null +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport GraphHandle + + +cdef class GraphDef +cdef class Node + + +cdef class GraphDef: + cdef: + GraphHandle _h_graph + object __weakref__ + + @staticmethod + cdef GraphDef _from_handle(GraphHandle h_graph) + + +cdef class Node: + cdef: + GraphHandle _h_graph + cydriver.CUgraphNode _node # NULL for root + cydriver.CUdeviceptr _dptr # non-zero for alloc nodes + object __weakref__ + + @staticmethod + cdef Node _create(GraphHandle h_graph, cydriver.CUgraphNode node, cydriver.CUdeviceptr dptr) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx new file mode 100644 index 0000000000..99e125bd0e --- /dev/null +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -0,0 +1,520 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Private module for explicit CUDA graph construction. + +This module provides GraphDef and Node classes for building CUDA graphs +explicitly (as opposed to stream capture). Both approaches produce the +same public Graph type for execution. +""" + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core import Device + +from libc.stddef cimport size_t +from libc.stdint cimport uintptr_t +from libc.string cimport memset + +from libcpp.vector cimport vector + +from cuda.bindings cimport cydriver + +from cuda.core._resource_handles cimport ( + GraphHandle, + create_graph_handle, + as_cu, + as_intptr, +) +from cuda.core._module cimport Kernel +from cuda.core._launch_config cimport LaunchConfig +from cuda.core._kernel_arg_handler cimport ParamHolder +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN + +from cuda.core._utils.cuda_utils import driver + + +@dataclass +class GraphAllocOptions: + """Options for graph memory allocation nodes. 
+ + Attributes + ---------- + device : int or Device, optional + The device on which to allocate memory. If None (default), + uses the current CUDA context's device. + memory_type : str, optional + Type of memory to allocate. One of: + + - ``"device"`` (default): Pinned device memory, optimal for GPU kernels. + - ``"host"``: Pinned host memory, accessible from both host and device. + Useful for graphs containing host callback nodes. Note: may not be + supported on all systems/drivers. + - ``"managed"``: Managed/unified memory that automatically migrates + between host and device. Useful for mixed host/device access patterns. + + peer_access : list of int or Device, optional + List of devices that should have read-write access to the + allocated memory. If None (default), only the allocating + device has access. + + Notes + ----- + - IPC (inter-process communication) is not supported for graph + memory allocation nodes per CUDA documentation. + - The allocation uses the device's default memory pool. + """ + + device: int | Device | None = None + memory_type: str = "device" + peer_access: list | None = None + + +cdef class GraphDef: + """Represents a CUDA graph definition (CUgraph). + + A GraphDef is used to construct a graph explicitly by adding nodes + and specifying dependencies. Once construction is complete, call + instantiate() to obtain an executable Graph. 
+ """ + + def __init__(self): + """Create a new empty graph definition.""" + cdef cydriver.CUgraph graph = NULL + with nogil: + HANDLE_RETURN(cydriver.cuGraphCreate(&graph, 0)) + self._h_graph = create_graph_handle(graph) + + @staticmethod + cdef GraphDef _from_handle(GraphHandle h_graph): + """Create a GraphDef from an existing GraphHandle (internal use).""" + cdef GraphDef g = GraphDef.__new__(GraphDef) + g._h_graph = h_graph + return g + + def __repr__(self): + return f"" + + def __eq__(self, other): + if not isinstance(other, GraphDef): + return NotImplemented + return as_intptr(self._h_graph) == as_intptr((other)._h_graph) + + def __hash__(self): + return hash(as_intptr(self._h_graph)) + + @property + def root(self): + """Return the root Node for this graph. + + The root node has no dependencies. Operations added from the root + will be entry points to the graph. + """ + return Node._create(self._h_graph, NULL, 0) + + def instantiate(self): + """Instantiate the graph definition into an executable Graph. + + Returns + ------- + Graph + An executable graph that can be launched on a stream. + """ + from cuda.core._graph import Graph + from cuda.core._utils.cuda_utils import handle_return + + graph_exec = handle_return(driver.cuGraphInstantiate( + driver.CUgraph(as_intptr(self._h_graph)), 0)) + return Graph._init(graph_exec) + + def debug_dot_print(self, path: str, options=None): + """Write a GraphViz DOT representation of the graph to a file. + + Parameters + ---------- + path : str + File path for the DOT output. + options : GraphDebugPrintOptions, optional + Customizable options for the debug print. 
+ """ + from cuda.core._graph import GraphDebugPrintOptions + + cdef unsigned int flags = 0 + if options is not None: + if not isinstance(options, GraphDebugPrintOptions): + raise TypeError("options must be a GraphDebugPrintOptions instance") + flags = options._to_flags() + + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef bytes path_bytes = path.encode('utf-8') + cdef const char* c_path = path_bytes + with nogil: + HANDLE_RETURN(cydriver.cuGraphDebugDotPrint(graph, c_path, flags)) + + def nodes(self): + """Return all nodes in the graph. + + Returns + ------- + tuple of Node + All nodes in the graph (excluding the virtual root). + """ + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef size_t num_nodes = 0 + + with nogil: + HANDLE_RETURN(cydriver.cuGraphGetNodes(graph, NULL, &num_nodes)) + + if num_nodes == 0: + return () + + cdef vector[cydriver.CUgraphNode] nodes_vec + nodes_vec.resize(num_nodes) + with nogil: + HANDLE_RETURN(cydriver.cuGraphGetNodes(graph, nodes_vec.data(), &num_nodes)) + + return tuple(Node._create(self._h_graph, nodes_vec[i], 0) for i in range(num_nodes)) + + def edges(self): + """Return all edges in the graph as (from_node, to_node) pairs. + + Returns + ------- + tuple of tuple + Each element is a (from_node, to_node) pair representing + a dependency edge in the graph. 
+ """ + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef size_t num_edges = 0 + + with nogil: + HANDLE_RETURN(cydriver.cuGraphGetEdges(graph, NULL, NULL, NULL, &num_edges)) + + if num_edges == 0: + return () + + cdef vector[cydriver.CUgraphNode] from_nodes + cdef vector[cydriver.CUgraphNode] to_nodes + from_nodes.resize(num_edges) + to_nodes.resize(num_edges) + with nogil: + HANDLE_RETURN(cydriver.cuGraphGetEdges( + graph, from_nodes.data(), to_nodes.data(), NULL, &num_edges)) + + return tuple( + (Node._create(self._h_graph, from_nodes[i], 0), + Node._create(self._h_graph, to_nodes[i], 0)) + for i in range(num_edges) + ) + + @property + def handle(self): + """Return the underlying CUgraph handle.""" + return driver.CUgraph(as_intptr(self._h_graph)) + + +cdef class Node: + """Represents a node (or potential node) in a CUDA graph. + + Nodes are created by calling methods on other Nodes. Each method + returns a new Node that depends on the current node(s). + + The root node (obtained from GraphDef.root) has a NULL internal + node handle, representing graph entry points. 
+ """ + + @staticmethod + cdef Node _create(GraphHandle h_graph, cydriver.CUgraphNode node, cydriver.CUdeviceptr dptr): + """Internal factory method to create a Node.""" + cdef Node n = Node.__new__(Node) + n._h_graph = h_graph + n._node = node + n._dptr = dptr + return n + + def __repr__(self): + if self._node == NULL: + return "" + if self._dptr != 0: + return f"self._node:x} dptr=0x{self._dptr:x}>" + return f"self._node:x}>" + + def __eq__(self, other): + if not isinstance(other, Node): + return NotImplemented + cdef Node o = other + return (as_intptr(self._h_graph) == as_intptr(o._h_graph) and + self._node == o._node) + + def __hash__(self): + return hash((as_intptr(self._h_graph), self._node)) + + @property + def graph(self): + """Return the GraphDef this node belongs to.""" + return GraphDef._from_handle(self._h_graph) + + @property + def dptr(self): + """Return the device pointer for allocation nodes. + + Returns 0 for non-allocation nodes. + """ + return self._dptr + + @property + def pred(self): + """Return the predecessor nodes (dependencies) of this node. + + Returns + ------- + tuple of Node + The nodes that this node depends on. + """ + if self._node == NULL: + return () + + cdef size_t num_deps = 0 + cdef cydriver.CUgraphNode node = self._node + + with nogil: + HANDLE_RETURN(cydriver.cuGraphNodeGetDependencies(node, NULL, NULL, &num_deps)) + + if num_deps == 0: + return () + + cdef vector[cydriver.CUgraphNode] deps + deps.resize(num_deps) + with nogil: + HANDLE_RETURN(cydriver.cuGraphNodeGetDependencies(node, deps.data(), NULL, &num_deps)) + + return tuple(Node._create(self._h_graph, deps[i], 0) for i in range(num_deps)) + + @property + def succ(self): + """Return the successor nodes (dependents) of this node. + + Returns + ------- + tuple of Node + The nodes that depend on this node. 
+ """ + if self._node == NULL: + return () + + cdef size_t num_deps = 0 + cdef cydriver.CUgraphNode node = self._node + + with nogil: + HANDLE_RETURN(cydriver.cuGraphNodeGetDependentNodes(node, NULL, NULL, &num_deps)) + + if num_deps == 0: + return () + + cdef vector[cydriver.CUgraphNode] deps + deps.resize(num_deps) + with nogil: + HANDLE_RETURN(cydriver.cuGraphNodeGetDependentNodes(node, deps.data(), NULL, &num_deps)) + + return tuple(Node._create(self._h_graph, deps[i], 0) for i in range(num_deps)) + + def launch(self, config, kernel, *args): + """Add a kernel launch node depending on this node. + + Parameters + ---------- + config : LaunchConfig + Launch configuration (grid, block, shared memory, etc.) + kernel : Kernel + The kernel to launch. + *args + Kernel arguments. + + Returns + ------- + Node + A new Node representing the kernel launch. + """ + cdef LaunchConfig conf = config + cdef Kernel ker = kernel + cdef ParamHolder ker_args = ParamHolder(args) + + cdef cydriver.CUDA_KERNEL_NODE_PARAMS node_params + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + + if self._node != NULL: + deps = &self._node + num_deps = 1 + + node_params.kern = as_cu(ker._h_kernel) + node_params.func = NULL + node_params.gridDimX = conf.grid[0] + node_params.gridDimY = conf.grid[1] + node_params.gridDimZ = conf.grid[2] + node_params.blockDimX = conf.block[0] + node_params.blockDimY = conf.block[1] + node_params.blockDimZ = conf.block[2] + node_params.sharedMemBytes = conf.shmem_size + node_params.kernelParams = (ker_args.ptr) + node_params.extra = NULL + node_params.ctx = NULL + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddKernelNode(&new_node, graph, deps, num_deps, &node_params)) + + return Node._create(self._h_graph, new_node, 0) + + def join(self, *nodes): + """Create an empty node that depends on this node and all given nodes. 
+ + This is used to synchronize multiple branches of execution. + + Parameters + ---------- + *nodes : Node + Additional nodes to depend on. + + Returns + ------- + Node + A new Node that depends on all input nodes. + """ + cdef vector[cydriver.CUgraphNode] deps + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef Node other + cdef cydriver.CUgraphNode* deps_ptr = NULL + cdef size_t num_deps = 0 + + if self._node != NULL: + deps.push_back(self._node) + for other in nodes: + if (other)._node != NULL: + deps.push_back((other)._node) + + num_deps = deps.size() + if num_deps > 0: + deps_ptr = deps.data() + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddEmptyNode(&new_node, graph, deps_ptr, num_deps)) + + return Node._create(self._h_graph, new_node, 0) + + def alloc(self, size_t size, options: GraphAllocOptions | None = None): + """Add a memory allocation node depending on this node. + + Parameters + ---------- + size : int + Number of bytes to allocate. + options : GraphAllocOptions, optional + Allocation options. If None, allocates on the current device. + + Returns + ------- + Node + A new Node representing the allocation. Access the allocated + device pointer via the dptr property. 
+ """ + cdef int device_id + cdef cydriver.CUdevice dev + + if options is None or options.device is None: + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) + device_id = dev + else: + device_id = getattr(options.device, 'device_id', options.device) + + cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS alloc_params + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + + if self._node != NULL: + deps = &self._node + num_deps = 1 + + cdef vector[cydriver.CUmemAccessDesc] access_descs + cdef int peer_id + + if options is not None and options.peer_access is not None: + for peer_dev in options.peer_access: + peer_id = getattr(peer_dev, 'device_id', peer_dev) + access_descs.push_back(cydriver.CUmemAccessDesc_st( + cydriver.CUmemLocation_st( + cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE, + peer_id + ), + cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + )) + + cdef str memory_type = "device" + if options is not None and options.memory_type is not None: + memory_type = options.memory_type + + memset(&alloc_params, 0, sizeof(alloc_params)) + alloc_params.poolProps.handleTypes = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + alloc_params.bytesize = size + + if memory_type == "device": + alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + alloc_params.poolProps.location.id = device_id + elif memory_type == "host": + alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + alloc_params.poolProps.location.id = 0 + elif memory_type == "managed": + alloc_params.poolProps.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED + 
alloc_params.poolProps.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + alloc_params.poolProps.location.id = device_id + else: + raise ValueError(f"Invalid memory_type: {memory_type!r}. " + "Must be 'device', 'host', or 'managed'.") + + if access_descs.size() > 0: + alloc_params.accessDescs = access_descs.data() + alloc_params.accessDescCount = access_descs.size() + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddMemAllocNode(&new_node, graph, deps, num_deps, &alloc_params)) + + return Node._create(self._h_graph, new_node, alloc_params.dptr) + + def free(self, dptr): + """Add a memory free node depending on this node. + + Parameters + ---------- + dptr : int + Device pointer to free (typically from Node.dptr of an alloc node). + + Returns + ------- + Node + A new Node representing the free operation. + """ + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + cdef cydriver.CUdeviceptr c_dptr = dptr + + if self._node != NULL: + deps = &self._node + num_deps = 1 + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddMemFreeNode(&new_node, graph, deps, num_deps, c_dptr)) + + return Node._create(self._h_graph, new_node, 0) diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py new file mode 100644 index 0000000000..7d922911c9 --- /dev/null +++ b/cuda_core/tests/graph/test_explicit.py @@ -0,0 +1,558 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +"""Tests for explicit CUDA graph construction (GraphDef and Node).""" + +import itertools +import tempfile +from pathlib import Path + +import pytest +from helpers.graph_kernels import compile_common_kernels + +from cuda.core import Device, LaunchConfig +from cuda.core._graph import GraphDebugPrintOptions +from cuda.core._graph._graphdef import GraphAllocOptions, GraphDef, Node + +ALLOC_SIZE = 1024 + + +# ============================================================================= +# Fixtures - Sample objects +# ============================================================================= + + +@pytest.fixture +def sample_graphdef(init_cuda): + """A sample GraphDef.""" + return GraphDef() + + +@pytest.fixture +def sample_graphdef_alt(init_cuda): + """An alternate GraphDef (for inequality testing).""" + return GraphDef() + + +@pytest.fixture +def sample_root_node(sample_graphdef): + """A root Node (virtual, NULL handle).""" + return sample_graphdef.root + + +@pytest.fixture +def sample_root_node_alt(sample_graphdef_alt): + """An alternate root Node from different graph.""" + return sample_graphdef_alt.root + + +@pytest.fixture +def sample_empty_node(sample_graphdef): + """An empty Node (join node).""" + return sample_graphdef.root.join() + + +@pytest.fixture +def sample_empty_node_alt(sample_graphdef): + """An alternate empty Node from same graph.""" + return sample_graphdef.root.join() + + +@pytest.fixture +def sample_alloc_node(sample_graphdef): + """An allocation Node.""" + return sample_graphdef.root.alloc(ALLOC_SIZE) + + +@pytest.fixture +def sample_alloc_node_alt(sample_graphdef): + """An alternate allocation Node from same graph.""" + return sample_graphdef.root.alloc(ALLOC_SIZE) + + +@pytest.fixture +def sample_kernel_node(sample_graphdef, init_cuda): + """A kernel launch Node.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, 
block=1) + return sample_graphdef.root.launch(config, kernel) + + +@pytest.fixture +def dot_file(): + """Temporary DOT file path, cleaned up after test.""" + path = Path(tempfile.mktemp(suffix=".dot")) + yield path + path.unlink(missing_ok=True) + + +# ============================================================================= +# Type groupings +# ============================================================================= + +# All types that support __hash__ +HASH_TYPES = [ + "sample_graphdef", + "sample_root_node", + "sample_empty_node", + "sample_alloc_node", +] + +# All types that support __eq__ +EQ_TYPES = [ + "sample_graphdef", + "sample_root_node", + "sample_empty_node", + "sample_alloc_node", +] + +# All types (for repr testing) +ALL_TYPES = [ + "sample_graphdef", + "sample_root_node", + "sample_empty_node", + "sample_alloc_node", + "sample_kernel_node", +] + +# Pairs of distinct objects for inequality testing (a != b) +DISTINCT_PAIRS = [ + ("sample_graphdef", "sample_graphdef_alt"), + ("sample_root_node", "sample_root_node_alt"), + ("sample_empty_node", "sample_empty_node_alt"), + ("sample_alloc_node", "sample_alloc_node_alt"), +] + +# Repr patterns +REPR_PATTERNS = [ + ("sample_graphdef", r""), + ("sample_root_node", r""), + ("sample_empty_node", r""), + ("sample_alloc_node", r""), + ("sample_kernel_node", r""), +] + + +# ============================================================================= +# Hash tests +# ============================================================================= + + +@pytest.mark.parametrize("fixture_name", HASH_TYPES) +def test_hash_consistent(fixture_name, request): + """Hash is consistent across multiple calls.""" + obj = request.getfixturevalue(fixture_name) + assert hash(obj) == hash(obj) + + +@pytest.mark.parametrize("a_name,b_name", DISTINCT_PAIRS) +def test_hash_distinct(a_name, b_name, request): + """Distinct objects have different hashes.""" + obj_a = request.getfixturevalue(a_name) + obj_b = 
request.getfixturevalue(b_name) + assert hash(obj_a) != hash(obj_b) + + +# ============================================================================= +# Equality tests (identity-based) +# ============================================================================= + + +@pytest.mark.parametrize("fixture_name", EQ_TYPES) +def test_equals_self(fixture_name, request): + """Object equals itself.""" + obj = request.getfixturevalue(fixture_name) + assert obj == obj + + +@pytest.mark.parametrize("fixture_name", EQ_TYPES) +def test_not_equal_to_other_types(fixture_name, request): + """Object not equal to unrelated types.""" + obj = request.getfixturevalue(fixture_name) + assert obj.__eq__("string") is NotImplemented + assert obj.__eq__(42) is NotImplemented + assert obj.__eq__(None) is NotImplemented + + +@pytest.mark.parametrize("a_name,b_name", DISTINCT_PAIRS) +def test_distinct_objects_not_equal(a_name, b_name, request): + """Distinct objects of same type are not equal.""" + obj_a = request.getfixturevalue(a_name) + obj_b = request.getfixturevalue(b_name) + assert obj_a is not obj_b + assert obj_a != obj_b + + +@pytest.mark.parametrize("a_name,b_name", list(itertools.combinations(EQ_TYPES, 2))) +def test_cross_type_equality_by_identity(a_name, b_name, request): + """Cross-type equality: equal iff same object identity.""" + obj_a = request.getfixturevalue(a_name) + obj_b = request.getfixturevalue(b_name) + if obj_a is obj_b: + assert obj_a == obj_b + else: + assert obj_a != obj_b + + +# ============================================================================= +# Collection usage tests +# ============================================================================= + + +@pytest.mark.parametrize("fixture_name", HASH_TYPES) +def test_usable_in_set(fixture_name, request): + """Object can be added to a set.""" + obj = request.getfixturevalue(fixture_name) + s = {obj} + assert obj in s + + +@pytest.mark.parametrize("fixture_name", HASH_TYPES) +def 
test_usable_as_dict_key(fixture_name, request): + """Object can be used as dictionary key.""" + obj = request.getfixturevalue(fixture_name) + d = {obj: "value"} + assert d[obj] == "value" + + +# ============================================================================= +# Repr tests +# ============================================================================= + + +@pytest.mark.parametrize("fixture_name,pattern", REPR_PATTERNS) +def test_repr_format(fixture_name, pattern, request): + """repr() matches expected pattern.""" + import re + + obj = request.getfixturevalue(fixture_name) + assert re.fullmatch(pattern, repr(obj)) + + +# ============================================================================= +# GraphDef-specific tests +# ============================================================================= + + +def test_graphdef_handle_valid(sample_graphdef): + """GraphDef has a valid non-null handle.""" + assert sample_graphdef.handle is not None + assert int(sample_graphdef.handle) != 0 + + +def test_graphdef_root_returns_node(sample_graphdef): + """GraphDef.root returns a Node instance.""" + assert isinstance(sample_graphdef.root, Node) + + +def test_graphdef_root_is_virtual(sample_graphdef): + """Root node is virtual (no pred/succ).""" + root = sample_graphdef.root + assert root.pred == () + assert root.succ == () + + +# ============================================================================= +# Node property tests +# ============================================================================= + + +def test_node_graph_property(sample_graphdef): + """Node.graph returns the parent GraphDef.""" + node = sample_graphdef.root.join() + assert node.graph == sample_graphdef + + +def test_node_dptr_zero_for_non_alloc(sample_empty_node): + """Non-alloc nodes have dptr=0.""" + assert sample_empty_node.dptr == 0 + + +def test_node_dptr_nonzero_for_alloc(sample_alloc_node): + """Alloc nodes have non-zero dptr.""" + assert sample_alloc_node.dptr != 0 + + +# 
============================================================================= +# Graph building: join +# ============================================================================= + + +def test_join_from_root(sample_graphdef): + """Join from root creates entry node with no predecessors.""" + node = sample_graphdef.root.join() + assert isinstance(node, Node) + assert len(node.pred) == 0 + + +def test_join_single_dependency(sample_graphdef): + """Join from a node creates dependency.""" + n1 = sample_graphdef.root.join() + n2 = n1.join() + assert n1 in n2.pred + assert len(n2.pred) == 1 + + +@pytest.mark.parametrize("num_deps", [2, 3, 5]) +def test_join_multiple_dependencies(sample_graphdef, num_deps): + """Join N nodes creates node depending on all.""" + nodes = [sample_graphdef.root.join() for _ in range(num_deps)] + joined = nodes[0].join(*nodes[1:]) + assert set(joined.pred) == set(nodes) + + +# ============================================================================= +# Graph building: alloc/free +# ============================================================================= + + +def test_alloc_returns_valid_dptr(sample_graphdef): + """Alloc returns node with valid device pointer.""" + node = sample_graphdef.root.alloc(ALLOC_SIZE) + assert node.dptr != 0 + + +def test_alloc_zero_size_fails(sample_graphdef): + """Alloc with zero size raises error (CUDA limitation).""" + from cuda.core._utils.cuda_utils import CUDAError + + with pytest.raises(CUDAError): + sample_graphdef.root.alloc(0) + + +def test_free_creates_dependency(sample_graphdef): + """Free node depends on its predecessor.""" + alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + free = alloc.free(alloc.dptr) + assert alloc in free.pred + assert free.dptr == 0 + + +def test_alloc_free_chain(sample_graphdef): + """Alloc and free can be chained.""" + a1 = sample_graphdef.root.alloc(ALLOC_SIZE) + a2 = a1.alloc(ALLOC_SIZE) + f2 = a2.free(a2.dptr) + f1 = f2.free(a1.dptr) + assert a1 in a2.pred + assert a2 
in f2.pred + assert f2 in f1.pred + + +# ============================================================================= +# Allocation options +# ============================================================================= + + +@pytest.mark.parametrize("memory_type", ["device", "managed"]) +def test_alloc_memory_type(sample_graphdef, memory_type): + """Allocation succeeds for supported memory types.""" + options = GraphAllocOptions(memory_type=memory_type) + node = sample_graphdef.root.alloc(ALLOC_SIZE, options) + assert node.dptr != 0 + + +def test_alloc_memory_type_invalid(sample_graphdef): + """Invalid memory type raises ValueError.""" + options = GraphAllocOptions(memory_type="invalid") + with pytest.raises(ValueError, match="Invalid memory_type"): + sample_graphdef.root.alloc(ALLOC_SIZE, options) + + +@pytest.mark.parametrize( + "device_spec", + [ + pytest.param(lambda d: d.device_id, id="device_id"), + pytest.param(lambda d: d, id="Device_object"), + ], +) +def test_alloc_device_option(sample_graphdef, device_spec): + """Device can be specified as int or Device object.""" + device = Device() + options = GraphAllocOptions(device=device_spec(device)) + node = sample_graphdef.root.alloc(ALLOC_SIZE, options) + assert node.dptr != 0 + + +def test_alloc_peer_access(mempool_device_x2): + """Allocation with peer access list succeeds.""" + d0, d1 = mempool_device_x2 + g = GraphDef() + options = GraphAllocOptions(device=d0.device_id, peer_access=[d1.device_id]) + node = g.root.alloc(ALLOC_SIZE, options) + assert node.dptr != 0 + + +# ============================================================================= +# Graph traversal: nodes, edges, pred, succ +# ============================================================================= + + +def test_empty_graph_has_no_nodes(sample_graphdef): + """Empty graph returns no nodes.""" + assert sample_graphdef.nodes() == () + + +def test_empty_graph_has_no_edges(sample_graphdef): + """Empty graph returns no edges.""" + assert 
sample_graphdef.edges() == () + + +def test_nodes_returns_all_nodes(sample_graphdef): + """nodes() returns all added nodes.""" + n1 = sample_graphdef.root.join() + n2 = sample_graphdef.root.join() + n3 = n1.join(n2) + nodes = sample_graphdef.nodes() + assert len(nodes) == 3 + assert set(nodes) == {n1, n2, n3} + + +def test_edges_returns_dependency_pairs(sample_graphdef): + """edges() returns (from, to) pairs for all dependencies.""" + n1 = sample_graphdef.root.join() + n2 = n1.join() + edges = sample_graphdef.edges() + assert (n1, n2) in edges + + +def test_edges_multiple(sample_graphdef): + """edges() with fan-in topology.""" + n1 = sample_graphdef.root.join() + n2 = sample_graphdef.root.join() + n3 = n1.join(n2) + edges = sample_graphdef.edges() + assert len(edges) == 2 + assert (n1, n3) in edges + assert (n2, n3) in edges + + +@pytest.mark.parametrize("direction", ["pred", "succ"]) +def test_traversal_single(sample_graphdef, direction): + """Single predecessor/successor relationship.""" + n1 = sample_graphdef.root.join() + n2 = n1.join() + if direction == "pred": + assert n1 in n2.pred + assert len(n2.pred) == 1 + else: + assert n2 in n1.succ + assert len(n1.succ) == 1 + + +@pytest.mark.parametrize("direction", ["pred", "succ"]) +def test_traversal_multiple(sample_graphdef, direction): + """Multiple predecessors/successors.""" + if direction == "pred": + n1 = sample_graphdef.root.join() + n2 = sample_graphdef.root.join() + n3 = n1.join(n2) + assert set(n3.pred) == {n1, n2} + else: + n1 = sample_graphdef.root.join() + n2 = n1.join() + n3 = n1.join() + assert set(n1.succ) == {n2, n3} + + +# ============================================================================= +# Kernel launch +# ============================================================================= + + +def test_launch_creates_node(sample_graphdef, init_cuda): + """launch() creates a kernel node.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = 
LaunchConfig(grid=1, block=1) + node = sample_graphdef.root.launch(config, kernel) + assert isinstance(node, Node) + assert node.dptr == 0 + + +def test_launch_chain_dependencies(sample_graphdef, init_cuda): + """Chained launches create correct dependencies.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + n1 = sample_graphdef.root.launch(config, kernel) + n2 = n1.launch(config, kernel) + n3 = n2.launch(config, kernel) + assert n1 in n2.pred + assert n2 in n3.pred + assert n1 not in n3.pred + + +# ============================================================================= +# Graph instantiation and execution +# ============================================================================= + + +def test_instantiate_empty_graph(sample_graphdef): + """Empty graph can be instantiated.""" + graph = sample_graphdef.instantiate() + assert graph is not None + + +def test_instantiate_with_nodes(sample_graphdef): + """Graph with nodes can be instantiated.""" + sample_graphdef.root.join() + sample_graphdef.root.join() + graph = sample_graphdef.instantiate() + assert graph is not None + + +def test_instantiate_and_execute_kernel(sample_graphdef, init_cuda): + """Graph with kernel can be instantiated and executed.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + sample_graphdef.root.launch(config, kernel) + graph = sample_graphdef.instantiate() + + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + +def test_instantiate_and_execute_alloc_free(sample_graphdef): + """Graph with alloc/free can be executed.""" + alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + alloc.free(alloc.dptr) + graph = sample_graphdef.instantiate() + + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + +# 
============================================================================= +# Debug output +# ============================================================================= + + +def test_debug_dot_print_creates_file(sample_graphdef, dot_file): + """debug_dot_print writes a DOT file.""" + sample_graphdef.root.join() + sample_graphdef.debug_dot_print(str(dot_file)) + assert dot_file.exists() + content = dot_file.read_text() + assert "digraph" in content + + +def test_debug_dot_print_with_options(sample_graphdef, dot_file): + """debug_dot_print accepts GraphDebugPrintOptions.""" + sample_graphdef.root.join() + options = GraphDebugPrintOptions(verbose=True, handles=True) + sample_graphdef.debug_dot_print(str(dot_file), options) + assert dot_file.exists() + + +def test_debug_dot_print_invalid_options(sample_graphdef, dot_file): + """debug_dot_print rejects invalid options type.""" + sample_graphdef.root.join() + with pytest.raises(TypeError, match="options must be a GraphDebugPrintOptions"): + sample_graphdef.debug_dot_print(str(dot_file), "invalid") From 7559d2faec160410d599c1ae1be3d722d48ec81f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 4 Mar 2026 18:57:32 -0800 Subject: [PATCH 04/23] Add Node class hierarchy with type-specific properties and parameterized tests Introduce AllocNode, KernelNode, EmptyNode, FreeNode subclasses with properties populated from the CUDA driver API. AllocNode exposes dptr, bytesize, device_id, memory_type, peer_access, and options; KernelNode exposes grid, block, shmem_size, kernel, and config. Node.pred/succ results are cached with automatic invalidation in builder methods. Restructure test_explicit.py around GraphSpec (topology) and NodeSpec (type + expected attributes) so that adding a new node type requires only a builder function and one _NODE_SPECS entry. Move object protocol tests to test_object_protocols.py for all node subclasses including FreeNode and KernelNode. 
Made-with: Cursor --- cuda_core/cuda/core/_graph/_graphdef.pxd | 61 ++- cuda_core/cuda/core/_graph/_graphdef.pyx | 392 ++++++++++++-- cuda_core/tests/graph/test_explicit.py | 642 ++++++++++++----------- cuda_core/tests/test_object_protocols.py | 124 +++++ 4 files changed, 868 insertions(+), 351 deletions(-) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index 03ba795f4b..0557f5a15b 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -2,12 +2,18 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.stddef cimport size_t + from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport GraphHandle cdef class GraphDef cdef class Node +cdef class EmptyNode(Node) +cdef class KernelNode(Node) +cdef class AllocNode(Node) +cdef class FreeNode(Node) cdef class GraphDef: @@ -23,8 +29,59 @@ cdef class Node: cdef: GraphHandle _h_graph cydriver.CUgraphNode _node # NULL for root - cydriver.CUdeviceptr _dptr # non-zero for alloc nodes + tuple _pred_cache + tuple _succ_cache object __weakref__ @staticmethod - cdef Node _create(GraphHandle h_graph, cydriver.CUgraphNode node, cydriver.CUdeviceptr dptr) + cdef Node _create(GraphHandle h_graph, cydriver.CUgraphNode node) + + +cdef class EmptyNode(Node): + @staticmethod + cdef EmptyNode _create_impl(GraphHandle h_graph, cydriver.CUgraphNode node) + + +cdef class KernelNode(Node): + cdef: + tuple _grid + tuple _block + unsigned int _shmem_size + cydriver.CUkernel _kern + + @staticmethod + cdef KernelNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + tuple grid, tuple block, unsigned int shmem_size, + cydriver.CUkernel kern) + + @staticmethod + cdef KernelNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + + +cdef class AllocNode(Node): + cdef: + cydriver.CUdeviceptr _dptr + size_t _bytesize + int _device_id + str _memory_type + tuple _peer_access + + @staticmethod + cdef 
AllocNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUdeviceptr dptr, size_t bytesize, + int device_id, str memory_type, tuple peer_access) + + @staticmethod + cdef AllocNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + + +cdef class FreeNode(Node): + cdef: + cydriver.CUdeviceptr _dptr + + @staticmethod + cdef FreeNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUdeviceptr dptr) + + @staticmethod + cdef FreeNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 99e125bd0e..493278de66 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -5,9 +5,16 @@ """ Private module for explicit CUDA graph construction. -This module provides GraphDef and Node classes for building CUDA graphs -explicitly (as opposed to stream capture). Both approaches produce the -same public Graph type for execution. +This module provides GraphDef and a Node class hierarchy for building CUDA +graphs explicitly (as opposed to stream capture). Both approaches produce +the same public Graph type for execution. + +Node hierarchy: + Node (base — also used for the virtual root) + ├── EmptyNode (synchronization / join point) + ├── KernelNode (kernel launch) + ├── AllocNode (memory allocation, exposes dptr and bytesize) + └── FreeNode (memory free, exposes dptr) """ from dataclasses import dataclass @@ -114,7 +121,10 @@ cdef class GraphDef: The root node has no dependencies. Operations added from the root will be entry points to the graph. """ - return Node._create(self._h_graph, NULL, 0) + cdef Node n = Node.__new__(Node) + n._h_graph = self._h_graph + n._node = NULL + return n def instantiate(self): """Instantiate the graph definition into an executable Graph. 
@@ -177,7 +187,7 @@ cdef class GraphDef: with nogil: HANDLE_RETURN(cydriver.cuGraphGetNodes(graph, nodes_vec.data(), &num_nodes)) - return tuple(Node._create(self._h_graph, nodes_vec[i], 0) for i in range(num_nodes)) + return tuple(Node._create(self._h_graph, nodes_vec[i]) for i in range(num_nodes)) def edges(self): """Return all edges in the graph as (from_node, to_node) pairs. @@ -206,8 +216,8 @@ cdef class GraphDef: graph, from_nodes.data(), to_nodes.data(), NULL, &num_edges)) return tuple( - (Node._create(self._h_graph, from_nodes[i], 0), - Node._create(self._h_graph, to_nodes[i], 0)) + (Node._create(self._h_graph, from_nodes[i]), + Node._create(self._h_graph, to_nodes[i])) for i in range(num_edges) ) @@ -218,29 +228,45 @@ cdef class GraphDef: cdef class Node: - """Represents a node (or potential node) in a CUDA graph. + """Base class for all graph nodes. Nodes are created by calling methods on other Nodes. Each method - returns a new Node that depends on the current node(s). + returns a new Node subclass that depends on the current node(s). - The root node (obtained from GraphDef.root) has a NULL internal - node handle, representing graph entry points. + The root node (obtained from GraphDef.root) is a base Node with a + NULL internal handle, representing graph entry points. 
     """

     @staticmethod
-    cdef Node _create(GraphHandle h_graph, cydriver.CUgraphNode node, cydriver.CUdeviceptr dptr):
-        """Internal factory method to create a Node."""
-        cdef Node n = Node.__new__(Node)
-        n._h_graph = h_graph
-        n._node = node
-        n._dptr = dptr
-        return n
+    cdef Node _create(GraphHandle h_graph, cydriver.CUgraphNode node):
+        """Factory: dispatch to the right subclass based on node type."""
+        if node == NULL:
+            n = Node.__new__(Node)
+            (<Node>n)._h_graph = h_graph
+            (<Node>n)._node = NULL
+            return n
+
+        cdef cydriver.CUgraphNodeType node_type
+        with nogil:
+            HANDLE_RETURN(cydriver.cuGraphNodeGetType(node, &node_type))
+
+        if node_type == cydriver.CU_GRAPH_NODE_TYPE_EMPTY:
+            return EmptyNode._create_impl(h_graph, node)
+        elif node_type == cydriver.CU_GRAPH_NODE_TYPE_KERNEL:
+            return KernelNode._create_from_driver(h_graph, node)
+        elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEM_ALLOC:
+            return AllocNode._create_from_driver(h_graph, node)
+        elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEM_FREE:
+            return FreeNode._create_from_driver(h_graph, node)
+        else:
+            n = Node.__new__(Node)
+            (<Node>n)._h_graph = h_graph
+            (<Node>n)._node = node
+            return n

     def __repr__(self):
         if self._node == NULL:
             return "<Node root>"
-        if self._dptr != 0:
-            return f"<Node handle=0x{self._node:x} dptr=0x{self._dptr:x}>"
         return f"<Node handle=0x{self._node:x}>"

     def __eq__(self, other):
@@ -253,30 +279,45 @@ cdef class Node:
     def __hash__(self):
         return hash((as_intptr(self._h_graph), self._node))

+    @property
+    def type(self):
+        """Return the CUDA graph node type.
+
+        Returns
+        -------
+        CUgraphNodeType or None
+            The node type enum value, or None for the virtual root node.
+ """ + if self._node == NULL: + return None + cdef cydriver.CUgraphNodeType node_type + with nogil: + HANDLE_RETURN(cydriver.cuGraphNodeGetType(self._node, &node_type)) + return driver.CUgraphNodeType(node_type) + @property def graph(self): """Return the GraphDef this node belongs to.""" return GraphDef._from_handle(self._h_graph) - @property - def dptr(self): - """Return the device pointer for allocation nodes. - - Returns 0 for non-allocation nodes. - """ - return self._dptr - @property def pred(self): """Return the predecessor nodes (dependencies) of this node. + Results are cached since a node's dependencies are immutable + once created. + Returns ------- tuple of Node The nodes that this node depends on. """ + if self._pred_cache is not None: + return self._pred_cache + if self._node == NULL: - return () + self._pred_cache = () + return self._pred_cache cdef size_t num_deps = 0 cdef cydriver.CUgraphNode node = self._node @@ -285,26 +326,35 @@ cdef class Node: HANDLE_RETURN(cydriver.cuGraphNodeGetDependencies(node, NULL, NULL, &num_deps)) if num_deps == 0: - return () + self._pred_cache = () + return self._pred_cache cdef vector[cydriver.CUgraphNode] deps deps.resize(num_deps) with nogil: HANDLE_RETURN(cydriver.cuGraphNodeGetDependencies(node, deps.data(), NULL, &num_deps)) - return tuple(Node._create(self._h_graph, deps[i], 0) for i in range(num_deps)) + self._pred_cache = tuple(Node._create(self._h_graph, deps[i]) for i in range(num_deps)) + return self._pred_cache @property def succ(self): """Return the successor nodes (dependents) of this node. + Results are cached and automatically invalidated when new + dependent nodes are added via builder methods. + Returns ------- tuple of Node The nodes that depend on this node. 
""" + if self._succ_cache is not None: + return self._succ_cache + if self._node == NULL: - return () + self._succ_cache = () + return self._succ_cache cdef size_t num_deps = 0 cdef cydriver.CUgraphNode node = self._node @@ -313,14 +363,16 @@ cdef class Node: HANDLE_RETURN(cydriver.cuGraphNodeGetDependentNodes(node, NULL, NULL, &num_deps)) if num_deps == 0: - return () + self._succ_cache = () + return self._succ_cache cdef vector[cydriver.CUgraphNode] deps deps.resize(num_deps) with nogil: HANDLE_RETURN(cydriver.cuGraphNodeGetDependentNodes(node, deps.data(), NULL, &num_deps)) - return tuple(Node._create(self._h_graph, deps[i], 0) for i in range(num_deps)) + self._succ_cache = tuple(Node._create(self._h_graph, deps[i]) for i in range(num_deps)) + return self._succ_cache def launch(self, config, kernel, *args): """Add a kernel launch node depending on this node. @@ -336,8 +388,8 @@ cdef class Node: Returns ------- - Node - A new Node representing the kernel launch. + KernelNode + A new KernelNode representing the kernel launch. """ cdef LaunchConfig conf = config cdef Kernel ker = kernel @@ -369,7 +421,11 @@ cdef class Node: with nogil: HANDLE_RETURN(cydriver.cuGraphAddKernelNode(&new_node, graph, deps, num_deps, &node_params)) - return Node._create(self._h_graph, new_node, 0) + self._succ_cache = None + return KernelNode._create_with_params( + self._h_graph, new_node, + conf.grid, conf.block, conf.shmem_size, + node_params.kern) def join(self, *nodes): """Create an empty node that depends on this node and all given nodes. @@ -383,8 +439,8 @@ cdef class Node: Returns ------- - Node - A new Node that depends on all input nodes. + EmptyNode + A new EmptyNode that depends on all input nodes. 
         """
         cdef vector[cydriver.CUgraphNode] deps
         cdef cydriver.CUgraphNode new_node = NULL
@@ -406,7 +462,10 @@ cdef class Node:
         with nogil:
             HANDLE_RETURN(cydriver.cuGraphAddEmptyNode(&new_node, graph, deps_ptr, num_deps))

-        return Node._create(self._h_graph, new_node, 0)
+        self._succ_cache = None
+        for other in nodes:
+            (<Node>other)._succ_cache = None
+        return EmptyNode._create_impl(self._h_graph, new_node)

     def alloc(self, size_t size, options: GraphAllocOptions | None = None):
         """Add a memory allocation node depending on this node.
@@ -420,8 +479,8 @@ cdef class Node:

         Returns
         -------
-        Node
-            A new Node representing the allocation. Access the allocated
+        AllocNode
+            A new AllocNode representing the allocation. Access the allocated
             device pointer via the dptr property.
         """
         cdef int device_id
@@ -446,10 +505,12 @@ cdef class Node:

         cdef vector[cydriver.CUmemAccessDesc] access_descs
         cdef int peer_id
+        cdef list peer_ids = []

         if options is not None and options.peer_access is not None:
             for peer_dev in options.peer_access:
                 peer_id = getattr(peer_dev, 'device_id', peer_dev)
+                peer_ids.append(peer_id)
                 access_descs.push_back(cydriver.CUmemAccessDesc_st(
                     cydriver.CUmemLocation_st(
                         cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE,
@@ -489,7 +550,10 @@ cdef class Node:
         with nogil:
             HANDLE_RETURN(cydriver.cuGraphAddMemAllocNode(&new_node, graph, deps, num_deps, &alloc_params))

-        return Node._create(self._h_graph, new_node, alloc_params.dptr)
+        self._succ_cache = None
+        return AllocNode._create_with_params(
+            self._h_graph, new_node, alloc_params.dptr, size,
+            device_id, memory_type, tuple(peer_ids))

     def free(self, dptr):
         """Add a memory free node depending on this node.
@@ -497,12 +561,12 @@ cdef class Node:
         Parameters
         ----------
         dptr : int
-            Device pointer to free (typically from Node.dptr of an alloc node).
+            Device pointer to free (typically from AllocNode.dptr).

         Returns
         -------
-        Node
-            A new Node representing the free operation.
+        FreeNode
+            A new FreeNode representing the free operation.
         """
         cdef cydriver.CUgraphNode new_node = NULL
         cdef cydriver.CUgraph graph = as_cu(self._h_graph)
@@ -517,4 +581,238 @@ cdef class Node:
         with nogil:
             HANDLE_RETURN(cydriver.cuGraphAddMemFreeNode(&new_node, graph, deps, num_deps, c_dptr))

-        return Node._create(self._h_graph, new_node, 0)
+        self._succ_cache = None
+        return FreeNode._create_with_params(self._h_graph, new_node, c_dptr)
+
+
+# =============================================================================
+# Node subclasses
+# =============================================================================
+
+
+cdef class EmptyNode(Node):
+    """A synchronization / join node with no operation."""
+
+    @staticmethod
+    cdef EmptyNode _create_impl(GraphHandle h_graph, cydriver.CUgraphNode node):
+        cdef EmptyNode n = EmptyNode.__new__(EmptyNode)
+        n._h_graph = h_graph
+        n._node = node
+        return n
+
+    def __repr__(self):
+        return f"<EmptyNode handle=0x{self._node:x}>"
+
+
+cdef class KernelNode(Node):
+    """A kernel launch node.
+
+    Properties
+    ----------
+    grid : tuple of int
+        Grid dimensions (gridDimX, gridDimY, gridDimZ).
+    block : tuple of int
+        Block dimensions (blockDimX, blockDimY, blockDimZ).
+    shmem_size : int
+        Dynamic shared memory size in bytes.
+    kernel : Kernel
+        The kernel object for this launch node.
+    config : LaunchConfig
+        A LaunchConfig reconstructed from this node's parameters.
+    """
+
+    @staticmethod
+    cdef KernelNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node,
+                                        tuple grid, tuple block, unsigned int shmem_size,
+                                        cydriver.CUkernel kern):
+        """Create from known params (called by launch() builder)."""
+        cdef KernelNode n = KernelNode.__new__(KernelNode)
+        n._h_graph = h_graph
+        n._node = node
+        n._grid = grid
+        n._block = block
+        n._shmem_size = shmem_size
+        n._kern = kern
+        return n
+
+    @staticmethod
+    cdef KernelNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node):
+        """Create by fetching params from the driver (called by _create factory)."""
+        cdef cydriver.CUDA_KERNEL_NODE_PARAMS params
+        with nogil:
+            HANDLE_RETURN(cydriver.cuGraphKernelNodeGetParams(node, &params))
+        return KernelNode._create_with_params(
+            h_graph, node,
+            (params.gridDimX, params.gridDimY, params.gridDimZ),
+            (params.blockDimX, params.blockDimY, params.blockDimZ),
+            params.sharedMemBytes,
+            params.kern)
+
+    def __repr__(self):
+        return f"<KernelNode handle=0x{self._node:x}>"
+
+    @property
+    def grid(self):
+        """Grid dimensions as a 3-tuple (gridDimX, gridDimY, gridDimZ)."""
+        return self._grid
+
+    @property
+    def block(self):
+        """Block dimensions as a 3-tuple (blockDimX, blockDimY, blockDimZ)."""
+        return self._block
+
+    @property
+    def shmem_size(self):
+        """Dynamic shared memory size in bytes."""
+        return self._shmem_size
+
+    @property
+    def kernel(self):
+        """The Kernel object for this launch node."""
+        return Kernel.from_handle(self._kern)
+
+    @property
+    def config(self):
+        """A LaunchConfig reconstructed from this node's grid, block, and shmem_size.
+
+        Note: cluster dimensions and cooperative_launch are not preserved
+        by the CUDA driver's kernel node params, so they are not included.
+        """
+        return LaunchConfig(grid=self._grid, block=self._block,
+                            shmem_size=self._shmem_size)
+
+
+cdef class AllocNode(Node):
+    """A memory allocation node.
+
+    Properties
+    ----------
+    dptr : int
+        The device pointer for the allocation.
+    bytesize : int
+        The number of bytes allocated.
+    device_id : int
+        The device on which the allocation was made.
+    memory_type : str
+        The type of memory allocated (``"device"``, ``"host"``, or ``"managed"``).
+    peer_access : tuple of int
+        Device IDs that have read-write access to this allocation.
+    options : GraphAllocOptions
+        A GraphAllocOptions reconstructed from this node's parameters.
+    """
+
+    @staticmethod
+    cdef AllocNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node,
+                                       cydriver.CUdeviceptr dptr, size_t bytesize,
+                                       int device_id, str memory_type, tuple peer_access):
+        """Create from known params (called by alloc() builder)."""
+        cdef AllocNode n = AllocNode.__new__(AllocNode)
+        n._h_graph = h_graph
+        n._node = node
+        n._dptr = dptr
+        n._bytesize = bytesize
+        n._device_id = device_id
+        n._memory_type = memory_type
+        n._peer_access = peer_access
+        return n
+
+    @staticmethod
+    cdef AllocNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node):
+        """Create by fetching params from the driver (called by _create factory)."""
+        cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS params
+        with nogil:
+            HANDLE_RETURN(cydriver.cuGraphMemAllocNodeGetParams(node, &params))
+
+        cdef str memory_type
+        if params.poolProps.allocType == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED:
+            if params.poolProps.location.type == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST:
+                memory_type = "host"
+            else:
+                memory_type = "device"
+        elif params.poolProps.allocType == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
+            memory_type = "managed"
+        else:
+            memory_type = "device"
+
+        cdef list peer_ids = []
+        cdef size_t i
+        for i in range(params.accessDescCount):
+            peer_ids.append(params.accessDescs[i].location.id)
+
+        return AllocNode._create_with_params(
+            h_graph, node, params.dptr, params.bytesize,
+            params.poolProps.location.id, memory_type, tuple(peer_ids))
+
+    def __repr__(self):
+        return (f"<AllocNode 0x{self._node:x} "
+                f"dptr=0x{self._dptr:x} size={self._bytesize}>")
+
+    @property
+    def dptr(self):
+        """The device pointer for the allocation."""
+        return self._dptr
+
+    @property
+    def bytesize(self):
+        """The number of bytes allocated."""
+        return self._bytesize
+
+    @property
+    def device_id(self):
+        """The device on which the allocation was made."""
+        return self._device_id
+
+    @property
+    def memory_type(self):
+        """The type of memory: ``"device"``, ``"host"``, or ``"managed"``."""
+        return self._memory_type
+
+    @property
+    def peer_access(self):
+        """Device IDs with read-write access to this allocation."""
+        return self._peer_access
+
+    @property
+    def options(self):
+        """A GraphAllocOptions reconstructed from this node's parameters."""
+        return GraphAllocOptions(
+            device=self._device_id,
+            memory_type=self._memory_type,
+            peer_access=list(self._peer_access) if self._peer_access else None,
+        )
+
+
+cdef class FreeNode(Node):
+    """A memory free node.
+
+    Properties
+    ----------
+    dptr : int
+        The device pointer being freed.
+ """ + + @staticmethod + cdef FreeNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUdeviceptr dptr): + """Create from known params (called by free() builder).""" + cdef FreeNode n = FreeNode.__new__(FreeNode) + n._h_graph = h_graph + n._node = node + n._dptr = dptr + return n + + @staticmethod + cdef FreeNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUdeviceptr dptr + with nogil: + HANDLE_RETURN(cydriver.cuGraphMemFreeNodeGetParams(node, &dptr)) + return FreeNode._create_with_params(h_graph, node, dptr) + + def __repr__(self): + return f"self._node:x} dptr=0x{self._dptr:x}>" + + @property + def dptr(self): + """The device pointer being freed.""" + return self._dptr diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py index 7d922911c9..706bf06e6b 100644 --- a/cuda_core/tests/graph/test_explicit.py +++ b/cuda_core/tests/graph/test_explicit.py @@ -3,235 +3,395 @@ """Tests for explicit CUDA graph construction (GraphDef and Node).""" -import itertools -import tempfile -from pathlib import Path +from collections.abc import Callable +from dataclasses import dataclass, field import pytest from helpers.graph_kernels import compile_common_kernels from cuda.core import Device, LaunchConfig from cuda.core._graph import GraphDebugPrintOptions -from cuda.core._graph._graphdef import GraphAllocOptions, GraphDef, Node +from cuda.core._graph._graphdef import ( + AllocNode, + EmptyNode, + FreeNode, + GraphAllocOptions, + GraphDef, + KernelNode, + Node, +) ALLOC_SIZE = 1024 # ============================================================================= -# Fixtures - Sample objects +# GraphSpec — representative graph topologies # ============================================================================= -@pytest.fixture -def sample_graphdef(init_cuda): - """A sample GraphDef.""" - 
return GraphDef() +@dataclass +class GraphSpec: + """Describes a graph topology with expected structural properties.""" + name: str + graphdef: GraphDef + named_nodes: dict = field(default_factory=dict) + expected_edges: set = field(default_factory=set) + expected_pred: dict = field(default_factory=dict) + expected_succ: dict = field(default_factory=dict) -@pytest.fixture -def sample_graphdef_alt(init_cuda): - """An alternate GraphDef (for inequality testing).""" - return GraphDef() +def _build_empty(): + """No nodes, no edges.""" + return GraphSpec("empty", GraphDef()) -@pytest.fixture -def sample_root_node(sample_graphdef): - """A root Node (virtual, NULL handle).""" - return sample_graphdef.root +def _build_single(): + """One alloc node, no edges.""" + g = GraphDef() + a = g.root.alloc(ALLOC_SIZE) + return GraphSpec( + "single", + g, + named_nodes={"a": a}, + expected_edges=set(), + expected_pred={"a": set()}, + expected_succ={"a": set()}, + ) + + +def _build_chain(): + """Linear chain: a -> b -> c.""" + g = GraphDef() + a = g.root.alloc(ALLOC_SIZE) + b = a.alloc(ALLOC_SIZE) + c = b.alloc(ALLOC_SIZE) + return GraphSpec( + "chain", + g, + named_nodes={"a": a, "b": b, "c": c}, + expected_edges={("a", "b"), ("b", "c")}, + expected_pred={"a": set(), "b": {"a"}, "c": {"b"}}, + expected_succ={"a": {"b"}, "b": {"c"}, "c": set()}, + ) + + +def _build_fan_out(): + """One node feeds three: a -> {b, c, d}.""" + g = GraphDef() + a = g.root.alloc(ALLOC_SIZE) + b = a.alloc(ALLOC_SIZE) + c = a.alloc(ALLOC_SIZE) + d = a.alloc(ALLOC_SIZE) + return GraphSpec( + "fan_out", + g, + named_nodes={"a": a, "b": b, "c": c, "d": d}, + expected_edges={("a", "b"), ("a", "c"), ("a", "d")}, + expected_pred={"a": set(), "b": {"a"}, "c": {"a"}, "d": {"a"}}, + expected_succ={"a": {"b", "c", "d"}, "b": set(), "c": set(), "d": set()}, + ) + + +def _build_fan_in(): + """Three entry nodes merge: {a, b, c} -> d (join).""" + g = GraphDef() + a = g.root.alloc(ALLOC_SIZE) + b = g.root.alloc(ALLOC_SIZE) 
+ c = g.root.alloc(ALLOC_SIZE) + d = a.join(b, c) + return GraphSpec( + "fan_in", + g, + named_nodes={"a": a, "b": b, "c": c, "d": d}, + expected_edges={("a", "d"), ("b", "d"), ("c", "d")}, + expected_pred={"a": set(), "b": set(), "c": set(), "d": {"a", "b", "c"}}, + expected_succ={"a": {"d"}, "b": {"d"}, "c": {"d"}, "d": set()}, + ) + + +def _build_diamond(): + """Diamond: a -> {b, c} -> d (join).""" + g = GraphDef() + a = g.root.alloc(ALLOC_SIZE) + b = a.alloc(ALLOC_SIZE) + c = a.alloc(ALLOC_SIZE) + d = b.join(c) + return GraphSpec( + "diamond", + g, + named_nodes={"a": a, "b": b, "c": c, "d": d}, + expected_edges={("a", "b"), ("a", "c"), ("b", "d"), ("c", "d")}, + expected_pred={"a": set(), "b": {"a"}, "c": {"a"}, "d": {"b", "c"}}, + expected_succ={"a": {"b", "c"}, "b": {"d"}, "c": {"d"}, "d": set()}, + ) + + +def _build_disconnected(): + """Two independent entry nodes: a, b.""" + g = GraphDef() + a = g.root.alloc(ALLOC_SIZE) + b = g.root.alloc(ALLOC_SIZE) + return GraphSpec( + "disconnected", + g, + named_nodes={"a": a, "b": b}, + expected_edges=set(), + expected_pred={"a": set(), "b": set()}, + expected_succ={"a": set(), "b": set()}, + ) + + +_ALL_BUILDERS = [ + pytest.param(_build_empty, id="empty"), + pytest.param(_build_single, id="single"), + pytest.param(_build_chain, id="chain"), + pytest.param(_build_fan_out, id="fan_out"), + pytest.param(_build_fan_in, id="fan_in"), + pytest.param(_build_diamond, id="diamond"), + pytest.param(_build_disconnected, id="disconnected"), +] -@pytest.fixture -def sample_root_node_alt(sample_graphdef_alt): - """An alternate root Node from different graph.""" - return sample_graphdef_alt.root +_NONEMPTY_BUILDERS = [p for p in _ALL_BUILDERS if p.values[0] is not _build_empty] -@pytest.fixture -def sample_empty_node(sample_graphdef): - """An empty Node (join node).""" - return sample_graphdef.root.join() +@pytest.fixture(params=_ALL_BUILDERS) +def graph_spec(request, init_cuda): + return request.param() -@pytest.fixture -def 
sample_empty_node_alt(sample_graphdef): - """An alternate empty Node from same graph.""" - return sample_graphdef.root.join() +@pytest.fixture(params=_NONEMPTY_BUILDERS) +def nonempty_graph_spec(request, init_cuda): + return request.param() -@pytest.fixture -def sample_alloc_node(sample_graphdef): - """An allocation Node.""" - return sample_graphdef.root.alloc(ALLOC_SIZE) +# ============================================================================= +# NodeSpec — representative node types +# ============================================================================= -@pytest.fixture -def sample_alloc_node_alt(sample_graphdef): - """An alternate allocation Node from same graph.""" - return sample_graphdef.root.alloc(ALLOC_SIZE) +@dataclass +class NodeSpec: + """Describes a node type with expected properties. + The builder returns (node, expected_attrs) where expected_attrs maps + property names to expected values. Callable values are treated as + predicates (e.g., ``lambda v: v != 0``). 
+ """ -@pytest.fixture -def sample_kernel_node(sample_graphdef, init_cuda): - """A kernel launch Node.""" + name: str + expected_class: type + expected_type_name: str + builder: Callable[[GraphDef], tuple[Node, dict]] + + +def _build_empty_node(g): + a = g.root.alloc(ALLOC_SIZE) + b = g.root.alloc(ALLOC_SIZE) + return a.join(b), {} + + +def _build_kernel_node(g): mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") - config = LaunchConfig(grid=1, block=1) - return sample_graphdef.root.launch(config, kernel) + config = LaunchConfig(grid=(2, 3, 1), block=(32, 4, 1), shmem_size=128) + entry = g.root.alloc(ALLOC_SIZE) + node = entry.launch(config, kernel) + return node, { + "grid": (2, 3, 1), + "block": (32, 4, 1), + "shmem_size": 128, + "kernel": kernel, + "config": config, + } + + +def _build_alloc_node(g): + device_id = Device().device_id + entry = g.root.alloc(ALLOC_SIZE) + node = entry.alloc(ALLOC_SIZE) + return node, { + "dptr": lambda v: v != 0, + "bytesize": ALLOC_SIZE, + "device_id": device_id, + "memory_type": "device", + "peer_access": (), + "options": GraphAllocOptions(device=device_id, memory_type="device"), + } + + +def _build_alloc_managed_node(g): + device_id = Device().device_id + options = GraphAllocOptions(memory_type="managed") + entry = g.root.alloc(ALLOC_SIZE) + node = entry.alloc(ALLOC_SIZE, options) + return node, { + "dptr": lambda v: v != 0, + "bytesize": ALLOC_SIZE, + "device_id": device_id, + "memory_type": "managed", + "peer_access": (), + "options": GraphAllocOptions(device=device_id, memory_type="managed"), + } + + +def _build_free_node(g): + alloc = g.root.alloc(ALLOC_SIZE) + node = alloc.free(alloc.dptr) + return node, { + "dptr": alloc.dptr, + } + + +_NODE_SPECS = [ + pytest.param(NodeSpec("empty", EmptyNode, "CU_GRAPH_NODE_TYPE_EMPTY", _build_empty_node), id="empty"), + pytest.param(NodeSpec("kernel", KernelNode, "CU_GRAPH_NODE_TYPE_KERNEL", _build_kernel_node), id="kernel"), + pytest.param(NodeSpec("alloc", 
AllocNode, "CU_GRAPH_NODE_TYPE_MEM_ALLOC", _build_alloc_node), id="alloc"), + pytest.param( + NodeSpec("alloc_managed", AllocNode, "CU_GRAPH_NODE_TYPE_MEM_ALLOC", _build_alloc_managed_node), + id="alloc_managed", + ), + pytest.param(NodeSpec("free", FreeNode, "CU_GRAPH_NODE_TYPE_MEM_FREE", _build_free_node), id="free"), +] -@pytest.fixture -def dot_file(): - """Temporary DOT file path, cleaned up after test.""" - path = Path(tempfile.mktemp(suffix=".dot")) - yield path - path.unlink(missing_ok=True) +@pytest.fixture(params=_NODE_SPECS) +def node_spec(request, init_cuda): + spec = request.param + g = GraphDef() + node, expected_attrs = spec.builder(g) + return spec, g, node, expected_attrs # ============================================================================= -# Type groupings +# Fixtures # ============================================================================= -# All types that support __hash__ -HASH_TYPES = [ - "sample_graphdef", - "sample_root_node", - "sample_empty_node", - "sample_alloc_node", -] - -# All types that support __eq__ -EQ_TYPES = [ - "sample_graphdef", - "sample_root_node", - "sample_empty_node", - "sample_alloc_node", -] -# All types (for repr testing) -ALL_TYPES = [ - "sample_graphdef", - "sample_root_node", - "sample_empty_node", - "sample_alloc_node", - "sample_kernel_node", -] +@pytest.fixture +def sample_graphdef(init_cuda): + """A sample GraphDef for standalone tests.""" + return GraphDef() -# Pairs of distinct objects for inequality testing (a != b) -DISTINCT_PAIRS = [ - ("sample_graphdef", "sample_graphdef_alt"), - ("sample_root_node", "sample_root_node_alt"), - ("sample_empty_node", "sample_empty_node_alt"), - ("sample_alloc_node", "sample_alloc_node_alt"), -] -# Repr patterns -REPR_PATTERNS = [ - ("sample_graphdef", r""), - ("sample_root_node", r""), - ("sample_empty_node", r""), - ("sample_alloc_node", r""), - ("sample_kernel_node", r""), -] +@pytest.fixture +def dot_file(tmp_path): + """Temporary DOT file path, cleaned 
up after test.""" + path = tmp_path / "graph.dot" + yield path + path.unlink(missing_ok=True) # ============================================================================= -# Hash tests +# Topology tests (parameterized over graph specs) # ============================================================================= -@pytest.mark.parametrize("fixture_name", HASH_TYPES) -def test_hash_consistent(fixture_name, request): - """Hash is consistent across multiple calls.""" - obj = request.getfixturevalue(fixture_name) - assert hash(obj) == hash(obj) - +def test_node_count(graph_spec): + """Graph contains the expected number of nodes.""" + assert len(graph_spec.graphdef.nodes()) == len(graph_spec.named_nodes) -@pytest.mark.parametrize("a_name,b_name", DISTINCT_PAIRS) -def test_hash_distinct(a_name, b_name, request): - """Distinct objects have different hashes.""" - obj_a = request.getfixturevalue(a_name) - obj_b = request.getfixturevalue(b_name) - assert hash(obj_a) != hash(obj_b) - -# ============================================================================= -# Equality tests (identity-based) -# ============================================================================= +def test_nodes_match(nonempty_graph_spec): + """nodes() returns exactly the expected nodes.""" + spec = nonempty_graph_spec + assert set(spec.graphdef.nodes()) == set(spec.named_nodes.values()) -@pytest.mark.parametrize("fixture_name", EQ_TYPES) -def test_equals_self(fixture_name, request): - """Object equals itself.""" - obj = request.getfixturevalue(fixture_name) - assert obj == obj +def test_edges(graph_spec): + """edges() returns exactly the expected edges.""" + spec = graph_spec + node_to_name = {v: k for k, v in spec.named_nodes.items()} + actual = {(node_to_name[a], node_to_name[b]) for a, b in spec.graphdef.edges()} + assert actual == spec.expected_edges -@pytest.mark.parametrize("fixture_name", EQ_TYPES) -def test_not_equal_to_other_types(fixture_name, request): - """Object not equal to 
unrelated types.""" - obj = request.getfixturevalue(fixture_name) - assert obj.__eq__("string") is NotImplemented - assert obj.__eq__(42) is NotImplemented - assert obj.__eq__(None) is NotImplemented +def test_pred(nonempty_graph_spec): + """Each node has the expected predecessors.""" + spec = nonempty_graph_spec + node_to_name = {v: k for k, v in spec.named_nodes.items()} + for name, node in spec.named_nodes.items(): + actual = {node_to_name[p] for p in node.pred} + assert actual == spec.expected_pred[name], f"pred mismatch for node {name}" -@pytest.mark.parametrize("a_name,b_name", DISTINCT_PAIRS) -def test_distinct_objects_not_equal(a_name, b_name, request): - """Distinct objects of same type are not equal.""" - obj_a = request.getfixturevalue(a_name) - obj_b = request.getfixturevalue(b_name) - assert obj_a is not obj_b - assert obj_a != obj_b +def test_succ(nonempty_graph_spec): + """Each node has the expected successors.""" + spec = nonempty_graph_spec + node_to_name = {v: k for k, v in spec.named_nodes.items()} + for name, node in spec.named_nodes.items(): + actual = {node_to_name[s] for s in node.succ} + assert actual == spec.expected_succ[name], f"succ mismatch for node {name}" -@pytest.mark.parametrize("a_name,b_name", list(itertools.combinations(EQ_TYPES, 2))) -def test_cross_type_equality_by_identity(a_name, b_name, request): - """Cross-type equality: equal iff same object identity.""" - obj_a = request.getfixturevalue(a_name) - obj_b = request.getfixturevalue(b_name) - if obj_a is obj_b: - assert obj_a == obj_b - else: - assert obj_a != obj_b +def test_node_graph_property(nonempty_graph_spec): + """Every node's .graph property returns the parent GraphDef.""" + spec = nonempty_graph_spec + for name, node in spec.named_nodes.items(): + assert node.graph == spec.graphdef, f"graph mismatch for node {name}" # ============================================================================= -# Collection usage tests +# Node type tests (parameterized over node 
specs) # ============================================================================= -@pytest.mark.parametrize("fixture_name", HASH_TYPES) -def test_usable_in_set(fixture_name, request): - """Object can be added to a set.""" - obj = request.getfixturevalue(fixture_name) - s = {obj} - assert obj in s +def test_node_isinstance(node_spec): + """Node is an instance of the expected subclass.""" + spec, g, node, _ = node_spec + assert isinstance(node, spec.expected_class) + assert isinstance(node, Node) -@pytest.mark.parametrize("fixture_name", HASH_TYPES) -def test_usable_as_dict_key(fixture_name, request): - """Object can be used as dictionary key.""" - obj = request.getfixturevalue(fixture_name) - d = {obj: "value"} - assert d[obj] == "value" +def test_node_type_property(node_spec): + """Node.type returns the expected CUgraphNodeType.""" + spec, g, node, _ = node_spec + assert node.type.name == spec.expected_type_name -# ============================================================================= -# Repr tests -# ============================================================================= +def test_node_type_preserved_by_nodes(node_spec): + """Node type is preserved when retrieved via graphdef.nodes().""" + spec, g, node, _ = node_spec + all_nodes = g.nodes() + matched = [n for n in all_nodes if n == node] + assert len(matched) == 1 + assert isinstance(matched[0], spec.expected_class) + + +def test_node_type_preserved_by_pred_succ(node_spec): + """Node type is preserved when retrieved via pred/succ traversal.""" + spec, g, node, _ = node_spec + for predecessor in node.pred: + matched = [s for s in predecessor.succ if s == node] + assert len(matched) == 1 + assert isinstance(matched[0], spec.expected_class) -@pytest.mark.parametrize("fixture_name,pattern", REPR_PATTERNS) -def test_repr_format(fixture_name, pattern, request): - """repr() matches expected pattern.""" - import re +def test_node_attrs(node_spec): + """Type-specific attributes have expected values after 
construction.""" + spec, g, node, expected_attrs = node_spec + if not expected_attrs: + pytest.skip("no type-specific attributes") + for attr, expected in expected_attrs.items(): + actual = getattr(node, attr) + if callable(expected): + assert expected(actual), f"{spec.name}.{attr}: check failed (got {actual})" + else: + assert actual == expected, f"{spec.name}.{attr}: expected {expected}, got {actual}" - obj = request.getfixturevalue(fixture_name) - assert re.fullmatch(pattern, repr(obj)) + +def test_node_attrs_preserved_by_nodes(node_spec): + """Type-specific attributes survive round-trip through graphdef.nodes().""" + spec, g, node, expected_attrs = node_spec + if not expected_attrs: + pytest.skip("no type-specific attributes") + retrieved = next(n for n in g.nodes() if n == node) + for attr in expected_attrs: + assert getattr(retrieved, attr) == getattr(node, attr), f"{spec.name}.{attr} not preserved by nodes()" # ============================================================================= -# GraphDef-specific tests +# GraphDef basics # ============================================================================= @@ -247,72 +407,18 @@ def test_graphdef_root_returns_node(sample_graphdef): def test_graphdef_root_is_virtual(sample_graphdef): - """Root node is virtual (no pred/succ).""" + """Root node is virtual (no pred/succ, type is None).""" root = sample_graphdef.root assert root.pred == () assert root.succ == () + assert root.type is None # ============================================================================= -# Node property tests -# ============================================================================= - - -def test_node_graph_property(sample_graphdef): - """Node.graph returns the parent GraphDef.""" - node = sample_graphdef.root.join() - assert node.graph == sample_graphdef - - -def test_node_dptr_zero_for_non_alloc(sample_empty_node): - """Non-alloc nodes have dptr=0.""" - assert sample_empty_node.dptr == 0 - - -def 
test_node_dptr_nonzero_for_alloc(sample_alloc_node): - """Alloc nodes have non-zero dptr.""" - assert sample_alloc_node.dptr != 0 - - -# ============================================================================= -# Graph building: join +# Alloc/free API # ============================================================================= -def test_join_from_root(sample_graphdef): - """Join from root creates entry node with no predecessors.""" - node = sample_graphdef.root.join() - assert isinstance(node, Node) - assert len(node.pred) == 0 - - -def test_join_single_dependency(sample_graphdef): - """Join from a node creates dependency.""" - n1 = sample_graphdef.root.join() - n2 = n1.join() - assert n1 in n2.pred - assert len(n2.pred) == 1 - - -@pytest.mark.parametrize("num_deps", [2, 3, 5]) -def test_join_multiple_dependencies(sample_graphdef, num_deps): - """Join N nodes creates node depending on all.""" - nodes = [sample_graphdef.root.join() for _ in range(num_deps)] - joined = nodes[0].join(*nodes[1:]) - assert set(joined.pred) == set(nodes) - - -# ============================================================================= -# Graph building: alloc/free -# ============================================================================= - - -def test_alloc_returns_valid_dptr(sample_graphdef): - """Alloc returns node with valid device pointer.""" - node = sample_graphdef.root.alloc(ALLOC_SIZE) - assert node.dptr != 0 - - def test_alloc_zero_size_fails(sample_graphdef): """Alloc with zero size raises error (CUDA limitation).""" from cuda.core._utils.cuda_utils import CUDAError @@ -326,7 +432,6 @@ def test_free_creates_dependency(sample_graphdef): alloc = sample_graphdef.root.alloc(ALLOC_SIZE) free = alloc.free(alloc.dptr) assert alloc in free.pred - assert free.dptr == 0 def test_alloc_free_chain(sample_graphdef): @@ -341,18 +446,10 @@ def test_alloc_free_chain(sample_graphdef): # ============================================================================= -# Allocation 
options +# Allocation options (error cases, input variants, multi-GPU) # ============================================================================= -@pytest.mark.parametrize("memory_type", ["device", "managed"]) -def test_alloc_memory_type(sample_graphdef, memory_type): - """Allocation succeeds for supported memory types.""" - options = GraphAllocOptions(memory_type=memory_type) - node = sample_graphdef.root.alloc(ALLOC_SIZE, options) - assert node.dptr != 0 - - def test_alloc_memory_type_invalid(sample_graphdef): """Invalid memory type raises ValueError.""" options = GraphAllocOptions(memory_type="invalid") @@ -376,84 +473,26 @@ def test_alloc_device_option(sample_graphdef, device_spec): def test_alloc_peer_access(mempool_device_x2): - """Allocation with peer access list succeeds.""" + """AllocNode.peer_access reflects requested peers.""" d0, d1 = mempool_device_x2 g = GraphDef() options = GraphAllocOptions(device=d0.device_id, peer_access=[d1.device_id]) node = g.root.alloc(ALLOC_SIZE, options) - assert node.dptr != 0 + assert d1.device_id in node.peer_access # ============================================================================= -# Graph traversal: nodes, edges, pred, succ +# Join API # ============================================================================= -def test_empty_graph_has_no_nodes(sample_graphdef): - """Empty graph returns no nodes.""" - assert sample_graphdef.nodes() == () - - -def test_empty_graph_has_no_edges(sample_graphdef): - """Empty graph returns no edges.""" - assert sample_graphdef.edges() == () - - -def test_nodes_returns_all_nodes(sample_graphdef): - """nodes() returns all added nodes.""" - n1 = sample_graphdef.root.join() - n2 = sample_graphdef.root.join() - n3 = n1.join(n2) - nodes = sample_graphdef.nodes() - assert len(nodes) == 3 - assert set(nodes) == {n1, n2, n3} - - -def test_edges_returns_dependency_pairs(sample_graphdef): - """edges() returns (from, to) pairs for all dependencies.""" - n1 = 
sample_graphdef.root.join() - n2 = n1.join() - edges = sample_graphdef.edges() - assert (n1, n2) in edges - - -def test_edges_multiple(sample_graphdef): - """edges() with fan-in topology.""" - n1 = sample_graphdef.root.join() - n2 = sample_graphdef.root.join() - n3 = n1.join(n2) - edges = sample_graphdef.edges() - assert len(edges) == 2 - assert (n1, n3) in edges - assert (n2, n3) in edges - - -@pytest.mark.parametrize("direction", ["pred", "succ"]) -def test_traversal_single(sample_graphdef, direction): - """Single predecessor/successor relationship.""" - n1 = sample_graphdef.root.join() - n2 = n1.join() - if direction == "pred": - assert n1 in n2.pred - assert len(n2.pred) == 1 - else: - assert n2 in n1.succ - assert len(n1.succ) == 1 - - -@pytest.mark.parametrize("direction", ["pred", "succ"]) -def test_traversal_multiple(sample_graphdef, direction): - """Multiple predecessors/successors.""" - if direction == "pred": - n1 = sample_graphdef.root.join() - n2 = sample_graphdef.root.join() - n3 = n1.join(n2) - assert set(n3.pred) == {n1, n2} - else: - n1 = sample_graphdef.root.join() - n2 = n1.join() - n3 = n1.join() - assert set(n1.succ) == {n2, n3} +@pytest.mark.parametrize("num_branches", [2, 3, 5]) +def test_join_merges_branches(sample_graphdef, num_branches): + """join() with multiple branches creates correct dependencies.""" + branches = [sample_graphdef.root.alloc(ALLOC_SIZE) for _ in range(num_branches)] + joined = branches[0].join(*branches[1:]) + assert isinstance(joined, EmptyNode) + assert set(joined.pred) == set(branches) # ============================================================================= @@ -461,17 +500,16 @@ def test_traversal_multiple(sample_graphdef, direction): # ============================================================================= -def test_launch_creates_node(sample_graphdef, init_cuda): - """launch() creates a kernel node.""" +def test_launch_creates_node(sample_graphdef): + """launch() creates a KernelNode.""" mod = 
compile_common_kernels() kernel = mod.get_kernel("empty_kernel") config = LaunchConfig(grid=1, block=1) node = sample_graphdef.root.launch(config, kernel) - assert isinstance(node, Node) - assert node.dptr == 0 + assert isinstance(node, KernelNode) -def test_launch_chain_dependencies(sample_graphdef, init_cuda): +def test_launch_chain_dependencies(sample_graphdef): """Chained launches create correct dependencies.""" mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") @@ -485,7 +523,7 @@ def test_launch_chain_dependencies(sample_graphdef, init_cuda): # ============================================================================= -# Graph instantiation and execution +# Instantiation and execution # ============================================================================= @@ -497,13 +535,13 @@ def test_instantiate_empty_graph(sample_graphdef): def test_instantiate_with_nodes(sample_graphdef): """Graph with nodes can be instantiated.""" - sample_graphdef.root.join() - sample_graphdef.root.join() + sample_graphdef.root.alloc(ALLOC_SIZE) + sample_graphdef.root.alloc(ALLOC_SIZE) graph = sample_graphdef.instantiate() assert graph is not None -def test_instantiate_and_execute_kernel(sample_graphdef, init_cuda): +def test_instantiate_and_execute_kernel(sample_graphdef): """Graph with kernel can be instantiated and executed.""" mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") @@ -536,7 +574,7 @@ def test_instantiate_and_execute_alloc_free(sample_graphdef): def test_debug_dot_print_creates_file(sample_graphdef, dot_file): """debug_dot_print writes a DOT file.""" - sample_graphdef.root.join() + sample_graphdef.root.alloc(ALLOC_SIZE) sample_graphdef.debug_dot_print(str(dot_file)) assert dot_file.exists() content = dot_file.read_text() @@ -545,7 +583,7 @@ def test_debug_dot_print_creates_file(sample_graphdef, dot_file): def test_debug_dot_print_with_options(sample_graphdef, dot_file): """debug_dot_print accepts 
GraphDebugPrintOptions.""" - sample_graphdef.root.join() + sample_graphdef.root.alloc(ALLOC_SIZE) options = GraphDebugPrintOptions(verbose=True, handles=True) sample_graphdef.debug_dot_print(str(dot_file), options) assert dot_file.exists() @@ -553,6 +591,6 @@ def test_debug_dot_print_with_options(sample_graphdef, dot_file): def test_debug_dot_print_invalid_options(sample_graphdef, dot_file): """debug_dot_print rejects invalid options type.""" - sample_graphdef.root.join() + sample_graphdef.root.alloc(ALLOC_SIZE) with pytest.raises(TypeError, match="options must be a GraphDebugPrintOptions"): sample_graphdef.debug_dot_print(str(dot_file), "invalid") diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index fa35a3887e..f5a16d6acd 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -12,8 +12,10 @@ import weakref import pytest +from helpers.graph_kernels import compile_common_kernels from cuda.core import Buffer, Device, Kernel, LaunchConfig, Program, Stream, system +from cuda.core._graph._graphdef import GraphDef from cuda.core._program import _can_load_generated_ptx # ============================================================================= @@ -199,6 +201,97 @@ def sample_kernel_alt(sample_object_code_alt): return sample_object_code_alt.get_kernel("test_kernel_alt") +# ============================================================================= +# Fixtures - Graph types (GraphDef and Node) +# ============================================================================= + +ALLOC_SIZE = 1024 + + +@pytest.fixture +def sample_graphdef(init_cuda): + """A sample GraphDef.""" + return GraphDef() + + +@pytest.fixture +def sample_graphdef_alt(init_cuda): + """An alternate GraphDef (for inequality testing).""" + return GraphDef() + + +@pytest.fixture +def sample_root_node(sample_graphdef): + """A root Node (virtual, NULL handle).""" + return sample_graphdef.root + + 
+@pytest.fixture +def sample_root_node_alt(sample_graphdef_alt): + """An alternate root Node from different graph.""" + return sample_graphdef_alt.root + + +@pytest.fixture +def sample_empty_node(sample_graphdef): + """An EmptyNode created by merging two branches.""" + a = sample_graphdef.root.alloc(ALLOC_SIZE) + b = sample_graphdef.root.alloc(ALLOC_SIZE) + return a.join(b) + + +@pytest.fixture +def sample_empty_node_alt(sample_graphdef): + """An alternate EmptyNode from same graph.""" + c = sample_graphdef.root.alloc(ALLOC_SIZE) + d = sample_graphdef.root.alloc(ALLOC_SIZE) + return c.join(d) + + +@pytest.fixture +def sample_alloc_node(sample_graphdef): + """An AllocNode.""" + return sample_graphdef.root.alloc(ALLOC_SIZE) + + +@pytest.fixture +def sample_alloc_node_alt(sample_graphdef): + """An alternate AllocNode from same graph.""" + return sample_graphdef.root.alloc(ALLOC_SIZE) + + +@pytest.fixture +def sample_kernel_node(sample_graphdef, init_cuda): + """A KernelNode.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + return sample_graphdef.root.launch(config, kernel) + + +@pytest.fixture +def sample_kernel_node_alt(sample_graphdef, init_cuda): + """An alternate KernelNode from same graph.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + return sample_graphdef.root.launch(config, kernel) + + +@pytest.fixture +def sample_free_node(sample_graphdef): + """A FreeNode.""" + alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + return alloc.free(alloc.dptr) + + +@pytest.fixture +def sample_free_node_alt(sample_graphdef): + """An alternate FreeNode from same graph.""" + alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + return alloc.free(alloc.dptr) + + # ============================================================================= # Type groupings # ============================================================================= @@ 
-213,6 +306,12 @@ def sample_kernel_alt(sample_object_code_alt): "sample_launch_config", "sample_object_code_cubin", "sample_kernel", + "sample_graphdef", + "sample_root_node", + "sample_empty_node", + "sample_alloc_node", + "sample_kernel_node", + "sample_free_node", ] # Types with __eq__ support @@ -225,6 +324,12 @@ def sample_kernel_alt(sample_object_code_alt): "sample_launch_config", "sample_object_code_cubin", "sample_kernel", + "sample_graphdef", + "sample_root_node", + "sample_empty_node", + "sample_alloc_node", + "sample_kernel_node", + "sample_free_node", ] # Types with __weakref__ support @@ -238,6 +343,12 @@ def sample_kernel_alt(sample_object_code_alt): "sample_object_code_cubin", "sample_kernel", "sample_program_nvrtc", + "sample_graphdef", + "sample_root_node", + "sample_empty_node", + "sample_alloc_node", + "sample_kernel_node", + "sample_free_node", ] # Pairs of distinct objects of the same type (for inequality testing) @@ -251,6 +362,12 @@ def sample_kernel_alt(sample_object_code_alt): ("sample_launch_config", "sample_launch_config_alt"), ("sample_object_code_cubin", "sample_object_code_alt"), ("sample_kernel", "sample_kernel_alt"), + ("sample_graphdef", "sample_graphdef_alt"), + ("sample_root_node", "sample_root_node_alt"), + ("sample_empty_node", "sample_empty_node_alt"), + ("sample_alloc_node", "sample_alloc_node_alt"), + ("sample_kernel_node", "sample_kernel_node_alt"), + ("sample_free_node", "sample_free_node_alt"), ] # Types with public from_handle methods and how to create a copy @@ -286,6 +403,13 @@ def sample_kernel_alt(sample_object_code_alt): ("sample_program_nvrtc", r""), ("sample_program_ptx", r""), ("sample_program_nvvm", r""), + # Graph types + ("sample_graphdef", r""), + ("sample_root_node", r""), + ("sample_empty_node", r""), + ("sample_alloc_node", r""), + ("sample_kernel_node", r""), + ("sample_free_node", r""), ] From ae5d70641e22e217d9ceb2019704c3f3de72cf32 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 4 Mar 2026 19:54:26 
-0800 Subject: [PATCH 05/23] Add MemsetNode with shared _parse_fill_value utility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract fill value parsing (int/bytes/buffer protocol → value + element size) from Buffer.fill() into cpdef _parse_fill_value in cuda_utils so it can be reused by both Buffer.fill() and Node.memset(). Add MemsetNode class with properties: dptr, value, element_size, width, height, pitch. Node.memset() builder supports 1D and 2D memset with element sizes 1, 2, and 4. Tests cover all element sizes, 2D memset, instantiate-and-execute, and object protocols. Made-with: Cursor --- cuda_core/cuda/core/_graph/_graphdef.pxd | 20 +++ cuda_core/cuda/core/_graph/_graphdef.pyx | 151 +++++++++++++++++++++- cuda_core/cuda/core/_memory/_buffer.pyx | 80 ++++-------- cuda_core/cuda/core/_utils/cuda_utils.pxd | 4 +- cuda_core/cuda/core/_utils/cuda_utils.pyx | 60 +++++++++ cuda_core/tests/graph/test_explicit.py | 76 +++++++++++ cuda_core/tests/test_object_protocols.py | 19 +++ 7 files changed, 348 insertions(+), 62 deletions(-) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index 0557f5a15b..4f294f7105 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -14,6 +14,7 @@ cdef class EmptyNode(Node) cdef class KernelNode(Node) cdef class AllocNode(Node) cdef class FreeNode(Node) +cdef class MemsetNode(Node) cdef class GraphDef: @@ -85,3 +86,22 @@ cdef class FreeNode(Node): @staticmethod cdef FreeNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + + +cdef class MemsetNode(Node): + cdef: + cydriver.CUdeviceptr _dptr + unsigned int _value + unsigned int _element_size + size_t _width + size_t _height + size_t _pitch + + @staticmethod + cdef MemsetNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUdeviceptr dptr, unsigned int value, + unsigned int element_size, size_t 
width, + size_t height, size_t pitch) + + @staticmethod + cdef MemsetNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 493278de66..f318ed5b88 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -14,7 +14,8 @@ Node hierarchy: ├── EmptyNode (synchronization / join point) ├── KernelNode (kernel launch) ├── AllocNode (memory allocation, exposes dptr and bytesize) - └── FreeNode (memory free, exposes dptr) + ├── FreeNode (memory free, exposes dptr) + └── MemsetNode (memory set, exposes dptr, value, element_size, etc.) """ from dataclasses import dataclass @@ -25,7 +26,7 @@ if TYPE_CHECKING: from libc.stddef cimport size_t from libc.stdint cimport uintptr_t -from libc.string cimport memset +from libc.string cimport memset as c_memset from libcpp.vector cimport vector @@ -40,7 +41,7 @@ from cuda.core._resource_handles cimport ( from cuda.core._module cimport Kernel from cuda.core._launch_config cimport LaunchConfig from cuda.core._kernel_arg_handler cimport ParamHolder -from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value from cuda.core._utils.cuda_utils import driver @@ -258,6 +259,8 @@ cdef class Node: return AllocNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEM_FREE: return FreeNode._create_from_driver(h_graph, node) + elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEMSET: + return MemsetNode._create_from_driver(h_graph, node) else: n = Node.__new__(Node) (n)._h_graph = h_graph @@ -523,7 +526,7 @@ cdef class Node: if options is not None and options.memory_type is not None: memory_type = options.memory_type - memset(&alloc_params, 0, sizeof(alloc_params)) + c_memset(&alloc_params, 0, sizeof(alloc_params)) alloc_params.poolProps.handleTypes = 
cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE alloc_params.bytesize = size @@ -584,6 +587,65 @@ cdef class Node: self._succ_cache = None return FreeNode._create_with_params(self._h_graph, new_node, c_dptr) + def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0): + """Add a memset node depending on this node. + + Parameters + ---------- + dst : int + Destination device pointer. + value : int or buffer-protocol object + Fill value. int for 1-byte fill (range [0, 256)), + or buffer-protocol object of 1, 2, or 4 bytes. + width : int + Width of the row in elements. + height : int, optional + Number of rows (default 1). + pitch : int, optional + Pitch of destination in bytes (default 0, unused if height is 1). + + Returns + ------- + MemsetNode + A new MemsetNode representing the memset operation. + """ + cdef unsigned int val + cdef unsigned int elem_size + val, elem_size = _parse_fill_value(value) + + cdef cydriver.CUDA_MEMSET_NODE_PARAMS memset_params + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + + if self._node != NULL: + deps = &self._node + num_deps = 1 + + cdef cydriver.CUdeviceptr c_dst = dst + cdef cydriver.CUcontext ctx = NULL + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + + c_memset(&memset_params, 0, sizeof(memset_params)) + memset_params.dst = c_dst + memset_params.value = val + memset_params.elementSize = elem_size + memset_params.width = width + memset_params.height = height + memset_params.pitch = pitch + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddMemsetNode( + &new_node, graph, deps, num_deps, + &memset_params, ctx)) + + self._succ_cache = None + return MemsetNode._create_with_params( + self._h_graph, new_node, c_dst, + val, elem_size, width, height, pitch) + # ============================================================================= # Node subclasses @@ -816,3 +878,84 @@ 
cdef class FreeNode(Node): def dptr(self): """The device pointer being freed.""" return self._dptr + + +cdef class MemsetNode(Node): + """A memory set node. + + Properties + ---------- + dptr : int + The destination device pointer. + value : int + The fill value. + element_size : int + Element size in bytes (1, 2, or 4). + width : int + Width of the row in elements. + height : int + Number of rows. + pitch : int + Pitch in bytes (unused if height is 1). + """ + + @staticmethod + cdef MemsetNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUdeviceptr dptr, unsigned int value, + unsigned int element_size, size_t width, + size_t height, size_t pitch): + """Create from known params (called by memset() builder).""" + cdef MemsetNode n = MemsetNode.__new__(MemsetNode) + n._h_graph = h_graph + n._node = node + n._dptr = dptr + n._value = value + n._element_size = element_size + n._width = width + n._height = height + n._pitch = pitch + return n + + @staticmethod + cdef MemsetNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUDA_MEMSET_NODE_PARAMS params + with nogil: + HANDLE_RETURN(cydriver.cuGraphMemsetNodeGetParams(node, ¶ms)) + return MemsetNode._create_with_params( + h_graph, node, params.dst, params.value, + params.elementSize, params.width, params.height, params.pitch) + + def __repr__(self): + return (f"self._node:x} " + f"dptr=0x{self._dptr:x}>") + + @property + def dptr(self): + """The destination device pointer.""" + return self._dptr + + @property + def value(self): + """The fill value.""" + return self._value + + @property + def element_size(self): + """Element size in bytes (1, 2, or 4).""" + return self._element_size + + @property + def width(self): + """Width of the row in elements.""" + return self._width + + @property + def height(self): + """Number of rows.""" + return self._height + + @property + def 
pitch(self): + """Pitch in bytes (unused if height is 1).""" + return self._pitch diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 83009f74ae..a688c2065c 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -5,8 +5,7 @@ from __future__ import annotations cimport cython -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uintptr_t -from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, Py_buffer, PyBUF_SIMPLE +from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver from cuda.core._memory._device_memory_resource import DeviceMemoryResource @@ -25,7 +24,7 @@ from cuda.core._resource_handles cimport ( ) from cuda.core._stream cimport Stream, Stream_accept -from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value import sys from typing import TypeVar @@ -271,27 +270,27 @@ cdef class Buffer: """ cdef Stream s_stream = Stream_accept(stream) - - # Handle int case: 1-byte fill with automatic overflow checking. - if isinstance(value, int): - Buffer_fill_uint8(self, value, s_stream._h_stream) - return - - # Handle bytes case: direct pointer access without intermediate objects. - if isinstance(value, bytes): - Buffer_fill_from_ptr(self, value, len(value), s_stream._h_stream) - return - - # General buffer protocol path using C buffer API. 
- cdef Py_buffer buf - if PyObject_GetBuffer(value, &buf, PyBUF_SIMPLE) != 0: - raise TypeError( - f"value must be an int or support the buffer protocol, got {type(value).__name__}" - ) - try: - Buffer_fill_from_ptr(self, buf.buf, buf.len, s_stream._h_stream) - finally: - PyBuffer_Release(&buf) + cdef unsigned int val + cdef unsigned int elem_size + val, elem_size = _parse_fill_value(value) + + cdef size_t buffer_size = self._size + cdef cydriver.CUdeviceptr dst = as_cu(self._h_ptr) + cdef cydriver.CUstream s = as_cu(s_stream._h_stream) + + if elem_size == 1: + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD8Async(dst, val, buffer_size, s)) + elif elem_size == 2: + if buffer_size & 0x1: + raise ValueError(f"buffer size ({buffer_size}) must be divisible by 2") + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD16Async(dst, val, buffer_size // 2, s)) + elif elem_size == 4: + if buffer_size & 0x3: + raise ValueError(f"buffer size ({buffer_size}) must be divisible by 4") + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD32Async(dst, val, buffer_size // 4, s)) def __dlpack__( self, @@ -569,36 +568,3 @@ cdef inline void Buffer_close(Buffer self, object stream): self._memory_resource = None self._ipc_data = None self._owner = None - - -cdef inline int Buffer_fill_uint8(Buffer self, uint8_t value, StreamHandle h_stream) except? -1: - cdef cydriver.CUdeviceptr ptr = as_cu(self._h_ptr) - cdef cydriver.CUstream s = as_cu(h_stream) - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD8Async(ptr, value, self._size, s)) - return 0 - - -cdef inline int Buffer_fill_from_ptr( - Buffer self, const char* ptr, size_t width, StreamHandle h_stream -) except? 
-1: - cdef size_t buffer_size = self._size - cdef cydriver.CUdeviceptr dst = as_cu(self._h_ptr) - cdef cydriver.CUstream s = as_cu(h_stream) - - if width == 1: - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD8Async(dst, (ptr)[0], buffer_size, s)) - elif width == 2: - if buffer_size & 0x1: - raise ValueError(f"buffer size ({buffer_size}) must be divisible by 2") - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD16Async(dst, (ptr)[0], buffer_size // 2, s)) - elif width == 4: - if buffer_size & 0x3: - raise ValueError(f"buffer size ({buffer_size}) must be divisible by 4") - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD32Async(dst, (ptr)[0], buffer_size // 4, s)) - else: - raise ValueError(f"value must be 1, 2, or 4 bytes, got {width}") - return 0 diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd index a42bbf2dd0..efea18b79b 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd @@ -4,7 +4,7 @@ cimport cpython from cpython.object cimport PyObject -from libc.stdint cimport int64_t, int32_t +from libc.stdint cimport int64_t, int32_t, uint8_t, uint16_t, uint32_t from cuda.bindings cimport cydriver, cynvrtc, cynvvm @@ -31,6 +31,8 @@ cpdef int _check_nvrtc_error(error) except?-1 cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*) +cpdef tuple _parse_fill_value(value) + # Create low-level externs so Cython won't "helpfully" handle reference counting # for us. 
Prefixing with an underscore to distinguish it from the definition in diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx index 734ae32f79..c05aa15615 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -22,6 +22,8 @@ except ImportError: from cuda.bindings.nvvm import nvvmError +from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, Py_buffer, PyBUF_SIMPLE + from cuda.bindings cimport cynvrtc, cynvvm from cuda.core._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS @@ -339,6 +341,64 @@ def reset_fork_warning(): _fork_warning_checked = False +cdef inline tuple _read_fill_ptr(const char* ptr, Py_ssize_t width): + """Extract (value, element_size) from a raw pointer of known width.""" + cdef unsigned int val + if width == 1: + val = (ptr)[0] + elif width == 2: + val = (ptr)[0] + elif width == 4: + val = (ptr)[0] + else: + raise ValueError(f"value must be 1, 2, or 4 bytes, got {width}") + return (val, width) + + +cpdef tuple _parse_fill_value(value): + """Parse a fill/memset value into (raw_value, element_size). + + Parameters + ---------- + value : int or buffer-protocol object + - int: Must be in range [0, 256). Treated as 1-byte fill. + - bytes or buffer-protocol: Must be 1, 2, or 4 bytes. + + Returns + ------- + tuple of (int, int) + (raw_value, element_size) where element_size is 1, 2, or 4. + + Raises + ------ + OverflowError + If int value is outside [0, 256). + TypeError + If value is not an int and does not support the buffer protocol. + ValueError + If value byte length is not 1, 2, or 4. 
+ """ + cdef uint8_t byte_val + cdef Py_buffer buf + + if isinstance(value, int): + byte_val = value + return (byte_val, 1) + + if isinstance(value, bytes): + return _read_fill_ptr(value, len(value)) + + if PyObject_GetBuffer(value, &buf, PyBUF_SIMPLE) != 0: + raise TypeError( + f"value must be an int or support the buffer protocol, " + f"got {type(value).__name__}" + ) + try: + return _read_fill_ptr(buf.buf, buf.len) + finally: + PyBuffer_Release(&buf) + + def check_multiprocessing_start_method(): """Check if multiprocessing start method is 'fork' and warn if so.""" global _fork_warning_checked diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py index 706bf06e6b..3cbf44f5fb 100644 --- a/cuda_core/tests/graph/test_explicit.py +++ b/cuda_core/tests/graph/test_explicit.py @@ -18,6 +18,7 @@ GraphAllocOptions, GraphDef, KernelNode, + MemsetNode, Node, ) @@ -243,6 +244,60 @@ def _build_free_node(g): } +def _build_memset_node(g): + alloc = g.root.alloc(ALLOC_SIZE) + node = alloc.memset(alloc.dptr, 42, ALLOC_SIZE) + return node, { + "dptr": alloc.dptr, + "value": 42, + "element_size": 1, + "width": ALLOC_SIZE, + "height": 1, + "pitch": 0, + } + + +def _build_memset_node_u16(g): + alloc = g.root.alloc(ALLOC_SIZE) + node = alloc.memset(alloc.dptr, b"\xab\xcd", ALLOC_SIZE // 2) + return node, { + "dptr": alloc.dptr, + "value": int.from_bytes(b"\xab\xcd", byteorder="little"), + "element_size": 2, + "width": ALLOC_SIZE // 2, + "height": 1, + "pitch": 0, + } + + +def _build_memset_node_u32(g): + alloc = g.root.alloc(ALLOC_SIZE) + node = alloc.memset(alloc.dptr, b"\x01\x02\x03\x04", ALLOC_SIZE // 4) + return node, { + "dptr": alloc.dptr, + "value": int.from_bytes(b"\x01\x02\x03\x04", byteorder="little"), + "element_size": 4, + "width": ALLOC_SIZE // 4, + "height": 1, + "pitch": 0, + } + + +def _build_memset_node_2d(g): + rows = 4 + cols = ALLOC_SIZE // rows + alloc = g.root.alloc(ALLOC_SIZE) + node = alloc.memset(alloc.dptr, 0xFF, cols, 
height=rows, pitch=cols) + return node, { + "dptr": alloc.dptr, + "value": 0xFF, + "element_size": 1, + "width": cols, + "height": rows, + "pitch": cols, + } + + _NODE_SPECS = [ pytest.param(NodeSpec("empty", EmptyNode, "CU_GRAPH_NODE_TYPE_EMPTY", _build_empty_node), id="empty"), pytest.param(NodeSpec("kernel", KernelNode, "CU_GRAPH_NODE_TYPE_KERNEL", _build_kernel_node), id="kernel"), @@ -252,6 +307,14 @@ def _build_free_node(g): id="alloc_managed", ), pytest.param(NodeSpec("free", FreeNode, "CU_GRAPH_NODE_TYPE_MEM_FREE", _build_free_node), id="free"), + pytest.param(NodeSpec("memset", MemsetNode, "CU_GRAPH_NODE_TYPE_MEMSET", _build_memset_node), id="memset"), + pytest.param( + NodeSpec("memset_u16", MemsetNode, "CU_GRAPH_NODE_TYPE_MEMSET", _build_memset_node_u16), id="memset_u16" + ), + pytest.param( + NodeSpec("memset_u32", MemsetNode, "CU_GRAPH_NODE_TYPE_MEMSET", _build_memset_node_u32), id="memset_u32" + ), + pytest.param(NodeSpec("memset_2d", MemsetNode, "CU_GRAPH_NODE_TYPE_MEMSET", _build_memset_node_2d), id="memset_2d"), ] @@ -567,6 +630,19 @@ def test_instantiate_and_execute_alloc_free(sample_graphdef): stream.sync() +def test_instantiate_and_execute_memset(sample_graphdef): + """Graph with alloc/memset/free can be executed.""" + alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + ms = alloc.memset(alloc.dptr, 0xAB, ALLOC_SIZE) + ms.free(alloc.dptr) + graph = sample_graphdef.instantiate() + + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + # ============================================================================= # Debug output # ============================================================================= diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index f5a16d6acd..701374ac68 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -292,6 +292,20 @@ def sample_free_node_alt(sample_graphdef): return 
alloc.free(alloc.dptr) +@pytest.fixture +def sample_memset_node(sample_graphdef): + """A MemsetNode.""" + alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) + + +@pytest.fixture +def sample_memset_node_alt(sample_graphdef): + """An alternate MemsetNode from same graph.""" + alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) + + # ============================================================================= # Type groupings # ============================================================================= @@ -312,6 +326,7 @@ def sample_free_node_alt(sample_graphdef): "sample_alloc_node", "sample_kernel_node", "sample_free_node", + "sample_memset_node", ] # Types with __eq__ support @@ -330,6 +345,7 @@ def sample_free_node_alt(sample_graphdef): "sample_alloc_node", "sample_kernel_node", "sample_free_node", + "sample_memset_node", ] # Types with __weakref__ support @@ -349,6 +365,7 @@ def sample_free_node_alt(sample_graphdef): "sample_alloc_node", "sample_kernel_node", "sample_free_node", + "sample_memset_node", ] # Pairs of distinct objects of the same type (for inequality testing) @@ -368,6 +385,7 @@ def sample_free_node_alt(sample_graphdef): ("sample_alloc_node", "sample_alloc_node_alt"), ("sample_kernel_node", "sample_kernel_node_alt"), ("sample_free_node", "sample_free_node_alt"), + ("sample_memset_node", "sample_memset_node_alt"), ] # Types with public from_handle methods and how to create a copy @@ -410,6 +428,7 @@ def sample_free_node_alt(sample_graphdef): ("sample_alloc_node", r""), ("sample_kernel_node", r""), ("sample_free_node", r""), + ("sample_memset_node", r""), ] From ba7d2f9828edde0e3a93b397ecef5a963f17aa29 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 4 Mar 2026 20:08:27 -0800 Subject: [PATCH 06/23] Add EventRecordNode and EventWaitNode with Event.from_handle support Implements event record/wait graph nodes with full test coverage. 
Adds non-owning create_event_handle_ref to RAII layer and Event.from_handle() / Event._from_raw_handle() for reconstructing Event objects from raw CUevent handles managed by the graph. Made-with: Cursor --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 5 + cuda_core/cuda/core/_cpp/resource_handles.hpp | 5 + cuda_core/cuda/core/_event.pxd | 3 + cuda_core/cuda/core/_event.pyx | 36 +++++ cuda_core/cuda/core/_graph/_graphdef.pxd | 26 +++ cuda_core/cuda/core/_graph/_graphdef.pyx | 151 +++++++++++++++++- cuda_core/cuda/core/_resource_handles.pxd | 1 + cuda_core/cuda/core/_resource_handles.pyx | 2 + cuda_core/tests/graph/test_explicit.py | 41 +++++ cuda_core/tests/test_object_protocols.py | 38 +++++ 10 files changed, 303 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 5655d0e4ac..502ad8315b 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -341,6 +341,11 @@ EventHandle create_event_handle_noctx(unsigned int flags) { return create_event_handle(ContextHandle{}, flags); } +EventHandle create_event_handle_ref(CUevent event) { + auto box = std::make_shared(EventBox{event}); + return EventHandle(box, &box->resource); +} + EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { GILReleaseGuard gil; CUevent event; diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index c5e1132990..60cdb95808 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -182,6 +182,11 @@ EventHandle create_event_handle_noctx(unsigned int flags); // Returns empty handle on error (caller must check). EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle); +// Create a non-owning event handle (references existing event). +// Use for events that are managed by the CUDA graph or another owner. 
+// The event will NOT be destroyed when the handle is released. +EventHandle create_event_handle_ref(CUevent event); + // ============================================================================ // Memory pool handle functions // ============================================================================ diff --git a/cuda_core/cuda/core/_event.pxd b/cuda_core/cuda/core/_event.pxd index c393b29ebf..69f46d73b0 100644 --- a/cuda_core/cuda/core/_event.pxd +++ b/cuda_core/cuda/core/_event.pxd @@ -21,4 +21,7 @@ cdef class Event: @staticmethod cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free) + @staticmethod + cdef Event _from_raw_handle(cydriver.CUevent raw_event) + cpdef close(self) diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 1ff87a1ea0..1eefec1b1d 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -12,6 +12,7 @@ from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, create_event_handle, + create_event_handle_ref, create_event_handle_ipc, as_intptr, as_cu, @@ -125,6 +126,41 @@ cdef class Event: self.get_ipc_descriptor() return self + @staticmethod + cdef Event _from_raw_handle(cydriver.CUevent raw_event): + """Create a non-owning Event from a raw CUevent (internal use).""" + cdef EventHandle h_event = create_event_handle_ref(raw_event) + cdef Event self = Event.__new__(Event) + self._h_event = h_event + self._h_context = ContextHandle() + self._timing_disabled = True + self._busy_waited = False + self._ipc_enabled = False + self._ipc_descriptor = None + self._device_id = -1 + return self + + @staticmethod + def from_handle(handle) -> Event: + """Create a non-owning :obj:`Event` from a foreign event handle. + + Parameters + ---------- + handle : int + Event handle representing the address of a foreign + event object (CUevent). 
+ + Notes + ----- + The returned Event does not own the underlying CUevent and will + not destroy it when garbage collected. This is intended for + wrapping events managed by other subsystems (e.g., CUDA graphs). + """ + if not isinstance(handle, int): + raise TypeError(f"handle must be an integer, got {type(handle).__name__}") + cdef cydriver.CUevent raw = handle + return Event._from_raw_handle(raw) + cpdef close(self): """Destroy the event. diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index 4f294f7105..f16abe01f9 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -15,6 +15,8 @@ cdef class KernelNode(Node) cdef class AllocNode(Node) cdef class FreeNode(Node) cdef class MemsetNode(Node) +cdef class EventRecordNode(Node) +cdef class EventWaitNode(Node) cdef class GraphDef: @@ -105,3 +107,27 @@ cdef class MemsetNode(Node): @staticmethod cdef MemsetNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + + +cdef class EventRecordNode(Node): + cdef: + cydriver.CUevent _event + + @staticmethod + cdef EventRecordNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUevent event) + + @staticmethod + cdef EventRecordNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + + +cdef class EventWaitNode(Node): + cdef: + cydriver.CUevent _event + + @staticmethod + cdef EventWaitNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUevent event) + + @staticmethod + cdef EventWaitNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index f318ed5b88..8aade3b672 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -11,11 +11,13 @@ the same public Graph type for execution. 
Node hierarchy: Node (base — also used for the virtual root) - ├── EmptyNode (synchronization / join point) - ├── KernelNode (kernel launch) - ├── AllocNode (memory allocation, exposes dptr and bytesize) - ├── FreeNode (memory free, exposes dptr) - └── MemsetNode (memory set, exposes dptr, value, element_size, etc.) + ├── EmptyNode (synchronization / join point) + ├── KernelNode (kernel launch) + ├── AllocNode (memory allocation, exposes dptr and bytesize) + ├── FreeNode (memory free, exposes dptr) + ├── MemsetNode (memory set, exposes dptr, value, element_size, etc.) + ├── EventRecordNode (record an event) + └── EventWaitNode (wait for an event) """ from dataclasses import dataclass @@ -38,6 +40,7 @@ from cuda.core._resource_handles cimport ( as_cu, as_intptr, ) +from cuda.core._event cimport Event from cuda.core._module cimport Kernel from cuda.core._launch_config cimport LaunchConfig from cuda.core._kernel_arg_handler cimport ParamHolder @@ -261,6 +264,10 @@ cdef class Node: return FreeNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEMSET: return MemsetNode._create_from_driver(h_graph, node) + elif node_type == cydriver.CU_GRAPH_NODE_TYPE_EVENT_RECORD: + return EventRecordNode._create_from_driver(h_graph, node) + elif node_type == cydriver.CU_GRAPH_NODE_TYPE_WAIT_EVENT: + return EventWaitNode._create_from_driver(h_graph, node) else: n = Node.__new__(Node) (n)._h_graph = h_graph @@ -646,6 +653,68 @@ cdef class Node: self._h_graph, new_node, c_dst, val, elem_size, width, height, pitch) + def record_event(self, event): + """Add an event record node depending on this node. + + Parameters + ---------- + event : Event + The event to record. + + Returns + ------- + EventRecordNode + A new EventRecordNode representing the event record operation. 
+ """ + cdef Event ev = event + cdef cydriver.CUevent c_event = as_cu(ev._h_event) + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + + if self._node != NULL: + deps = &self._node + num_deps = 1 + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddEventRecordNode( + &new_node, graph, deps, num_deps, c_event)) + + self._succ_cache = None + return EventRecordNode._create_with_params(self._h_graph, new_node, c_event) + + def wait_event(self, event): + """Add an event wait node depending on this node. + + Parameters + ---------- + event : Event + The event to wait for. + + Returns + ------- + EventWaitNode + A new EventWaitNode representing the event wait operation. + """ + cdef Event ev = event + cdef cydriver.CUevent c_event = as_cu(ev._h_event) + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + + if self._node != NULL: + deps = &self._node + num_deps = 1 + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddEventWaitNode( + &new_node, graph, deps, num_deps, c_event)) + + self._succ_cache = None + return EventWaitNode._create_with_params(self._h_graph, new_node, c_event) + # ============================================================================= # Node subclasses @@ -959,3 +1028,75 @@ cdef class MemsetNode(Node): def pitch(self): """Pitch in bytes (unused if height is 1).""" return self._pitch + + +cdef class EventRecordNode(Node): + """An event record node. + + Properties + ---------- + event : Event + The event being recorded (non-owning wrapper). 
+ """ + + @staticmethod + cdef EventRecordNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUevent event): + """Create from known params (called by record_event() builder).""" + cdef EventRecordNode n = EventRecordNode.__new__(EventRecordNode) + n._h_graph = h_graph + n._node = node + n._event = event + return n + + @staticmethod + cdef EventRecordNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUevent event + with nogil: + HANDLE_RETURN(cydriver.cuGraphEventRecordNodeGetEvent(node, &event)) + return EventRecordNode._create_with_params(h_graph, node, event) + + def __repr__(self): + return f"self._node:x}>" + + @property + def event(self): + """The event being recorded (non-owning wrapper).""" + return Event._from_raw_handle(self._event) + + +cdef class EventWaitNode(Node): + """An event wait node. + + Properties + ---------- + event : Event + The event being waited on (non-owning wrapper). 
+ """ + + @staticmethod + cdef EventWaitNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUevent event): + """Create from known params (called by wait_event() builder).""" + cdef EventWaitNode n = EventWaitNode.__new__(EventWaitNode) + n._h_graph = h_graph + n._node = node + n._event = event + return n + + @staticmethod + cdef EventWaitNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUevent event + with nogil: + HANDLE_RETURN(cydriver.cuGraphEventWaitNodeGetEvent(node, &event)) + return EventWaitNode._create_with_params(h_graph, node, event) + + def __repr__(self): + return f"self._node:x}>" + + @property + def event(self): + """The event being waited on (non-owning wrapper).""" + return Event._from_raw_handle(self._event) diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 7ddf0911de..2635f41b2c 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -97,6 +97,7 @@ cdef StreamHandle get_per_thread_stream() except+ nogil # Event handles cdef EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) except+ nogil cdef EventHandle create_event_handle_noctx(unsigned int flags) except+ nogil +cdef EventHandle create_event_handle_ref(cydriver.CUevent event) except+ nogil cdef EventHandle create_event_handle_ipc( const cydriver.CUipcEventHandle& ipc_handle) except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index a013f1c7cb..7d8bb3d837 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -70,6 +70,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": const ContextHandle& h_ctx, unsigned int flags) except+ nogil EventHandle create_event_handle_noctx 
"cuda_core::create_event_handle_noctx" ( unsigned int flags) except+ nogil + EventHandle create_event_handle_ref "cuda_core::create_event_handle_ref" ( + cydriver.CUevent event) except+ nogil EventHandle create_event_handle_ipc "cuda_core::create_event_handle_ipc" ( const cydriver.CUipcEventHandle& ipc_handle) except+ nogil diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py index 3cbf44f5fb..cc855a11b0 100644 --- a/cuda_core/tests/graph/test_explicit.py +++ b/cuda_core/tests/graph/test_explicit.py @@ -14,6 +14,8 @@ from cuda.core._graph._graphdef import ( AllocNode, EmptyNode, + EventRecordNode, + EventWaitNode, FreeNode, GraphAllocOptions, GraphDef, @@ -298,6 +300,24 @@ def _build_memset_node_2d(g): } +def _build_event_record_node(g): + event = Device().create_event() + entry = g.root.alloc(ALLOC_SIZE) + node = entry.record_event(event) + return node, { + "event": event, + } + + +def _build_event_wait_node(g): + event = Device().create_event() + entry = g.root.alloc(ALLOC_SIZE) + node = entry.wait_event(event) + return node, { + "event": event, + } + + _NODE_SPECS = [ pytest.param(NodeSpec("empty", EmptyNode, "CU_GRAPH_NODE_TYPE_EMPTY", _build_empty_node), id="empty"), pytest.param(NodeSpec("kernel", KernelNode, "CU_GRAPH_NODE_TYPE_KERNEL", _build_kernel_node), id="kernel"), @@ -315,6 +335,14 @@ def _build_memset_node_2d(g): NodeSpec("memset_u32", MemsetNode, "CU_GRAPH_NODE_TYPE_MEMSET", _build_memset_node_u32), id="memset_u32" ), pytest.param(NodeSpec("memset_2d", MemsetNode, "CU_GRAPH_NODE_TYPE_MEMSET", _build_memset_node_2d), id="memset_2d"), + pytest.param( + NodeSpec("event_record", EventRecordNode, "CU_GRAPH_NODE_TYPE_EVENT_RECORD", _build_event_record_node), + id="event_record", + ), + pytest.param( + NodeSpec("event_wait", EventWaitNode, "CU_GRAPH_NODE_TYPE_WAIT_EVENT", _build_event_wait_node), + id="event_wait", + ), ] @@ -643,6 +671,19 @@ def test_instantiate_and_execute_memset(sample_graphdef): stream.sync() 
+def test_instantiate_and_execute_event_record_wait(sample_graphdef): + """Graph with event record and wait nodes can be executed.""" + event = Device().create_event() + rec = sample_graphdef.root.record_event(event) + rec.wait_event(event) + graph = sample_graphdef.instantiate() + + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + # ============================================================================= # Debug output # ============================================================================= diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index 701374ac68..56bc30e80c 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -306,6 +306,34 @@ def sample_memset_node_alt(sample_graphdef): return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) +@pytest.fixture +def sample_event_record_node(sample_graphdef, sample_device): + """An EventRecordNode.""" + event = sample_device.create_event() + return sample_graphdef.root.record_event(event) + + +@pytest.fixture +def sample_event_record_node_alt(sample_graphdef, sample_device): + """An alternate EventRecordNode from same graph.""" + event = sample_device.create_event() + return sample_graphdef.root.record_event(event) + + +@pytest.fixture +def sample_event_wait_node(sample_graphdef, sample_device): + """An EventWaitNode.""" + event = sample_device.create_event() + return sample_graphdef.root.wait_event(event) + + +@pytest.fixture +def sample_event_wait_node_alt(sample_graphdef, sample_device): + """An alternate EventWaitNode from same graph.""" + event = sample_device.create_event() + return sample_graphdef.root.wait_event(event) + + # ============================================================================= # Type groupings # ============================================================================= @@ -327,6 +355,8 @@ def sample_memset_node_alt(sample_graphdef): 
"sample_kernel_node", "sample_free_node", "sample_memset_node", + "sample_event_record_node", + "sample_event_wait_node", ] # Types with __eq__ support @@ -346,6 +376,8 @@ def sample_memset_node_alt(sample_graphdef): "sample_kernel_node", "sample_free_node", "sample_memset_node", + "sample_event_record_node", + "sample_event_wait_node", ] # Types with __weakref__ support @@ -366,6 +398,8 @@ def sample_memset_node_alt(sample_graphdef): "sample_kernel_node", "sample_free_node", "sample_memset_node", + "sample_event_record_node", + "sample_event_wait_node", ] # Pairs of distinct objects of the same type (for inequality testing) @@ -386,6 +420,8 @@ def sample_memset_node_alt(sample_graphdef): ("sample_kernel_node", "sample_kernel_node_alt"), ("sample_free_node", "sample_free_node_alt"), ("sample_memset_node", "sample_memset_node_alt"), + ("sample_event_record_node", "sample_event_record_node_alt"), + ("sample_event_wait_node", "sample_event_wait_node_alt"), ] # Types with public from_handle methods and how to create a copy @@ -429,6 +465,8 @@ def sample_memset_node_alt(sample_graphdef): ("sample_kernel_node", r""), ("sample_free_node", r""), ("sample_memset_node", r""), + ("sample_event_record_node", r""), + ("sample_event_wait_node", r""), ] From 77a5dac26de0cdf70280df8664fae31235682f5f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 Mar 2026 07:16:55 -0800 Subject: [PATCH 07/23] Replace GraphDef.root with forwarding methods and GraphDef.join GraphDef now exposes alloc, free, memset, launch, record_event, wait_event, and join directly. The virtual root node becomes an internal implementation detail (_entry). Also renames Event._from_raw_handle to Event._from_handle for consistency. 
Made-with: Cursor --- cuda_core/cuda/core/_event.pxd | 2 +- cuda_core/cuda/core/_event.pyx | 4 +- cuda_core/cuda/core/_graph/_graphdef.pyx | 77 ++++++++++++++--- cuda_core/tests/graph/test_explicit.py | 102 +++++++++++------------ cuda_core/tests/test_object_protocols.py | 40 ++++----- 5 files changed, 136 insertions(+), 89 deletions(-) diff --git a/cuda_core/cuda/core/_event.pxd b/cuda_core/cuda/core/_event.pxd index 69f46d73b0..7f60b8cbc3 100644 --- a/cuda_core/cuda/core/_event.pxd +++ b/cuda_core/cuda/core/_event.pxd @@ -22,6 +22,6 @@ cdef class Event: cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free) @staticmethod - cdef Event _from_raw_handle(cydriver.CUevent raw_event) + cdef Event _from_handle(cydriver.CUevent raw_event) cpdef close(self) diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 1eefec1b1d..25480a76e1 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -127,7 +127,7 @@ cdef class Event: return self @staticmethod - cdef Event _from_raw_handle(cydriver.CUevent raw_event): + cdef Event _from_handle(cydriver.CUevent raw_event): """Create a non-owning Event from a raw CUevent (internal use).""" cdef EventHandle h_event = create_event_handle_ref(raw_event) cdef Event self = Event.__new__(Event) @@ -159,7 +159,7 @@ cdef class Event: if not isinstance(handle, int): raise TypeError(f"handle must be an integer, got {type(handle).__name__}") cdef cydriver.CUevent raw = handle - return Event._from_raw_handle(raw) + return Event._from_handle(raw) cpdef close(self): """Destroy the event. diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 8aade3b672..480a898a8f 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -119,17 +119,70 @@ cdef class GraphDef: return hash(as_intptr(self._h_graph)) @property - def root(self): - """Return the root Node for this graph. 
- - The root node has no dependencies. Operations added from the root - will be entry points to the graph. - """ + def _entry(self): + """Return the internal entry-point Node (no dependencies).""" cdef Node n = Node.__new__(Node) n._h_graph = self._h_graph n._node = NULL return n + def alloc(self, size_t size, options: GraphAllocOptions | None = None): + """Add an entry-point memory allocation node (no dependencies). + + See :meth:`Node.alloc` for full documentation. + """ + return self._entry.alloc(size, options) + + def free(self, dptr): + """Add an entry-point memory free node (no dependencies). + + See :meth:`Node.free` for full documentation. + """ + return self._entry.free(dptr) + + def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0): + """Add an entry-point memset node (no dependencies). + + See :meth:`Node.memset` for full documentation. + """ + return self._entry.memset(dst, value, width, height, pitch) + + def launch(self, config, kernel, *args): + """Add an entry-point kernel launch node (no dependencies). + + See :meth:`Node.launch` for full documentation. + """ + return self._entry.launch(config, kernel, *args) + + def join(self, *nodes): + """Create an empty node that depends on all given nodes. + + Parameters + ---------- + *nodes : Node + Nodes to merge. + + Returns + ------- + EmptyNode + A new EmptyNode that depends on all input nodes. + """ + return self._entry.join(*nodes) + + def record_event(self, event): + """Add an entry-point event record node (no dependencies). + + See :meth:`Node.record_event` for full documentation. + """ + return self._entry.record_event(event) + + def wait_event(self, event): + """Add an entry-point event wait node (no dependencies). + + See :meth:`Node.wait_event` for full documentation. + """ + return self._entry.wait_event(event) + def instantiate(self): """Instantiate the graph definition into an executable Graph. 
@@ -234,11 +287,9 @@ cdef class GraphDef: cdef class Node: """Base class for all graph nodes. - Nodes are created by calling methods on other Nodes. Each method - returns a new Node subclass that depends on the current node(s). - - The root node (obtained from GraphDef.root) is a base Node with a - NULL internal handle, representing graph entry points. + Nodes are created by calling builder methods on GraphDef (for + entry-point nodes with no dependencies) or on other Nodes (for + nodes that depend on a predecessor). """ @staticmethod @@ -1063,7 +1114,7 @@ cdef class EventRecordNode(Node): @property def event(self): """The event being recorded (non-owning wrapper).""" - return Event._from_raw_handle(self._event) + return Event._from_handle(self._event) cdef class EventWaitNode(Node): @@ -1099,4 +1150,4 @@ cdef class EventWaitNode(Node): @property def event(self): """The event being waited on (non-owning wrapper).""" - return Event._from_raw_handle(self._event) + return Event._from_handle(self._event) diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py index cc855a11b0..d6c9bb89b0 100644 --- a/cuda_core/tests/graph/test_explicit.py +++ b/cuda_core/tests/graph/test_explicit.py @@ -52,7 +52,7 @@ def _build_empty(): def _build_single(): """One alloc node, no edges.""" g = GraphDef() - a = g.root.alloc(ALLOC_SIZE) + a = g.alloc(ALLOC_SIZE) return GraphSpec( "single", g, @@ -66,7 +66,7 @@ def _build_single(): def _build_chain(): """Linear chain: a -> b -> c.""" g = GraphDef() - a = g.root.alloc(ALLOC_SIZE) + a = g.alloc(ALLOC_SIZE) b = a.alloc(ALLOC_SIZE) c = b.alloc(ALLOC_SIZE) return GraphSpec( @@ -82,7 +82,7 @@ def _build_chain(): def _build_fan_out(): """One node feeds three: a -> {b, c, d}.""" g = GraphDef() - a = g.root.alloc(ALLOC_SIZE) + a = g.alloc(ALLOC_SIZE) b = a.alloc(ALLOC_SIZE) c = a.alloc(ALLOC_SIZE) d = a.alloc(ALLOC_SIZE) @@ -99,10 +99,10 @@ def _build_fan_out(): def _build_fan_in(): """Three entry nodes merge: 
{a, b, c} -> d (join).""" g = GraphDef() - a = g.root.alloc(ALLOC_SIZE) - b = g.root.alloc(ALLOC_SIZE) - c = g.root.alloc(ALLOC_SIZE) - d = a.join(b, c) + a = g.alloc(ALLOC_SIZE) + b = g.alloc(ALLOC_SIZE) + c = g.alloc(ALLOC_SIZE) + d = g.join(a, b, c) return GraphSpec( "fan_in", g, @@ -116,7 +116,7 @@ def _build_fan_in(): def _build_diamond(): """Diamond: a -> {b, c} -> d (join).""" g = GraphDef() - a = g.root.alloc(ALLOC_SIZE) + a = g.alloc(ALLOC_SIZE) b = a.alloc(ALLOC_SIZE) c = a.alloc(ALLOC_SIZE) d = b.join(c) @@ -133,8 +133,8 @@ def _build_diamond(): def _build_disconnected(): """Two independent entry nodes: a, b.""" g = GraphDef() - a = g.root.alloc(ALLOC_SIZE) - b = g.root.alloc(ALLOC_SIZE) + a = g.alloc(ALLOC_SIZE) + b = g.alloc(ALLOC_SIZE) return GraphSpec( "disconnected", g, @@ -189,16 +189,16 @@ class NodeSpec: def _build_empty_node(g): - a = g.root.alloc(ALLOC_SIZE) - b = g.root.alloc(ALLOC_SIZE) - return a.join(b), {} + a = g.alloc(ALLOC_SIZE) + b = g.alloc(ALLOC_SIZE) + return g.join(a, b), {} def _build_kernel_node(g): mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") config = LaunchConfig(grid=(2, 3, 1), block=(32, 4, 1), shmem_size=128) - entry = g.root.alloc(ALLOC_SIZE) + entry = g.alloc(ALLOC_SIZE) node = entry.launch(config, kernel) return node, { "grid": (2, 3, 1), @@ -211,7 +211,7 @@ def _build_kernel_node(g): def _build_alloc_node(g): device_id = Device().device_id - entry = g.root.alloc(ALLOC_SIZE) + entry = g.alloc(ALLOC_SIZE) node = entry.alloc(ALLOC_SIZE) return node, { "dptr": lambda v: v != 0, @@ -226,7 +226,7 @@ def _build_alloc_node(g): def _build_alloc_managed_node(g): device_id = Device().device_id options = GraphAllocOptions(memory_type="managed") - entry = g.root.alloc(ALLOC_SIZE) + entry = g.alloc(ALLOC_SIZE) node = entry.alloc(ALLOC_SIZE, options) return node, { "dptr": lambda v: v != 0, @@ -239,7 +239,7 @@ def _build_alloc_managed_node(g): def _build_free_node(g): - alloc = g.root.alloc(ALLOC_SIZE) + alloc 
= g.alloc(ALLOC_SIZE) node = alloc.free(alloc.dptr) return node, { "dptr": alloc.dptr, @@ -247,7 +247,7 @@ def _build_free_node(g): def _build_memset_node(g): - alloc = g.root.alloc(ALLOC_SIZE) + alloc = g.alloc(ALLOC_SIZE) node = alloc.memset(alloc.dptr, 42, ALLOC_SIZE) return node, { "dptr": alloc.dptr, @@ -260,7 +260,7 @@ def _build_memset_node(g): def _build_memset_node_u16(g): - alloc = g.root.alloc(ALLOC_SIZE) + alloc = g.alloc(ALLOC_SIZE) node = alloc.memset(alloc.dptr, b"\xab\xcd", ALLOC_SIZE // 2) return node, { "dptr": alloc.dptr, @@ -273,7 +273,7 @@ def _build_memset_node_u16(g): def _build_memset_node_u32(g): - alloc = g.root.alloc(ALLOC_SIZE) + alloc = g.alloc(ALLOC_SIZE) node = alloc.memset(alloc.dptr, b"\x01\x02\x03\x04", ALLOC_SIZE // 4) return node, { "dptr": alloc.dptr, @@ -288,7 +288,7 @@ def _build_memset_node_u32(g): def _build_memset_node_2d(g): rows = 4 cols = ALLOC_SIZE // rows - alloc = g.root.alloc(ALLOC_SIZE) + alloc = g.alloc(ALLOC_SIZE) node = alloc.memset(alloc.dptr, 0xFF, cols, height=rows, pitch=cols) return node, { "dptr": alloc.dptr, @@ -302,7 +302,7 @@ def _build_memset_node_2d(g): def _build_event_record_node(g): event = Device().create_event() - entry = g.root.alloc(ALLOC_SIZE) + entry = g.alloc(ALLOC_SIZE) node = entry.record_event(event) return node, { "event": event, @@ -311,7 +311,7 @@ def _build_event_record_node(g): def _build_event_wait_node(g): event = Device().create_event() - entry = g.root.alloc(ALLOC_SIZE) + entry = g.alloc(ALLOC_SIZE) node = entry.wait_event(event) return node, { "event": event, @@ -492,17 +492,13 @@ def test_graphdef_handle_valid(sample_graphdef): assert int(sample_graphdef.handle) != 0 -def test_graphdef_root_returns_node(sample_graphdef): - """GraphDef.root returns a Node instance.""" - assert isinstance(sample_graphdef.root, Node) - - -def test_graphdef_root_is_virtual(sample_graphdef): - """Root node is virtual (no pred/succ, type is None).""" - root = sample_graphdef.root - assert root.pred == 
() - assert root.succ == () - assert root.type is None +def test_graphdef_entry_is_virtual(sample_graphdef): + """Internal entry node is virtual (no pred/succ, type is None).""" + entry = sample_graphdef._entry + assert isinstance(entry, Node) + assert entry.pred == () + assert entry.succ == () + assert entry.type is None # ============================================================================= @@ -515,19 +511,19 @@ def test_alloc_zero_size_fails(sample_graphdef): from cuda.core._utils.cuda_utils import CUDAError with pytest.raises(CUDAError): - sample_graphdef.root.alloc(0) + sample_graphdef.alloc(0) def test_free_creates_dependency(sample_graphdef): """Free node depends on its predecessor.""" - alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + alloc = sample_graphdef.alloc(ALLOC_SIZE) free = alloc.free(alloc.dptr) assert alloc in free.pred def test_alloc_free_chain(sample_graphdef): """Alloc and free can be chained.""" - a1 = sample_graphdef.root.alloc(ALLOC_SIZE) + a1 = sample_graphdef.alloc(ALLOC_SIZE) a2 = a1.alloc(ALLOC_SIZE) f2 = a2.free(a2.dptr) f1 = f2.free(a1.dptr) @@ -545,7 +541,7 @@ def test_alloc_memory_type_invalid(sample_graphdef): """Invalid memory type raises ValueError.""" options = GraphAllocOptions(memory_type="invalid") with pytest.raises(ValueError, match="Invalid memory_type"): - sample_graphdef.root.alloc(ALLOC_SIZE, options) + sample_graphdef.alloc(ALLOC_SIZE, options) @pytest.mark.parametrize( @@ -559,7 +555,7 @@ def test_alloc_device_option(sample_graphdef, device_spec): """Device can be specified as int or Device object.""" device = Device() options = GraphAllocOptions(device=device_spec(device)) - node = sample_graphdef.root.alloc(ALLOC_SIZE, options) + node = sample_graphdef.alloc(ALLOC_SIZE, options) assert node.dptr != 0 @@ -568,7 +564,7 @@ def test_alloc_peer_access(mempool_device_x2): d0, d1 = mempool_device_x2 g = GraphDef() options = GraphAllocOptions(device=d0.device_id, peer_access=[d1.device_id]) - node = 
g.root.alloc(ALLOC_SIZE, options) + node = g.alloc(ALLOC_SIZE, options) assert d1.device_id in node.peer_access @@ -580,8 +576,8 @@ def test_alloc_peer_access(mempool_device_x2): @pytest.mark.parametrize("num_branches", [2, 3, 5]) def test_join_merges_branches(sample_graphdef, num_branches): """join() with multiple branches creates correct dependencies.""" - branches = [sample_graphdef.root.alloc(ALLOC_SIZE) for _ in range(num_branches)] - joined = branches[0].join(*branches[1:]) + branches = [sample_graphdef.alloc(ALLOC_SIZE) for _ in range(num_branches)] + joined = sample_graphdef.join(*branches) assert isinstance(joined, EmptyNode) assert set(joined.pred) == set(branches) @@ -596,7 +592,7 @@ def test_launch_creates_node(sample_graphdef): mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") config = LaunchConfig(grid=1, block=1) - node = sample_graphdef.root.launch(config, kernel) + node = sample_graphdef.launch(config, kernel) assert isinstance(node, KernelNode) @@ -605,7 +601,7 @@ def test_launch_chain_dependencies(sample_graphdef): mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") config = LaunchConfig(grid=1, block=1) - n1 = sample_graphdef.root.launch(config, kernel) + n1 = sample_graphdef.launch(config, kernel) n2 = n1.launch(config, kernel) n3 = n2.launch(config, kernel) assert n1 in n2.pred @@ -626,8 +622,8 @@ def test_instantiate_empty_graph(sample_graphdef): def test_instantiate_with_nodes(sample_graphdef): """Graph with nodes can be instantiated.""" - sample_graphdef.root.alloc(ALLOC_SIZE) - sample_graphdef.root.alloc(ALLOC_SIZE) + sample_graphdef.alloc(ALLOC_SIZE) + sample_graphdef.alloc(ALLOC_SIZE) graph = sample_graphdef.instantiate() assert graph is not None @@ -637,7 +633,7 @@ def test_instantiate_and_execute_kernel(sample_graphdef): mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") config = LaunchConfig(grid=1, block=1) - sample_graphdef.root.launch(config, kernel) + 
sample_graphdef.launch(config, kernel) graph = sample_graphdef.instantiate() stream = Device().create_stream() @@ -648,7 +644,7 @@ def test_instantiate_and_execute_kernel(sample_graphdef): def test_instantiate_and_execute_alloc_free(sample_graphdef): """Graph with alloc/free can be executed.""" - alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + alloc = sample_graphdef.alloc(ALLOC_SIZE) alloc.free(alloc.dptr) graph = sample_graphdef.instantiate() @@ -660,7 +656,7 @@ def test_instantiate_and_execute_alloc_free(sample_graphdef): def test_instantiate_and_execute_memset(sample_graphdef): """Graph with alloc/memset/free can be executed.""" - alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + alloc = sample_graphdef.alloc(ALLOC_SIZE) ms = alloc.memset(alloc.dptr, 0xAB, ALLOC_SIZE) ms.free(alloc.dptr) graph = sample_graphdef.instantiate() @@ -674,7 +670,7 @@ def test_instantiate_and_execute_memset(sample_graphdef): def test_instantiate_and_execute_event_record_wait(sample_graphdef): """Graph with event record and wait nodes can be executed.""" event = Device().create_event() - rec = sample_graphdef.root.record_event(event) + rec = sample_graphdef.record_event(event) rec.wait_event(event) graph = sample_graphdef.instantiate() @@ -691,7 +687,7 @@ def test_instantiate_and_execute_event_record_wait(sample_graphdef): def test_debug_dot_print_creates_file(sample_graphdef, dot_file): """debug_dot_print writes a DOT file.""" - sample_graphdef.root.alloc(ALLOC_SIZE) + sample_graphdef.alloc(ALLOC_SIZE) sample_graphdef.debug_dot_print(str(dot_file)) assert dot_file.exists() content = dot_file.read_text() @@ -700,7 +696,7 @@ def test_debug_dot_print_creates_file(sample_graphdef, dot_file): def test_debug_dot_print_with_options(sample_graphdef, dot_file): """debug_dot_print accepts GraphDebugPrintOptions.""" - sample_graphdef.root.alloc(ALLOC_SIZE) + sample_graphdef.alloc(ALLOC_SIZE) options = GraphDebugPrintOptions(verbose=True, handles=True) 
sample_graphdef.debug_dot_print(str(dot_file), options) assert dot_file.exists() @@ -708,6 +704,6 @@ def test_debug_dot_print_with_options(sample_graphdef, dot_file): def test_debug_dot_print_invalid_options(sample_graphdef, dot_file): """debug_dot_print rejects invalid options type.""" - sample_graphdef.root.alloc(ALLOC_SIZE) + sample_graphdef.alloc(ALLOC_SIZE) with pytest.raises(TypeError, match="options must be a GraphDebugPrintOptions"): sample_graphdef.debug_dot_print(str(dot_file), "invalid") diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index 56bc30e80c..008f98a53c 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -223,41 +223,41 @@ def sample_graphdef_alt(init_cuda): @pytest.fixture def sample_root_node(sample_graphdef): """A root Node (virtual, NULL handle).""" - return sample_graphdef.root + return sample_graphdef._entry @pytest.fixture def sample_root_node_alt(sample_graphdef_alt): """An alternate root Node from different graph.""" - return sample_graphdef_alt.root + return sample_graphdef_alt._entry @pytest.fixture def sample_empty_node(sample_graphdef): """An EmptyNode created by merging two branches.""" - a = sample_graphdef.root.alloc(ALLOC_SIZE) - b = sample_graphdef.root.alloc(ALLOC_SIZE) - return a.join(b) + a = sample_graphdef.alloc(ALLOC_SIZE) + b = sample_graphdef.alloc(ALLOC_SIZE) + return sample_graphdef.join(a, b) @pytest.fixture def sample_empty_node_alt(sample_graphdef): """An alternate EmptyNode from same graph.""" - c = sample_graphdef.root.alloc(ALLOC_SIZE) - d = sample_graphdef.root.alloc(ALLOC_SIZE) - return c.join(d) + c = sample_graphdef.alloc(ALLOC_SIZE) + d = sample_graphdef.alloc(ALLOC_SIZE) + return sample_graphdef.join(c, d) @pytest.fixture def sample_alloc_node(sample_graphdef): """An AllocNode.""" - return sample_graphdef.root.alloc(ALLOC_SIZE) + return sample_graphdef.alloc(ALLOC_SIZE) @pytest.fixture def 
sample_alloc_node_alt(sample_graphdef): """An alternate AllocNode from same graph.""" - return sample_graphdef.root.alloc(ALLOC_SIZE) + return sample_graphdef.alloc(ALLOC_SIZE) @pytest.fixture @@ -266,7 +266,7 @@ def sample_kernel_node(sample_graphdef, init_cuda): mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") config = LaunchConfig(grid=1, block=1) - return sample_graphdef.root.launch(config, kernel) + return sample_graphdef.launch(config, kernel) @pytest.fixture @@ -275,34 +275,34 @@ def sample_kernel_node_alt(sample_graphdef, init_cuda): mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") config = LaunchConfig(grid=1, block=1) - return sample_graphdef.root.launch(config, kernel) + return sample_graphdef.launch(config, kernel) @pytest.fixture def sample_free_node(sample_graphdef): """A FreeNode.""" - alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + alloc = sample_graphdef.alloc(ALLOC_SIZE) return alloc.free(alloc.dptr) @pytest.fixture def sample_free_node_alt(sample_graphdef): """An alternate FreeNode from same graph.""" - alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + alloc = sample_graphdef.alloc(ALLOC_SIZE) return alloc.free(alloc.dptr) @pytest.fixture def sample_memset_node(sample_graphdef): """A MemsetNode.""" - alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + alloc = sample_graphdef.alloc(ALLOC_SIZE) return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) @pytest.fixture def sample_memset_node_alt(sample_graphdef): """An alternate MemsetNode from same graph.""" - alloc = sample_graphdef.root.alloc(ALLOC_SIZE) + alloc = sample_graphdef.alloc(ALLOC_SIZE) return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) @@ -310,28 +310,28 @@ def sample_memset_node_alt(sample_graphdef): def sample_event_record_node(sample_graphdef, sample_device): """An EventRecordNode.""" event = sample_device.create_event() - return sample_graphdef.root.record_event(event) + return sample_graphdef.record_event(event) @pytest.fixture def 
sample_event_record_node_alt(sample_graphdef, sample_device): """An alternate EventRecordNode from same graph.""" event = sample_device.create_event() - return sample_graphdef.root.record_event(event) + return sample_graphdef.record_event(event) @pytest.fixture def sample_event_wait_node(sample_graphdef, sample_device): """An EventWaitNode.""" event = sample_device.create_event() - return sample_graphdef.root.wait_event(event) + return sample_graphdef.wait_event(event) @pytest.fixture def sample_event_wait_node_alt(sample_graphdef, sample_device): """An alternate EventWaitNode from same graph.""" event = sample_device.create_event() - return sample_graphdef.root.wait_event(event) + return sample_graphdef.wait_event(event) # ============================================================================= From 1a790ceeb969dfa588d446f095c2af771ecc6371 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 Mar 2026 07:50:15 -0800 Subject: [PATCH 08/23] Improve __repr__ for graph nodes, add Node.handle, use as_py for GraphDef.handle - Fix stale 'root' references to 'entry' in docstrings, comments, repr - Add Node.handle property (returns CUgraphNode as int, None for entry) - GraphDef.handle now uses as_py() for cleaner conversion - Update reprs to show domain-relevant payload instead of ambiguous handles: EmptyNode shows pred count, KernelNode shows grid/block, AllocNode/FreeNode/MemsetNode show dptr and params, EventRecord/WaitNode show event handle Made-with: Cursor --- cuda_core/cuda/core/_graph/_graphdef.pxd | 2 +- cuda_core/cuda/core/_graph/_graphdef.pyx | 39 +++++++++++++++--------- cuda_core/tests/test_object_protocols.py | 20 ++++++------ 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index f16abe01f9..dc3e88ce81 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -31,7 +31,7 @@ cdef class GraphDef: cdef class Node: 
cdef: GraphHandle _h_graph - cydriver.CUgraphNode _node # NULL for root + cydriver.CUgraphNode _node # NULL for entry node tuple _pred_cache tuple _succ_cache object __weakref__ diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 480a898a8f..f89acba1cd 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -10,7 +10,7 @@ graphs explicitly (as opposed to stream capture). Both approaches produce the same public Graph type for execution. Node hierarchy: - Node (base — also used for the virtual root) + Node (base — also used for the internal entry point) ├── EmptyNode (synchronization / join point) ├── KernelNode (kernel launch) ├── AllocNode (memory allocation, exposes dptr and bytesize) @@ -39,6 +39,7 @@ from cuda.core._resource_handles cimport ( create_graph_handle, as_cu, as_intptr, + as_py, ) from cuda.core._event cimport Event from cuda.core._module cimport Kernel @@ -228,7 +229,7 @@ cdef class GraphDef: Returns ------- tuple of Node - All nodes in the graph (excluding the virtual root). + All nodes in the graph. """ cdef cydriver.CUgraph graph = as_cu(self._h_graph) cdef size_t num_nodes = 0 @@ -281,7 +282,7 @@ cdef class GraphDef: @property def handle(self): """Return the underlying CUgraph handle.""" - return driver.CUgraph(as_intptr(self._h_graph)) + return as_py(self._h_graph) cdef class Node: @@ -327,7 +328,7 @@ cdef class Node: def __repr__(self): if self._node == NULL: - return "" + return "" return f"self._node:x}>" def __eq__(self, other): @@ -347,7 +348,7 @@ cdef class Node: Returns ------- CUgraphNodeType or None - The node type enum value, or None for the virtual root node. + The node type enum value, or None for the entry node. 
""" if self._node == NULL: return None @@ -361,6 +362,16 @@ cdef class Node: """Return the GraphDef this node belongs to.""" return GraphDef._from_handle(self._h_graph) + @property + def handle(self): + """Return the underlying CUgraphNode handle as an int. + + Returns None for the entry node. + """ + if self._node == NULL: + return None + return self._node + @property def pred(self): """Return the predecessor nodes (dependencies) of this node. @@ -783,7 +794,8 @@ cdef class EmptyNode(Node): return n def __repr__(self): - return f"self._node:x}>" + cdef Py_ssize_t n = len(self.pred) + return f"" cdef class KernelNode(Node): @@ -831,7 +843,7 @@ cdef class KernelNode(Node): params.kern) def __repr__(self): - return f"self._node:x}>" + return (f"") @property def grid(self): @@ -926,8 +938,7 @@ cdef class AllocNode(Node): params.poolProps.location.id, memory_type, tuple(peer_ids)) def __repr__(self): - return (f"self._node:x} " - f"dptr=0x{self._dptr:x} size={self._bytesize}>") + return f"" @property def dptr(self): @@ -992,7 +1003,7 @@ cdef class FreeNode(Node): return FreeNode._create_with_params(h_graph, node, dptr) def __repr__(self): - return f"self._node:x} dptr=0x{self._dptr:x}>" + return f"" @property def dptr(self): @@ -1047,8 +1058,8 @@ cdef class MemsetNode(Node): params.elementSize, params.width, params.height, params.pitch) def __repr__(self): - return (f"self._node:x} " - f"dptr=0x{self._dptr:x}>") + return (f"") @property def dptr(self): @@ -1109,7 +1120,7 @@ cdef class EventRecordNode(Node): return EventRecordNode._create_with_params(h_graph, node, event) def __repr__(self): - return f"self._node:x}>" + return f"self._event:x}>" @property def event(self): @@ -1145,7 +1156,7 @@ cdef class EventWaitNode(Node): return EventWaitNode._create_with_params(h_graph, node, event) def __repr__(self): - return f"self._node:x}>" + return f"self._event:x}>" @property def event(self): diff --git a/cuda_core/tests/test_object_protocols.py 
b/cuda_core/tests/test_object_protocols.py index 008f98a53c..33f479dc83 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -222,13 +222,13 @@ def sample_graphdef_alt(init_cuda): @pytest.fixture def sample_root_node(sample_graphdef): - """A root Node (virtual, NULL handle).""" + """An entry Node (virtual, NULL handle).""" return sample_graphdef._entry @pytest.fixture def sample_root_node_alt(sample_graphdef_alt): - """An alternate root Node from different graph.""" + """An alternate entry Node from different graph.""" return sample_graphdef_alt._entry @@ -459,14 +459,14 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): ("sample_program_nvvm", r""), # Graph types ("sample_graphdef", r""), - ("sample_root_node", r""), - ("sample_empty_node", r""), - ("sample_alloc_node", r""), - ("sample_kernel_node", r""), - ("sample_free_node", r""), - ("sample_memset_node", r""), - ("sample_event_record_node", r""), - ("sample_event_wait_node", r""), + ("sample_root_node", r""), + ("sample_empty_node", r""), + ("sample_alloc_node", r""), + ("sample_kernel_node", r""), + ("sample_free_node", r""), + ("sample_memset_node", r""), + ("sample_event_record_node", r""), + ("sample_event_wait_node", r""), ] From 3e25c641ace589faef5166d681eb2d827053ee2e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 Mar 2026 08:09:34 -0800 Subject: [PATCH 09/23] Add MemcpyNode with auto-detected memory types Simple 1D memcpy interface: Node.memcpy(dst, src, size) auto-detects host vs device memory via cuPointerGetAttribute, falling back to device type for graph-allocated pointers. Includes MemcpyNode subclass with dst/src/size properties, GraphDef.memcpy forwarding, execution test verifying data correctness, and object protocol coverage. 
Made-with: Cursor --- cuda_core/cuda/core/_graph/_graphdef.pxd | 19 +++ cuda_core/cuda/core/_graph/_graphdef.pyx | 164 +++++++++++++++++++++++ cuda_core/tests/graph/test_explicit.py | 41 ++++++ cuda_core/tests/test_object_protocols.py | 23 ++++ 4 files changed, 247 insertions(+) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index dc3e88ce81..e2ad8b67ec 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -15,6 +15,7 @@ cdef class KernelNode(Node) cdef class AllocNode(Node) cdef class FreeNode(Node) cdef class MemsetNode(Node) +cdef class MemcpyNode(Node) cdef class EventRecordNode(Node) cdef class EventWaitNode(Node) @@ -109,6 +110,24 @@ cdef class MemsetNode(Node): cdef MemsetNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) +cdef class MemcpyNode(Node): + cdef: + cydriver.CUdeviceptr _dst + cydriver.CUdeviceptr _src + size_t _size + cydriver.CUmemorytype _dst_type + cydriver.CUmemorytype _src_type + + @staticmethod + cdef MemcpyNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUdeviceptr dst, cydriver.CUdeviceptr src, + size_t size, cydriver.CUmemorytype dst_type, + cydriver.CUmemorytype src_type) + + @staticmethod + cdef MemcpyNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + + cdef class EventRecordNode(Node): cdef: cydriver.CUevent _event diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index f89acba1cd..a3af97b2cf 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -16,6 +16,7 @@ Node hierarchy: ├── AllocNode (memory allocation, exposes dptr and bytesize) ├── FreeNode (memory free, exposes dptr) ├── MemsetNode (memory set, exposes dptr, value, element_size, etc.) 
+ ├── MemcpyNode (memory copy, exposes dst, src, size) ├── EventRecordNode (record an event) └── EventWaitNode (wait for an event) """ @@ -170,6 +171,13 @@ cdef class GraphDef: """ return self._entry.join(*nodes) + def memcpy(self, dst, src, size_t size): + """Add an entry-point memcpy node (no dependencies). + + See :meth:`Node.memcpy` for full documentation. + """ + return self._entry.memcpy(dst, src, size) + def record_event(self, event): """Add an entry-point event record node (no dependencies). @@ -316,6 +324,8 @@ cdef class Node: return FreeNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEMSET: return MemsetNode._create_from_driver(h_graph, node) + elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEMCPY: + return MemcpyNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_EVENT_RECORD: return EventRecordNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_WAIT_EVENT: @@ -715,6 +725,87 @@ cdef class Node: self._h_graph, new_node, c_dst, val, elem_size, width, height, pitch) + def memcpy(self, dst, src, size_t size): + """Add a memcpy node depending on this node. + + Copies ``size`` bytes from ``src`` to ``dst``. Memory types are + auto-detected via the driver, so both device and pinned host + pointers are supported. + + Parameters + ---------- + dst : int + Destination pointer (device or pinned host). + src : int + Source pointer (device or pinned host). + size : int + Number of bytes to copy. + + Returns + ------- + MemcpyNode + A new MemcpyNode representing the copy operation. 
+ """ + cdef cydriver.CUdeviceptr c_dst = dst + cdef cydriver.CUdeviceptr c_src = src + + cdef unsigned int dst_mem_type = cydriver.CU_MEMORYTYPE_DEVICE + cdef unsigned int src_mem_type = cydriver.CU_MEMORYTYPE_DEVICE + cdef cydriver.CUresult ret + with nogil: + ret = cydriver.cuPointerGetAttribute( + &dst_mem_type, + cydriver.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + c_dst) + if ret != cydriver.CUDA_SUCCESS and ret != cydriver.CUDA_ERROR_INVALID_VALUE: + HANDLE_RETURN(ret) + ret = cydriver.cuPointerGetAttribute( + &src_mem_type, + cydriver.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + c_src) + if ret != cydriver.CUDA_SUCCESS and ret != cydriver.CUDA_ERROR_INVALID_VALUE: + HANDLE_RETURN(ret) + + cdef cydriver.CUmemorytype c_dst_type = dst_mem_type + cdef cydriver.CUmemorytype c_src_type = src_mem_type + + cdef cydriver.CUDA_MEMCPY3D params + c_memset(¶ms, 0, sizeof(params)) + + params.srcMemoryType = c_src_type + params.dstMemoryType = c_dst_type + if c_src_type == cydriver.CU_MEMORYTYPE_HOST: + params.srcHost = c_src + else: + params.srcDevice = c_src + if c_dst_type == cydriver.CU_MEMORYTYPE_HOST: + params.dstHost = c_dst + else: + params.dstDevice = c_dst + params.WidthInBytes = size + params.Height = 1 + params.Depth = 1 + + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + + if self._node != NULL: + deps = &self._node + num_deps = 1 + + cdef cydriver.CUcontext ctx = NULL + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + HANDLE_RETURN(cydriver.cuGraphAddMemcpyNode( + &new_node, graph, deps, num_deps, ¶ms, ctx)) + + self._succ_cache = None + return MemcpyNode._create_with_params( + self._h_graph, new_node, c_dst, c_src, size, + c_dst_type, c_src_type) + def record_event(self, event): """Add an event record node depending on this node. 
@@ -1092,6 +1183,79 @@ cdef class MemsetNode(Node): return self._pitch +cdef class MemcpyNode(Node): + """A memory copy node. + + Properties + ---------- + dst : int + The destination pointer. + src : int + The source pointer. + size : int + The number of bytes copied. + """ + + @staticmethod + cdef MemcpyNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cydriver.CUdeviceptr dst, cydriver.CUdeviceptr src, + size_t size, cydriver.CUmemorytype dst_type, + cydriver.CUmemorytype src_type): + """Create from known params (called by memcpy() builder).""" + cdef MemcpyNode n = MemcpyNode.__new__(MemcpyNode) + n._h_graph = h_graph + n._node = node + n._dst = dst + n._src = src + n._size = size + n._dst_type = dst_type + n._src_type = src_type + return n + + @staticmethod + cdef MemcpyNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUDA_MEMCPY3D params + with nogil: + HANDLE_RETURN(cydriver.cuGraphMemcpyNodeGetParams(node, ¶ms)) + + cdef cydriver.CUdeviceptr dst + cdef cydriver.CUdeviceptr src + if params.dstMemoryType == cydriver.CU_MEMORYTYPE_HOST: + dst = params.dstHost + else: + dst = params.dstDevice + if params.srcMemoryType == cydriver.CU_MEMORYTYPE_HOST: + src = params.srcHost + else: + src = params.srcDevice + + return MemcpyNode._create_with_params( + h_graph, node, dst, src, params.WidthInBytes, + params.dstMemoryType, params.srcMemoryType) + + def __repr__(self): + cdef str dt = "H" if self._dst_type == cydriver.CU_MEMORYTYPE_HOST else "D" + cdef str st = "H" if self._src_type == cydriver.CU_MEMORYTYPE_HOST else "D" + return (f"") + + @property + def dst(self): + """The destination pointer.""" + return self._dst + + @property + def src(self): + """The source pointer.""" + return self._src + + @property + def size(self): + """The number of bytes copied.""" + return self._size + + cdef class EventRecordNode(Node): """An 
event record node. diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py index d6c9bb89b0..c42408b665 100644 --- a/cuda_core/tests/graph/test_explicit.py +++ b/cuda_core/tests/graph/test_explicit.py @@ -20,6 +20,7 @@ GraphAllocOptions, GraphDef, KernelNode, + MemcpyNode, MemsetNode, Node, ) @@ -318,6 +319,18 @@ def _build_event_wait_node(g): } +def _build_memcpy_node(g): + src_alloc = g.alloc(ALLOC_SIZE) + dst_alloc = g.alloc(ALLOC_SIZE) + dep = g.join(src_alloc, dst_alloc) + node = dep.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE) + return node, { + "dst": dst_alloc.dptr, + "src": src_alloc.dptr, + "size": ALLOC_SIZE, + } + + _NODE_SPECS = [ pytest.param(NodeSpec("empty", EmptyNode, "CU_GRAPH_NODE_TYPE_EMPTY", _build_empty_node), id="empty"), pytest.param(NodeSpec("kernel", KernelNode, "CU_GRAPH_NODE_TYPE_KERNEL", _build_kernel_node), id="kernel"), @@ -335,6 +348,10 @@ def _build_event_wait_node(g): NodeSpec("memset_u32", MemsetNode, "CU_GRAPH_NODE_TYPE_MEMSET", _build_memset_node_u32), id="memset_u32" ), pytest.param(NodeSpec("memset_2d", MemsetNode, "CU_GRAPH_NODE_TYPE_MEMSET", _build_memset_node_2d), id="memset_2d"), + pytest.param( + NodeSpec("memcpy", MemcpyNode, "CU_GRAPH_NODE_TYPE_MEMCPY", _build_memcpy_node), + id="memcpy", + ), pytest.param( NodeSpec("event_record", EventRecordNode, "CU_GRAPH_NODE_TYPE_EVENT_RECORD", _build_event_record_node), id="event_record", @@ -667,6 +684,30 @@ def test_instantiate_and_execute_memset(sample_graphdef): stream.sync() +def test_instantiate_and_execute_memcpy(sample_graphdef): + """Graph with alloc/memset/memcpy/free can be executed and data is copied.""" + import ctypes + + src_alloc = sample_graphdef.alloc(ALLOC_SIZE) + dst_alloc = sample_graphdef.alloc(ALLOC_SIZE) + dep = sample_graphdef.join(src_alloc, dst_alloc) + ms = dep.memset(src_alloc.dptr, 0xAB, ALLOC_SIZE) + cp = ms.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE) + cp.free(src_alloc.dptr) + + graph = 
sample_graphdef.instantiate() + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + host_buf = (ctypes.c_ubyte * ALLOC_SIZE)() + from cuda.bindings import driver as drv + + drv.cuMemcpyDtoH(host_buf, dst_alloc.dptr, ALLOC_SIZE) + assert all(b == 0xAB for b in host_buf) + + def test_instantiate_and_execute_event_record_wait(sample_graphdef): """Graph with event record and wait nodes can be executed.""" event = Device().create_event() diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index 33f479dc83..5efce36c98 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -306,6 +306,24 @@ def sample_memset_node_alt(sample_graphdef): return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) +@pytest.fixture +def sample_memcpy_node(sample_graphdef): + """A MemcpyNode.""" + src = sample_graphdef.alloc(ALLOC_SIZE) + dst = sample_graphdef.alloc(ALLOC_SIZE) + dep = sample_graphdef.join(src, dst) + return dep.memcpy(dst.dptr, src.dptr, ALLOC_SIZE) + + +@pytest.fixture +def sample_memcpy_node_alt(sample_graphdef): + """An alternate MemcpyNode from same graph.""" + src = sample_graphdef.alloc(ALLOC_SIZE) + dst = sample_graphdef.alloc(ALLOC_SIZE) + dep = sample_graphdef.join(src, dst) + return dep.memcpy(dst.dptr, src.dptr, ALLOC_SIZE) + + @pytest.fixture def sample_event_record_node(sample_graphdef, sample_device): """An EventRecordNode.""" @@ -355,6 +373,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): "sample_kernel_node", "sample_free_node", "sample_memset_node", + "sample_memcpy_node", "sample_event_record_node", "sample_event_wait_node", ] @@ -376,6 +395,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): "sample_kernel_node", "sample_free_node", "sample_memset_node", + "sample_memcpy_node", "sample_event_record_node", "sample_event_wait_node", ] @@ -398,6 +418,7 @@ def 
sample_event_wait_node_alt(sample_graphdef, sample_device): "sample_kernel_node", "sample_free_node", "sample_memset_node", + "sample_memcpy_node", "sample_event_record_node", "sample_event_wait_node", ] @@ -420,6 +441,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): ("sample_kernel_node", "sample_kernel_node_alt"), ("sample_free_node", "sample_free_node_alt"), ("sample_memset_node", "sample_memset_node_alt"), + ("sample_memcpy_node", "sample_memcpy_node_alt"), ("sample_event_record_node", "sample_event_record_node_alt"), ("sample_event_wait_node", "sample_event_wait_node_alt"), ] @@ -465,6 +487,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): ("sample_kernel_node", r""), ("sample_free_node", r""), ("sample_memset_node", r""), + ("sample_memcpy_node", r""), ("sample_event_record_node", r""), ("sample_event_wait_node", r""), ] From 228de3892758875984f6e3c2171021a979408304 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 Mar 2026 08:42:09 -0800 Subject: [PATCH 10/23] Add ChildGraphNode with embed() builder and non-owning graph handle Node.embed(child) clones a GraphDef as a sub-graph node. Adds create_graph_handle_ref for non-owning graph handles (child graph is owned by the node, not the wrapper). ChildGraphNode exposes child_graph property and shows subnode count in repr. 
Made-with: Cursor --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 5 + cuda_core/cuda/core/_cpp/resource_handles.hpp | 5 + cuda_core/cuda/core/_graph/_graphdef.pxd | 13 +++ cuda_core/cuda/core/_graph/_graphdef.pyx | 95 +++++++++++++++++++ cuda_core/cuda/core/_resource_handles.pxd | 1 + cuda_core/cuda/core/_resource_handles.pyx | 2 + cuda_core/tests/graph/test_explicit.py | 35 +++++++ cuda_core/tests/test_object_protocols.py | 25 +++++ 8 files changed, 181 insertions(+) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 502ad8315b..4d65b74506 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -842,6 +842,11 @@ GraphHandle create_graph_handle(CUgraph graph) { return GraphHandle(box, &box->resource); } +GraphHandle create_graph_handle_ref(CUgraph graph) { + auto box = std::make_shared(GraphBox{graph}); + return GraphHandle(box, &box->resource); +} + // ============================================================================ // Graphics Resource Handles // ============================================================================ diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 60cdb95808..5fd2445112 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -329,6 +329,11 @@ KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_li // The caller must have already created the graph via cuGraphCreate. GraphHandle create_graph_handle(CUgraph graph); +// Create a non-owning graph handle (references existing graph). +// Use for graphs that are managed by a child graph node or another owner. +// The graph will NOT be destroyed when the handle is released. 
+GraphHandle create_graph_handle_ref(CUgraph graph); + // ============================================================================ // Graphics resource handle functions // ============================================================================ diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index e2ad8b67ec..55f77576a8 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -16,6 +16,7 @@ cdef class AllocNode(Node) cdef class FreeNode(Node) cdef class MemsetNode(Node) cdef class MemcpyNode(Node) +cdef class ChildGraphNode(Node) cdef class EventRecordNode(Node) cdef class EventWaitNode(Node) @@ -128,6 +129,18 @@ cdef class MemcpyNode(Node): cdef MemcpyNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) +cdef class ChildGraphNode(Node): + cdef: + GraphHandle _h_child_graph + + @staticmethod + cdef ChildGraphNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + GraphHandle h_child_graph) + + @staticmethod + cdef ChildGraphNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + + cdef class EventRecordNode(Node): cdef: cydriver.CUevent _event diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index a3af97b2cf..4007803f78 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -17,6 +17,7 @@ Node hierarchy: ├── FreeNode (memory free, exposes dptr) ├── MemsetNode (memory set, exposes dptr, value, element_size, etc.) 
├── MemcpyNode (memory copy, exposes dst, src, size) + ├── ChildGraphNode (embedded sub-graph) ├── EventRecordNode (record an event) └── EventWaitNode (wait for an event) """ @@ -38,6 +39,7 @@ from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport ( GraphHandle, create_graph_handle, + create_graph_handle_ref, as_cu, as_intptr, as_py, @@ -178,6 +180,13 @@ cdef class GraphDef: """ return self._entry.memcpy(dst, src, size) + def embed(self, child): + """Add an entry-point child graph node (no dependencies). + + See :meth:`Node.embed` for full documentation. + """ + return self._entry.embed(child) + def record_event(self, event): """Add an entry-point event record node (no dependencies). @@ -326,6 +335,8 @@ cdef class Node: return MemsetNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEMCPY: return MemcpyNode._create_from_driver(h_graph, node) + elif node_type == cydriver.CU_GRAPH_NODE_TYPE_GRAPH: + return ChildGraphNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_EVENT_RECORD: return EventRecordNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_WAIT_EVENT: @@ -806,6 +817,48 @@ cdef class Node: self._h_graph, new_node, c_dst, c_src, size, c_dst_type, c_src_type) + def embed(self, child): + """Add a child graph node depending on this node. + + Embeds a clone of the given graph definition as a sub-graph node. + The child graph must not contain allocation, free, or conditional + nodes. + + Parameters + ---------- + child : GraphDef + The graph definition to embed (will be cloned). + + Returns + ------- + ChildGraphNode + A new ChildGraphNode representing the embedded sub-graph. 
+ """ + cdef GraphDef child_def = child + cdef cydriver.CUgraph child_graph = as_cu(child_def._h_graph) + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + + if self._node != NULL: + deps = &self._node + num_deps = 1 + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddChildGraphNode( + &new_node, graph, deps, num_deps, child_graph)) + + cdef cydriver.CUgraph embedded_graph = NULL + with nogil: + HANDLE_RETURN(cydriver.cuGraphChildGraphNodeGetGraph( + new_node, &embedded_graph)) + + cdef GraphHandle h_embedded = create_graph_handle_ref(embedded_graph) + + self._succ_cache = None + return ChildGraphNode._create_with_params(self._h_graph, new_node, h_embedded) + def record_event(self, event): """Add an event record node depending on this node. @@ -1256,6 +1309,48 @@ cdef class MemcpyNode(Node): return self._size +cdef class ChildGraphNode(Node): + """A child graph (sub-graph) node. + + Properties + ---------- + child_graph : GraphDef + The embedded graph definition (non-owning wrapper). 
+ """ + + @staticmethod + cdef ChildGraphNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + GraphHandle h_child_graph): + """Create from known params (called by embed() builder).""" + cdef ChildGraphNode n = ChildGraphNode.__new__(ChildGraphNode) + n._h_graph = h_graph + n._node = node + n._h_child_graph = h_child_graph + return n + + @staticmethod + cdef ChildGraphNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraph child_graph = NULL + with nogil: + HANDLE_RETURN(cydriver.cuGraphChildGraphNodeGetGraph(node, &child_graph)) + cdef GraphHandle h_child = create_graph_handle_ref(child_graph) + return ChildGraphNode._create_with_params(h_graph, node, h_child) + + def __repr__(self): + cdef cydriver.CUgraph g = as_cu(self._h_child_graph) + cdef size_t num_nodes = 0 + with nogil: + HANDLE_RETURN(cydriver.cuGraphGetNodes(g, NULL, &num_nodes)) + cdef Py_ssize_t n = num_nodes + return f"" + + @property + def child_graph(self): + """The embedded graph definition (non-owning wrapper).""" + return GraphDef._from_handle(self._h_child_graph) + + cdef class EventRecordNode(Node): """An event record node. 
diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 2635f41b2c..58ab1d9abe 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -143,6 +143,7 @@ cdef KernelHandle create_kernel_handle_ref( # Graph handles cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil +cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph) except+ nogil # Graphics resource handles cdef GraphicsResourceHandle create_graphics_resource_handle( diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 7d8bb3d837..ac0410c49a 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -130,6 +130,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Graph handles GraphHandle create_graph_handle "cuda_core::create_graph_handle" ( cydriver.CUgraph graph) except+ nogil + GraphHandle create_graph_handle_ref "cuda_core::create_graph_handle_ref" ( + cydriver.CUgraph graph) except+ nogil # Graphics resource handles GraphicsResourceHandle create_graphics_resource_handle "cuda_core::create_graphics_resource_handle" ( diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py index c42408b665..33320a4072 100644 --- a/cuda_core/tests/graph/test_explicit.py +++ b/cuda_core/tests/graph/test_explicit.py @@ -13,6 +13,7 @@ from cuda.core._graph import GraphDebugPrintOptions from cuda.core._graph._graphdef import ( AllocNode, + ChildGraphNode, EmptyNode, EventRecordNode, EventWaitNode, @@ -331,6 +332,19 @@ def _build_memcpy_node(g): } +def _build_child_graph_node(g): + child = GraphDef() + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + child.launch(config, kernel) + child.launch(config, kernel) + node = g.embed(child) + return node, { + "child_graph": lambda v: 
isinstance(v, GraphDef) and len(v.nodes()) == 2, + } + + _NODE_SPECS = [ pytest.param(NodeSpec("empty", EmptyNode, "CU_GRAPH_NODE_TYPE_EMPTY", _build_empty_node), id="empty"), pytest.param(NodeSpec("kernel", KernelNode, "CU_GRAPH_NODE_TYPE_KERNEL", _build_kernel_node), id="kernel"), @@ -352,6 +366,10 @@ def _build_memcpy_node(g): NodeSpec("memcpy", MemcpyNode, "CU_GRAPH_NODE_TYPE_MEMCPY", _build_memcpy_node), id="memcpy", ), + pytest.param( + NodeSpec("child_graph", ChildGraphNode, "CU_GRAPH_NODE_TYPE_GRAPH", _build_child_graph_node), + id="child_graph", + ), pytest.param( NodeSpec("event_record", EventRecordNode, "CU_GRAPH_NODE_TYPE_EVENT_RECORD", _build_event_record_node), id="event_record", @@ -708,6 +726,23 @@ def test_instantiate_and_execute_memcpy(sample_graphdef): assert all(b == 0xAB for b in host_buf) +def test_instantiate_and_execute_child_graph(sample_graphdef): + """Graph with embedded child graph can be executed.""" + child = GraphDef() + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + child.launch(config, kernel) + + sample_graphdef.embed(child) + graph = sample_graphdef.instantiate() + + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + def test_instantiate_and_execute_event_record_wait(sample_graphdef): """Graph with event record and wait nodes can be executed.""" event = Device().create_event() diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index 5efce36c98..e617567cbb 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -324,6 +324,26 @@ def sample_memcpy_node_alt(sample_graphdef): return dep.memcpy(dst.dptr, src.dptr, ALLOC_SIZE) +@pytest.fixture +def sample_child_graph_node(sample_graphdef): + """A ChildGraphNode.""" + child = GraphDef() + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + 
child.launch(LaunchConfig(grid=1, block=1), kernel) + return sample_graphdef.embed(child) + + +@pytest.fixture +def sample_child_graph_node_alt(sample_graphdef): + """An alternate ChildGraphNode from same graph.""" + child = GraphDef() + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + child.launch(LaunchConfig(grid=1, block=1), kernel) + return sample_graphdef.embed(child) + + @pytest.fixture def sample_event_record_node(sample_graphdef, sample_device): """An EventRecordNode.""" @@ -374,6 +394,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): "sample_free_node", "sample_memset_node", "sample_memcpy_node", + "sample_child_graph_node", "sample_event_record_node", "sample_event_wait_node", ] @@ -396,6 +417,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): "sample_free_node", "sample_memset_node", "sample_memcpy_node", + "sample_child_graph_node", "sample_event_record_node", "sample_event_wait_node", ] @@ -419,6 +441,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): "sample_free_node", "sample_memset_node", "sample_memcpy_node", + "sample_child_graph_node", "sample_event_record_node", "sample_event_wait_node", ] @@ -442,6 +465,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): ("sample_free_node", "sample_free_node_alt"), ("sample_memset_node", "sample_memset_node_alt"), ("sample_memcpy_node", "sample_memcpy_node_alt"), + ("sample_child_graph_node", "sample_child_graph_node_alt"), ("sample_event_record_node", "sample_event_record_node_alt"), ("sample_event_wait_node", "sample_event_wait_node_alt"), ] @@ -488,6 +512,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): ("sample_free_node", r""), ("sample_memset_node", r""), ("sample_memcpy_node", r""), + ("sample_child_graph_node", r""), ("sample_event_record_node", r""), ("sample_event_wait_node", r""), ] From 506850b72e9647069c50137ed75146742917071c Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 
Mar 2026 10:58:59 -0800 Subject: [PATCH 11/23] Add HostCallbackNode with Python callable and ctypes CFUNCTYPE support Implements host callback graph nodes supporting two modes: - Python callable: GIL acquired via trampoline, nullary callbacks - ctypes CFUNCTYPE: raw C function pointer with optional user_data (bytes copied to graph-managed buffer, or raw int passthrough) Uses CUDA user objects to tie callback/data lifetime to the graph. Made-with: Cursor --- cuda_core/cuda/core/_graph/_graphdef.pxd | 16 ++ cuda_core/cuda/core/_graph/_graphdef.pyx | 195 ++++++++++++++++++++++- cuda_core/tests/graph/test_explicit.py | 100 ++++++++++++ cuda_core/tests/test_object_protocols.py | 25 +++ 4 files changed, 334 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index 55f77576a8..18bed46808 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -19,6 +19,7 @@ cdef class MemcpyNode(Node) cdef class ChildGraphNode(Node) cdef class EventRecordNode(Node) cdef class EventWaitNode(Node) +cdef class HostCallbackNode(Node) cdef class GraphDef: @@ -163,3 +164,18 @@ cdef class EventWaitNode(Node): @staticmethod cdef EventWaitNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + + +cdef class HostCallbackNode(Node): + cdef: + object _callable + cydriver.CUhostFn _fn + void* _user_data + + @staticmethod + cdef HostCallbackNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + object callable_obj, cydriver.CUhostFn fn, + void* user_data) + + @staticmethod + cdef HostCallbackNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 4007803f78..c8b3eff2ef 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -19,7 +19,8 @@ Node hierarchy: ├── MemcpyNode (memory 
copy, exposes dst, src, size) ├── ChildGraphNode (embedded sub-graph) ├── EventRecordNode (record an event) - └── EventWaitNode (wait for an event) + ├── EventWaitNode (wait for an event) + └── HostCallbackNode (host CPU callback) """ from dataclasses import dataclass @@ -28,9 +29,12 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: from cuda.core import Device +from cpython.ref cimport Py_INCREF + from libc.stddef cimport size_t from libc.stdint cimport uintptr_t -from libc.string cimport memset as c_memset +from libc.stdlib cimport malloc, free +from libc.string cimport memset as c_memset, memcpy as c_memcpy from libcpp.vector cimport vector @@ -53,6 +57,44 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value from cuda.core._utils.cuda_utils import driver +cdef extern from "Python.h": + void _py_decref "Py_DECREF" (void*) + + +cdef void _py_host_trampoline(void* data) noexcept with gil: + (data)() + + +cdef void _py_host_destructor(void* data) noexcept with gil: + _py_decref(data) + + +cdef void _attach_user_object( + cydriver.CUgraph graph, void* ptr, + cydriver.CUhostFn destroy) except *: + """Create a CUDA user object and transfer ownership to the graph. + + On success the graph owns the resource (via MOVE semantics). + On failure the destroy callback is invoked to clean up ptr, + then a CUDAError is raised — callers need no try/except. + """ + cdef cydriver.CUuserObject user_obj = NULL + cdef cydriver.CUresult ret + with nogil: + ret = cydriver.cuUserObjectCreate( + &user_obj, ptr, destroy, 1, + cydriver.CU_USER_OBJECT_NO_DESTRUCTOR_SYNC) + if ret == cydriver.CUDA_SUCCESS: + ret = cydriver.cuGraphRetainUserObject( + graph, user_obj, 1, cydriver.CU_GRAPH_USER_OBJECT_MOVE) + if ret != cydriver.CUDA_SUCCESS: + cydriver.cuUserObjectRelease(user_obj, 1) + if ret != cydriver.CUDA_SUCCESS: + if user_obj == NULL: + destroy(ptr) + HANDLE_RETURN(ret) + + @dataclass class GraphAllocOptions: """Options for graph memory allocation nodes. 
@@ -201,6 +243,13 @@ cdef class GraphDef: """ return self._entry.wait_event(event) + def callback(self, fn, *, user_data=None): + """Add an entry-point host callback node (no dependencies). + + See :meth:`Node.callback` for full documentation. + """ + return self._entry.callback(fn, user_data=user_data) + def instantiate(self): """Instantiate the graph definition into an executable Graph. @@ -341,6 +390,8 @@ cdef class Node: return EventRecordNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_WAIT_EVENT: return EventWaitNode._create_from_driver(h_graph, node) + elif node_type == cydriver.CU_GRAPH_NODE_TYPE_HOST: + return HostCallbackNode._create_from_driver(h_graph, node) else: n = Node.__new__(Node) (n)._h_graph = h_graph @@ -921,6 +972,96 @@ cdef class Node: self._succ_cache = None return EventWaitNode._create_with_params(self._h_graph, new_node, c_event) + def callback(self, fn, *, user_data=None): + """Add a host callback node depending on this node. + + The callback runs on the host CPU when the graph reaches this node. + Two modes are supported: + + - **Python callable**: Pass any callable. The GIL is acquired + automatically. The callable must take no arguments; use closures + or ``functools.partial`` to bind state. + - **ctypes function pointer**: Pass a ``ctypes.CFUNCTYPE`` instance. + The function receives a single ``void*`` argument (the + ``user_data``). The caller must keep the ctypes wrapper alive + for the lifetime of the graph. + + .. warning:: + + Callbacks must not call CUDA API functions. Doing so may + deadlock or corrupt driver state. + + Parameters + ---------- + fn : callable or ctypes function pointer + The callback function. + user_data : int or bytes-like, optional + Only for ctypes function pointers. If ``int``, passed as a raw + pointer (caller manages lifetime). If bytes-like, the data is + copied and its lifetime is tied to the graph. 
+ + Returns + ------- + HostCallbackNode + A new HostCallbackNode representing the callback. + """ + import ctypes as ct + + cdef cydriver.CUDA_HOST_NODE_PARAMS node_params + cdef cydriver.CUgraphNode new_node = NULL + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + cdef void* c_user_data = NULL + cdef object callable_obj = None + cdef void* fn_pyobj = NULL + + if self._node != NULL: + deps = &self._node + num_deps = 1 + + if isinstance(fn, ct._CFuncPtr): + node_params.fn = ct.cast( + fn, ct.c_void_p).value + + if user_data is not None: + if isinstance(user_data, int): + c_user_data = user_data + else: + buf = bytes(user_data) + c_user_data = malloc(len(buf)) + if c_user_data == NULL: + raise MemoryError( + "failed to allocate user_data buffer") + c_memcpy(c_user_data, buf, len(buf)) + _attach_user_object( + graph, c_user_data, + free) + + node_params.userData = c_user_data + else: + if user_data is not None: + raise ValueError( + "user_data is only supported with ctypes " + "function pointers") + callable_obj = fn + Py_INCREF(fn) + fn_pyobj = fn + node_params.fn = _py_host_trampoline + node_params.userData = fn_pyobj + _attach_user_object( + graph, fn_pyobj, + _py_host_destructor) + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddHostNode( + &new_node, graph, deps, num_deps, &node_params)) + + self._succ_cache = None + return HostCallbackNode._create_with_params( + self._h_graph, new_node, callable_obj, + node_params.fn, node_params.userData) + # ============================================================================= # Node subclasses @@ -1421,3 +1562,53 @@ cdef class EventWaitNode(Node): def event(self): """The event being waited on (non-owning wrapper).""" return Event._from_handle(self._event) + + +cdef class HostCallbackNode(Node): + """A host callback node. 
+ + Properties + ---------- + callback_fn : callable or None + The Python callable (None for ctypes function pointer callbacks). + """ + + @staticmethod + cdef HostCallbackNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + object callable_obj, cydriver.CUhostFn fn, + void* user_data): + """Create from known params (called by callback() builder).""" + cdef HostCallbackNode n = HostCallbackNode.__new__(HostCallbackNode) + n._h_graph = h_graph + n._node = node + n._callable = callable_obj + n._fn = fn + n._user_data = user_data + return n + + @staticmethod + cdef HostCallbackNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUDA_HOST_NODE_PARAMS params + with nogil: + HANDLE_RETURN(cydriver.cuGraphHostNodeGetParams(node, ¶ms)) + + cdef object callable_obj = None + if params.fn == _py_host_trampoline: + # cast Py_INCREFs — HostCallbackNode holds its own + # reference, independent of the user object's reference. 
+ callable_obj = params.userData + + return HostCallbackNode._create_with_params( + h_graph, node, callable_obj, params.fn, params.userData) + + def __repr__(self): + if self._callable is not None: + name = getattr(self._callable, '__name__', '?') + return f"" + return f"self._fn:x}>" + + @property + def callback_fn(self): + """The Python callable, or None for ctypes function pointer callbacks.""" + return self._callable diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py index 33320a4072..d95c1d0dc9 100644 --- a/cuda_core/tests/graph/test_explicit.py +++ b/cuda_core/tests/graph/test_explicit.py @@ -20,6 +20,7 @@ FreeNode, GraphAllocOptions, GraphDef, + HostCallbackNode, KernelNode, MemcpyNode, MemsetNode, @@ -332,6 +333,29 @@ def _build_memcpy_node(g): } +def _build_host_callback_node(g): + def my_callback(): + pass + + node = g.callback(my_callback) + return node, { + "callback_fn": lambda v: v is my_callback, + } + + +def _build_host_callback_cfunc_node(g): + import ctypes + + CALLBACK = ctypes.CFUNCTYPE(None, ctypes.c_void_p) + + @CALLBACK + def noop(data): + pass + + node = g.callback(noop) + return node, {} + + def _build_child_graph_node(g): child = GraphDef() mod = compile_common_kernels() @@ -370,6 +394,14 @@ def _build_child_graph_node(g): NodeSpec("child_graph", ChildGraphNode, "CU_GRAPH_NODE_TYPE_GRAPH", _build_child_graph_node), id="child_graph", ), + pytest.param( + NodeSpec("host_callback", HostCallbackNode, "CU_GRAPH_NODE_TYPE_HOST", _build_host_callback_node), + id="host_callback", + ), + pytest.param( + NodeSpec("host_callback_cfunc", HostCallbackNode, "CU_GRAPH_NODE_TYPE_HOST", _build_host_callback_cfunc_node), + id="host_callback_cfunc", + ), pytest.param( NodeSpec("event_record", EventRecordNode, "CU_GRAPH_NODE_TYPE_EVENT_RECORD", _build_event_record_node), id="event_record", @@ -743,6 +775,74 @@ def test_instantiate_and_execute_child_graph(sample_graphdef): stream.sync() +def 
test_instantiate_and_execute_host_callback(sample_graphdef): + """Graph with host callback can be executed and callback is invoked.""" + results = [] + + def my_callback(): + results.append(42) + + sample_graphdef.callback(my_callback) + graph = sample_graphdef.instantiate() + + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + assert results == [42] + + +def test_instantiate_and_execute_host_callback_cfunc(sample_graphdef): + """Graph with ctypes function pointer callback can be executed.""" + import ctypes + + CALLBACK = ctypes.CFUNCTYPE(None, ctypes.c_void_p) + called = [False] + + @CALLBACK + def raw_fn(data): + called[0] = True + + sample_graphdef.callback(raw_fn) + graph = sample_graphdef.instantiate() + + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + assert called[0] + + +def test_host_callback_cfunc_with_user_data(sample_graphdef): + """Host callback with bytes user_data passes data to C function.""" + import ctypes + + CALLBACK = ctypes.CFUNCTYPE(None, ctypes.c_void_p) + result = [0] + + @CALLBACK + def read_byte(data): + result[0] = ctypes.cast(data, ctypes.POINTER(ctypes.c_uint8))[0] + + sample_graphdef.callback(read_byte, user_data=bytes([0xAB])) + graph = sample_graphdef.instantiate() + + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + assert result[0] == 0xAB + + +def test_host_callback_user_data_rejected_for_python_callable(sample_graphdef): + """user_data is rejected for Python callables.""" + with pytest.raises(ValueError, match="user_data is only supported"): + sample_graphdef.callback(lambda: None, user_data=b"hello") + + def test_instantiate_and_execute_event_record_wait(sample_graphdef): """Graph with event record and wait nodes can be executed.""" event = Device().create_event() diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index 
e617567cbb..b51252e991 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -372,6 +372,26 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): return sample_graphdef.wait_event(event) +@pytest.fixture +def sample_host_callback_node(sample_graphdef): + """A HostCallbackNode.""" + + def my_callback(): + pass + + return sample_graphdef.callback(my_callback) + + +@pytest.fixture +def sample_host_callback_node_alt(sample_graphdef): + """An alternate HostCallbackNode from same graph.""" + + def other_callback(): + pass + + return sample_graphdef.callback(other_callback) + + # ============================================================================= # Type groupings # ============================================================================= @@ -397,6 +417,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): "sample_child_graph_node", "sample_event_record_node", "sample_event_wait_node", + "sample_host_callback_node", ] # Types with __eq__ support @@ -420,6 +441,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): "sample_child_graph_node", "sample_event_record_node", "sample_event_wait_node", + "sample_host_callback_node", ] # Types with __weakref__ support @@ -444,6 +466,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): "sample_child_graph_node", "sample_event_record_node", "sample_event_wait_node", + "sample_host_callback_node", ] # Pairs of distinct objects of the same type (for inequality testing) @@ -468,6 +491,7 @@ def sample_event_wait_node_alt(sample_graphdef, sample_device): ("sample_child_graph_node", "sample_child_graph_node_alt"), ("sample_event_record_node", "sample_event_record_node_alt"), ("sample_event_wait_node", "sample_event_wait_node_alt"), + ("sample_host_callback_node", "sample_host_callback_node_alt"), ] # Types with public from_handle methods and how to create a copy @@ -515,6 +539,7 @@ def 
sample_event_wait_node_alt(sample_graphdef, sample_device): ("sample_child_graph_node", r""), ("sample_event_record_node", r""), ("sample_event_wait_node", r""), + ("sample_host_callback_node", r""), ] From 4ee0ed78536ccbb0ecd8eb0ecff58c6b17b0475d Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 Mar 2026 12:56:27 -0800 Subject: [PATCH 12/23] Fix dangling child graph references by capturing parent handle create_graph_handle_ref now takes a parent GraphHandle, keeping the parent graph alive while any child/branch graph handle exists. This prevents use-after-free when a ChildGraphNode outlives its parent GraphDef. Made-with: Cursor --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 7 ++++--- cuda_core/cuda/core/_cpp/resource_handles.hpp | 9 +++++---- cuda_core/cuda/core/_graph/_graphdef.pyx | 4 ++-- cuda_core/cuda/core/_resource_handles.pxd | 2 +- cuda_core/cuda/core/_resource_handles.pyx | 2 +- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 4d65b74506..61c4b652a4 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -827,12 +827,13 @@ KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_li namespace { struct GraphBox { CUgraph resource; + GraphHandle h_parent; // Keeps parent alive for child/branch graphs }; } // namespace GraphHandle create_graph_handle(CUgraph graph) { auto box = std::shared_ptr( - new GraphBox{graph}, + new GraphBox{graph, {}}, [](const GraphBox* b) { GILReleaseGuard gil; p_cuGraphDestroy(b->resource); @@ -842,8 +843,8 @@ GraphHandle create_graph_handle(CUgraph graph) { return GraphHandle(box, &box->resource); } -GraphHandle create_graph_handle_ref(CUgraph graph) { - auto box = std::make_shared(GraphBox{graph}); +GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent) { + auto box = std::make_shared(GraphBox{graph, 
h_parent}); return GraphHandle(box, &box->resource); } diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 5fd2445112..a8fa04b010 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -329,10 +329,11 @@ KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_li // The caller must have already created the graph via cuGraphCreate. GraphHandle create_graph_handle(CUgraph graph); -// Create a non-owning graph handle (references existing graph). -// Use for graphs that are managed by a child graph node or another owner. -// The graph will NOT be destroyed when the handle is released. -GraphHandle create_graph_handle_ref(CUgraph graph); +// Create a non-owning graph handle that keeps h_parent alive. +// Use for graphs owned by a child/conditional node in a parent graph. +// The child graph will NOT be destroyed when this handle is released, +// but h_parent will be prevented from destruction while this handle exists. 
+GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent); // ============================================================================ // Graphics resource handle functions diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index c8b3eff2ef..87909ffb3d 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -905,7 +905,7 @@ cdef class Node: HANDLE_RETURN(cydriver.cuGraphChildGraphNodeGetGraph( new_node, &embedded_graph)) - cdef GraphHandle h_embedded = create_graph_handle_ref(embedded_graph) + cdef GraphHandle h_embedded = create_graph_handle_ref(embedded_graph, self._h_graph) self._succ_cache = None return ChildGraphNode._create_with_params(self._h_graph, new_node, h_embedded) @@ -1475,7 +1475,7 @@ cdef class ChildGraphNode(Node): cdef cydriver.CUgraph child_graph = NULL with nogil: HANDLE_RETURN(cydriver.cuGraphChildGraphNodeGetGraph(node, &child_graph)) - cdef GraphHandle h_child = create_graph_handle_ref(child_graph) + cdef GraphHandle h_child = create_graph_handle_ref(child_graph, h_graph) return ChildGraphNode._create_with_params(h_graph, node, h_child) def __repr__(self): diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 58ab1d9abe..18d5736eac 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -143,7 +143,7 @@ cdef KernelHandle create_kernel_handle_ref( # Graph handles cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil -cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph) except+ nogil +cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil # Graphics resource handles cdef GraphicsResourceHandle create_graphics_resource_handle( diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 
ac0410c49a..100bda8e26 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -131,7 +131,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": GraphHandle create_graph_handle "cuda_core::create_graph_handle" ( cydriver.CUgraph graph) except+ nogil GraphHandle create_graph_handle_ref "cuda_core::create_graph_handle_ref" ( - cydriver.CUgraph graph) except+ nogil + cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil # Graphics resource handles GraphicsResourceHandle create_graphics_resource_handle "cuda_core::create_graphics_resource_handle" ( From 82f0ec70e07f380888661bbed9249c41bb986dfd Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 Mar 2026 13:46:27 -0800 Subject: [PATCH 13/23] Add conditional graph nodes (IfNode, IfElseNode, WhileNode, SwitchNode) Implement conditional node hierarchy with Condition wrapper class, builder methods (if_cond, if_else, while_loop, switch), and branch graph access via non-owning GraphDef handles. Pre-CUDA 13.2 driver reconstruction falls back to ConditionalNode base class. 
Made-with: Cursor --- cuda_core/cuda/core/_graph/_graphdef.pxd | 35 +++ cuda_core/cuda/core/_graph/_graphdef.pyx | 355 ++++++++++++++++++++++- cuda_core/tests/graph/test_explicit.py | 220 +++++++++++++- cuda_core/tests/helpers/misc.py | 14 +- cuda_core/tests/test_object_protocols.py | 94 ++++++ 5 files changed, 714 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index 18bed46808..73cb733d66 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport GraphHandle +cdef class Condition cdef class GraphDef cdef class Node cdef class EmptyNode(Node) @@ -20,6 +21,17 @@ cdef class ChildGraphNode(Node) cdef class EventRecordNode(Node) cdef class EventWaitNode(Node) cdef class HostCallbackNode(Node) +cdef class ConditionalNode(Node) +cdef class IfNode(ConditionalNode) +cdef class IfElseNode(ConditionalNode) +cdef class WhileNode(ConditionalNode) +cdef class SwitchNode(ConditionalNode) + + +cdef class Condition: + cdef: + cydriver.CUgraphConditionalHandle _c_handle + object __weakref__ cdef class GraphDef: @@ -179,3 +191,26 @@ cdef class HostCallbackNode(Node): @staticmethod cdef HostCallbackNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + + +cdef class ConditionalNode(Node): + cdef: + Condition _condition + cydriver.CUgraphConditionalNodeType _cond_type + tuple _branches # tuple of GraphDef (non-owning wrappers) + + +cdef class IfNode(ConditionalNode): + pass + + +cdef class IfElseNode(ConditionalNode): + pass + + +cdef class WhileNode(ConditionalNode): + pass + + +cdef class SwitchNode(ConditionalNode): + pass diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 87909ffb3d..03296d1877 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ 
-20,7 +20,12 @@ Node hierarchy: ├── ChildGraphNode (embedded sub-graph) ├── EventRecordNode (record an event) ├── EventWaitNode (wait for an event) - └── HostCallbackNode (host CPU callback) + ├── HostCallbackNode (host CPU callback) + └── ConditionalNode (conditional execution — base for reconstruction) + ├── IfNode (if-then conditional, 1 branch) + ├── IfElseNode (if-then-else conditional, 2 branches) + ├── WhileNode (while-loop conditional, 1 branch) + └── SwitchNode (switch conditional, N branches) """ from dataclasses import dataclass @@ -95,6 +100,92 @@ cdef void _attach_user_object( HANDLE_RETURN(ret) +cdef class Condition: + """Wraps a CUgraphConditionalHandle. + + Created by :meth:`GraphDef.create_condition` and passed to + conditional-node builder methods (``if_cond``, ``if_else``, + ``while_loop``, ``switch``). The underlying value is set at + runtime by device code via ``cudaGraphSetConditional``. + """ + + def __repr__(self): + return f"self._c_handle:x}>" + + def __eq__(self, other): + if not isinstance(other, Condition): + return NotImplemented + return self._c_handle == (other)._c_handle + + def __hash__(self): + return hash(self._c_handle) + + @property + def handle(self): + """The raw CUgraphConditionalHandle as an int.""" + return self._c_handle + + +cdef ConditionalNode _make_conditional_node( + Node pred, + Condition condition, + cydriver.CUgraphConditionalNodeType cond_type, + unsigned int size, + type node_cls): + if not isinstance(condition, Condition): + raise TypeError( + f"condition must be a Condition object (from " + f"GraphDef.create_condition()), got {type(condition).__name__}") + cdef cydriver.CUgraphNodeParams params + cdef cydriver.CUgraphNode new_node = NULL + cdef vector[cydriver.CUgraph] branch_graphs + branch_graphs.resize(size) + + c_memset(¶ms, 0, sizeof(params)) + params.type = cydriver.CU_GRAPH_NODE_TYPE_CONDITIONAL + params.conditional.handle = condition._c_handle + params.conditional.type = cond_type + 
params.conditional.size = size + params.conditional.phGraph_out = branch_graphs.data() + + cdef cydriver.CUcontext ctx = NULL + cdef cydriver.CUgraph graph = as_cu(pred._h_graph) + cdef cydriver.CUgraphNode* deps = NULL + cdef size_t num_deps = 0 + + if pred._node != NULL: + deps = &pred._node + num_deps = 1 + + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + params.conditional.ctx = ctx + + with nogil: + HANDLE_RETURN(cydriver.cuGraphAddNode( + &new_node, graph, deps, NULL, num_deps, ¶ms)) + + cdef list branch_list = [] + cdef unsigned int i + cdef cydriver.CUgraph bg + cdef GraphHandle h_branch + for i in range(size): + bg = branch_graphs[i] + h_branch = create_graph_handle_ref(bg, pred._h_graph) + branch_list.append(GraphDef._from_handle(h_branch)) + cdef tuple branches = tuple(branch_list) + + cdef ConditionalNode n = node_cls.__new__(node_cls) + n._h_graph = pred._h_graph + n._node = new_node + n._condition = condition + n._cond_type = cond_type + n._branches = branches + + pred._succ_cache = None + return n + + @dataclass class GraphAllocOptions: """Options for graph memory allocation nodes. @@ -250,6 +341,71 @@ cdef class GraphDef: """ return self._entry.callback(fn, user_data=user_data) + def create_condition(self, default_value=None): + """Create a condition variable for use with conditional nodes. + + The returned :class:`Condition` object is passed to conditional-node + builder methods. Its value is controlled at runtime by device code + via ``cudaGraphSetConditional``. + + Parameters + ---------- + default_value : int, optional + The default value to assign to the condition. + If None, no default is assigned. + + Returns + ------- + Condition + A condition variable for controlling conditional execution. 
+ """ + cdef cydriver.CUgraphConditionalHandle c_handle + cdef unsigned int flags = 0 + cdef unsigned int default_val = 0 + + if default_value is not None: + default_val = default_value + flags = cydriver.CU_GRAPH_COND_ASSIGN_DEFAULT + + cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef cydriver.CUcontext ctx = NULL + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + HANDLE_RETURN(cydriver.cuGraphConditionalHandleCreate( + &c_handle, graph, ctx, default_val, flags)) + + cdef Condition cond = Condition.__new__(Condition) + cond._c_handle = c_handle + return cond + + def if_cond(self, condition): + """Add an entry-point if-conditional node (no dependencies). + + See :meth:`Node.if_cond` for full documentation. + """ + return self._entry.if_cond(condition) + + def if_else(self, condition): + """Add an entry-point if-else conditional node (no dependencies). + + See :meth:`Node.if_else` for full documentation. + """ + return self._entry.if_else(condition) + + def while_loop(self, condition): + """Add an entry-point while-loop conditional node (no dependencies). + + See :meth:`Node.while_loop` for full documentation. + """ + return self._entry.while_loop(condition) + + def switch(self, condition, unsigned int count): + """Add an entry-point switch conditional node (no dependencies). + + See :meth:`Node.switch` for full documentation. + """ + return self._entry.switch(condition, count) + def instantiate(self): """Instantiate the graph definition into an executable Graph. @@ -392,6 +548,17 @@ cdef class Node: return EventWaitNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_HOST: return HostCallbackNode._create_from_driver(h_graph, node) + elif node_type == cydriver.CU_GRAPH_NODE_TYPE_CONDITIONAL: + # TODO(CUDA 13.2): Use cuGraphNodeGetParams to reconstruct + # ConditionalNode subtype (IfNode, IfElseNode, WhileNode, SwitchNode). + # Until then, falls through to ConditionalNode base with no params. 
+ n = ConditionalNode.__new__(ConditionalNode) + (n)._h_graph = h_graph + (n)._node = node + (n)._condition = None + (n)._cond_type = cydriver.CU_GRAPH_COND_TYPE_IF + (n)._branches = () + return n else: n = Node.__new__(Node) (n)._h_graph = h_graph @@ -1062,6 +1229,89 @@ cdef class Node: self._h_graph, new_node, callable_obj, node_params.fn, node_params.userData) + def if_cond(self, condition): + """Add an if-conditional node depending on this node. + + The body graph executes only when the condition evaluates to + a non-zero value at runtime. + + Parameters + ---------- + condition : Condition + Condition from :meth:`GraphDef.create_condition`. + + Returns + ------- + IfNode + A new IfNode with one branch accessible via ``.then``. + """ + return _make_conditional_node( + self, condition, + cydriver.CU_GRAPH_COND_TYPE_IF, 1, IfNode) + + def if_else(self, condition): + """Add an if-else conditional node depending on this node. + + Two body graphs: the first executes when the condition is + non-zero, the second when it is zero. + + Parameters + ---------- + condition : Condition + Condition from :meth:`GraphDef.create_condition`. + + Returns + ------- + IfElseNode + A new IfElseNode with branches accessible via + ``.then`` and ``.else_``. + """ + return _make_conditional_node( + self, condition, + cydriver.CU_GRAPH_COND_TYPE_IF, 2, IfElseNode) + + def while_loop(self, condition): + """Add a while-loop conditional node depending on this node. + + The body graph executes repeatedly while the condition + evaluates to a non-zero value. + + Parameters + ---------- + condition : Condition + Condition from :meth:`GraphDef.create_condition`. + + Returns + ------- + WhileNode + A new WhileNode with body accessible via ``.body``. + """ + return _make_conditional_node( + self, condition, + cydriver.CU_GRAPH_COND_TYPE_WHILE, 1, WhileNode) + + def switch(self, condition, unsigned int count): + """Add a switch conditional node depending on this node. 
+ + The condition value selects which branch to execute. If the + value is out of range, no branch executes. + + Parameters + ---------- + condition : Condition + Condition from :meth:`GraphDef.create_condition`. + count : int + Number of switch cases (branches). + + Returns + ------- + SwitchNode + A new SwitchNode with branches accessible via ``.branches``. + """ + return _make_conditional_node( + self, condition, + cydriver.CU_GRAPH_COND_TYPE_SWITCH, count, SwitchNode) + # ============================================================================= # Node subclasses @@ -1612,3 +1862,106 @@ cdef class HostCallbackNode(Node): def callback_fn(self): """The Python callable, or None for ctypes function pointer callbacks.""" return self._callable + + +cdef class ConditionalNode(Node): + """Base class for conditional graph nodes. + + When created via builder methods (if_cond, if_else, while_loop, switch), + a specific subclass (IfNode, IfElseNode, WhileNode, SwitchNode) is + returned. When reconstructed from the driver (pre-CUDA 13.2), this + base class is used as a fallback since the driver does not yet expose + a getter for conditional node parameters. + + Properties + ---------- + condition : Condition + The condition variable controlling execution. + cond_type : str + The conditional type ("if", "while", or "switch"). + branches : tuple of GraphDef + The body graphs for each branch. + """ + + def __repr__(self): + return "" + + @property + def condition(self): + """The condition variable controlling execution.""" + return self._condition + + @property + def cond_type(self): + """The conditional type as a string: 'if', 'while', or 'switch'. + + Returns None when reconstructed from the driver pre-CUDA 13.2, + as the conditional type cannot be determined. 
+ """ + if self._condition is None: + return None + if self._cond_type == cydriver.CU_GRAPH_COND_TYPE_IF: + return "if" + elif self._cond_type == cydriver.CU_GRAPH_COND_TYPE_WHILE: + return "while" + else: + return "switch" + + @property + def branches(self): + """The body graphs for each branch as a tuple of GraphDef. + + Returns an empty tuple when reconstructed from the driver + pre-CUDA 13.2. + """ + return self._branches + + +cdef class IfNode(ConditionalNode): + """An if-conditional node (1 branch, executes when condition is non-zero).""" + + def __repr__(self): + return f"self._condition._c_handle:x}>" + + @property + def then(self): + """The 'then' branch graph.""" + return self._branches[0] + + +cdef class IfElseNode(ConditionalNode): + """An if-else conditional node (2 branches).""" + + def __repr__(self): + return f"self._condition._c_handle:x}>" + + @property + def then(self): + """The 'then' branch graph (executed when condition is non-zero).""" + return self._branches[0] + + @property + def else_(self): + """The 'else' branch graph (executed when condition is zero).""" + return self._branches[1] + + +cdef class WhileNode(ConditionalNode): + """A while-loop conditional node (1 branch, repeats while condition is non-zero).""" + + def __repr__(self): + return f"self._condition._c_handle:x}>" + + @property + def body(self): + """The loop body graph.""" + return self._branches[0] + + +cdef class SwitchNode(ConditionalNode): + """A switch conditional node (N branches, selected by condition value).""" + + def __repr__(self): + cdef Py_ssize_t n = len(self._branches) + return (f"self._condition._c_handle:x}" + f" with {n} {'branch' if n == 1 else 'branches'}>") diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py index d95c1d0dc9..854e2b4f7b 100644 --- a/cuda_core/tests/graph/test_explicit.py +++ b/cuda_core/tests/graph/test_explicit.py @@ -8,12 +8,14 @@ import pytest from helpers.graph_kernels import 
compile_common_kernels +from helpers.misc import try_create_condition from cuda.core import Device, LaunchConfig from cuda.core._graph import GraphDebugPrintOptions from cuda.core._graph._graphdef import ( AllocNode, ChildGraphNode, + ConditionalNode, EmptyNode, EventRecordNode, EventWaitNode, @@ -21,10 +23,14 @@ GraphAllocOptions, GraphDef, HostCallbackNode, + IfElseNode, + IfNode, KernelNode, MemcpyNode, MemsetNode, Node, + SwitchNode, + WhileNode, ) ALLOC_SIZE = 1024 @@ -189,6 +195,12 @@ class NodeSpec: expected_class: type expected_type_name: str builder: Callable[[GraphDef], tuple[Node, dict]] + reconstructed_class: type | None = None + + @property + def roundtrip_class(self): + """Class expected after reconstruction from the driver.""" + return self.reconstructed_class or self.expected_class def _build_empty_node(g): @@ -369,6 +381,50 @@ def _build_child_graph_node(g): } +def _build_if_cond_node(g): + condition = try_create_condition(g) + node = g.if_cond(condition) + return node, { + "condition": condition, + "cond_type": "if", + "branches": lambda v: isinstance(v, tuple) and len(v) == 1, + "then": lambda v: isinstance(v, GraphDef), + } + + +def _build_if_else_node(g): + condition = try_create_condition(g) + node = g.if_else(condition) + return node, { + "condition": condition, + "cond_type": "if", + "branches": lambda v: isinstance(v, tuple) and len(v) == 2, + "then": lambda v: isinstance(v, GraphDef), + "else_": lambda v: isinstance(v, GraphDef), + } + + +def _build_while_loop_node(g): + condition = try_create_condition(g) + node = g.while_loop(condition) + return node, { + "condition": condition, + "cond_type": "while", + "branches": lambda v: isinstance(v, tuple) and len(v) == 1, + "body": lambda v: isinstance(v, GraphDef), + } + + +def _build_switch_node(g): + condition = try_create_condition(g) + node = g.switch(condition, 3) + return node, { + "condition": condition, + "cond_type": "switch", + "branches": lambda v: isinstance(v, tuple) and len(v) == 
3, + } + + _NODE_SPECS = [ pytest.param(NodeSpec("empty", EmptyNode, "CU_GRAPH_NODE_TYPE_EMPTY", _build_empty_node), id="empty"), pytest.param(NodeSpec("kernel", KernelNode, "CU_GRAPH_NODE_TYPE_KERNEL", _build_kernel_node), id="kernel"), @@ -410,6 +466,46 @@ def _build_child_graph_node(g): NodeSpec("event_wait", EventWaitNode, "CU_GRAPH_NODE_TYPE_WAIT_EVENT", _build_event_wait_node), id="event_wait", ), + pytest.param( + NodeSpec( + "if_cond", + IfNode, + "CU_GRAPH_NODE_TYPE_CONDITIONAL", + _build_if_cond_node, + reconstructed_class=ConditionalNode, + ), + id="if_cond", + ), + pytest.param( + NodeSpec( + "if_else", + IfElseNode, + "CU_GRAPH_NODE_TYPE_CONDITIONAL", + _build_if_else_node, + reconstructed_class=ConditionalNode, + ), + id="if_else", + ), + pytest.param( + NodeSpec( + "while_loop", + WhileNode, + "CU_GRAPH_NODE_TYPE_CONDITIONAL", + _build_while_loop_node, + reconstructed_class=ConditionalNode, + ), + id="while_loop", + ), + pytest.param( + NodeSpec( + "switch", + SwitchNode, + "CU_GRAPH_NODE_TYPE_CONDITIONAL", + _build_switch_node, + reconstructed_class=ConditionalNode, + ), + id="switch", + ), ] @@ -513,7 +609,7 @@ def test_node_type_preserved_by_nodes(node_spec): all_nodes = g.nodes() matched = [n for n in all_nodes if n == node] assert len(matched) == 1 - assert isinstance(matched[0], spec.expected_class) + assert isinstance(matched[0], spec.roundtrip_class) def test_node_type_preserved_by_pred_succ(node_spec): @@ -522,7 +618,7 @@ def test_node_type_preserved_by_pred_succ(node_spec): for predecessor in node.pred: matched = [s for s in predecessor.succ if s == node] assert len(matched) == 1 - assert isinstance(matched[0], spec.expected_class) + assert isinstance(matched[0], spec.roundtrip_class) def test_node_attrs(node_spec): @@ -543,6 +639,8 @@ def test_node_attrs_preserved_by_nodes(node_spec): spec, g, node, expected_attrs = node_spec if not expected_attrs: pytest.skip("no type-specific attributes") + if spec.reconstructed_class is not None: + 
pytest.skip("reconstructed type differs — attrs not preserved") retrieved = next(n for n in g.nodes() if n == node) for attr in expected_attrs: assert getattr(retrieved, attr) == getattr(node, attr), f"{spec.name}.{attr} not preserved by nodes()" @@ -856,6 +954,124 @@ def test_instantiate_and_execute_event_record_wait(sample_graphdef): stream.sync() +# ============================================================================= +# Conditional nodes +# ============================================================================= + + +def _skip_unless_cc_90(): + if Device(0).compute_capability < (9, 0): + pytest.skip("Conditional node execution requires CC >= 9.0 (Hopper)") + + +def test_instantiate_and_execute_if_cond(sample_graphdef): + """If-conditional node: body executes only when condition is non-zero.""" + _skip_unless_cc_90() + import ctypes + + from helpers.graph_kernels import compile_conditional_kernels + + condition = sample_graphdef.create_condition(default_value=0) + mod = compile_conditional_kernels(int) + set_handle = mod.get_kernel("set_handle") + add_one = mod.get_kernel("add_one") + + alloc = sample_graphdef.alloc(ctypes.sizeof(ctypes.c_int)) + ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) + setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition.handle, 1) + if_node = setter.if_cond(condition) + if_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) + + graph = sample_graphdef.instantiate() + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + result = (ctypes.c_int * 1)() + from cuda.bindings import driver as drv + + drv.cuMemcpyDtoH(result, alloc.dptr, ctypes.sizeof(ctypes.c_int)) + assert result[0] == 1 + + +def test_instantiate_and_execute_if_else(sample_graphdef): + """If-else node: then or else branch executes based on condition.""" + _skip_unless_cc_90() + import ctypes + + from helpers.graph_kernels import compile_conditional_kernels + + 
condition = sample_graphdef.create_condition(default_value=0) + mod = compile_conditional_kernels(int) + set_handle = mod.get_kernel("set_handle") + add_one = mod.get_kernel("add_one") + + alloc = sample_graphdef.alloc(ctypes.sizeof(ctypes.c_int)) + ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) + setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition.handle, 0) + ie_node = setter.if_else(condition) + ie_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) + n1 = ie_node.else_.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) + n1.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) + + graph = sample_graphdef.instantiate() + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + result = (ctypes.c_int * 1)() + from cuda.bindings import driver as drv + + drv.cuMemcpyDtoH(result, alloc.dptr, ctypes.sizeof(ctypes.c_int)) + assert result[0] == 2 + + +def test_instantiate_and_execute_switch(sample_graphdef): + """Switch node: selected branch executes based on condition value.""" + _skip_unless_cc_90() + import ctypes + + from helpers.graph_kernels import compile_conditional_kernels + + condition = sample_graphdef.create_condition(default_value=0) + mod = compile_conditional_kernels(int) + set_handle = mod.get_kernel("set_handle") + add_one = mod.get_kernel("add_one") + + alloc = sample_graphdef.alloc(ctypes.sizeof(ctypes.c_int)) + ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) + setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition.handle, 2) + sw_node = setter.switch(condition, 4) + for branch in sw_node.branches: + branch.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) + + graph = sample_graphdef.instantiate() + stream = Device().create_stream() + graph.upload(stream) + graph.launch(stream) + stream.sync() + + result = (ctypes.c_int * 1)() + from cuda.bindings import driver as drv + + drv.cuMemcpyDtoH(result, 
alloc.dptr, ctypes.sizeof(ctypes.c_int)) + assert result[0] == 1 + + +def test_conditional_node_type_preserved_by_nodes(sample_graphdef): + """Conditional nodes appear as ConditionalNode base when read back from graph.""" + condition = try_create_condition(sample_graphdef) + if_node = sample_graphdef.if_cond(condition) + assert isinstance(if_node, IfNode) + + all_nodes = sample_graphdef.nodes() + matched = [n for n in all_nodes if n == if_node] + assert len(matched) == 1 + assert isinstance(matched[0], ConditionalNode) + + # ============================================================================= # Debug output # ============================================================================= diff --git a/cuda_core/tests/helpers/misc.py b/cuda_core/tests/helpers/misc.py index aa5757c4ce..6b83c751ab 100644 --- a/cuda_core/tests/helpers/misc.py +++ b/cuda_core/tests/helpers/misc.py @@ -1,6 +1,18 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +import pytest + + +def try_create_condition(g, default_value=1): + """Create a Condition on graph *g*, skipping the test if unsupported.""" + from cuda.core._utils.cuda_utils import CUDAError + + try: + return g.create_condition(default_value=default_value) + except CUDAError: + pytest.skip("Conditional nodes not supported (requires CC >= 9.0)") + class StreamWrapper: """ diff --git a/cuda_core/tests/test_object_protocols.py b/cuda_core/tests/test_object_protocols.py index b51252e991..82a7cff1d4 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -13,6 +13,7 @@ import pytest from helpers.graph_kernels import compile_common_kernels +from helpers.misc import try_create_condition from cuda.core import Buffer, Device, Kernel, LaunchConfig, Program, Stream, system from cuda.core._graph._graphdef import GraphDef @@ -392,6 +393,74 @@ def other_callback(): return sample_graphdef.callback(other_callback) +@pytest.fixture +def sample_condition(sample_graphdef): + """A Condition object.""" + return try_create_condition(sample_graphdef) + + +@pytest.fixture +def sample_condition_alt(sample_graphdef): + """An alternate Condition from same graph.""" + return try_create_condition(sample_graphdef) + + +@pytest.fixture +def sample_if_node(sample_graphdef): + """An IfNode.""" + condition = try_create_condition(sample_graphdef) + return sample_graphdef.if_cond(condition) + + +@pytest.fixture +def sample_if_node_alt(sample_graphdef): + """An alternate IfNode from same graph.""" + condition = try_create_condition(sample_graphdef) + return sample_graphdef.if_cond(condition) + + +@pytest.fixture +def sample_if_else_node(sample_graphdef): + """An IfElseNode.""" + condition = try_create_condition(sample_graphdef) + return sample_graphdef.if_else(condition) + + +@pytest.fixture +def sample_if_else_node_alt(sample_graphdef): + """An alternate IfElseNode from same graph.""" + condition = 
try_create_condition(sample_graphdef) + return sample_graphdef.if_else(condition) + + +@pytest.fixture +def sample_while_node(sample_graphdef): + """A WhileNode.""" + condition = try_create_condition(sample_graphdef) + return sample_graphdef.while_loop(condition) + + +@pytest.fixture +def sample_while_node_alt(sample_graphdef): + """An alternate WhileNode from same graph.""" + condition = try_create_condition(sample_graphdef) + return sample_graphdef.while_loop(condition) + + +@pytest.fixture +def sample_switch_node(sample_graphdef): + """A SwitchNode.""" + condition = try_create_condition(sample_graphdef) + return sample_graphdef.switch(condition, 3) + + +@pytest.fixture +def sample_switch_node_alt(sample_graphdef): + """An alternate SwitchNode from same graph.""" + condition = try_create_condition(sample_graphdef) + return sample_graphdef.switch(condition, 3) + + # ============================================================================= # Type groupings # ============================================================================= @@ -407,6 +476,7 @@ def other_callback(): "sample_object_code_cubin", "sample_kernel", "sample_graphdef", + "sample_condition", "sample_root_node", "sample_empty_node", "sample_alloc_node", @@ -418,6 +488,10 @@ def other_callback(): "sample_event_record_node", "sample_event_wait_node", "sample_host_callback_node", + "sample_if_node", + "sample_if_else_node", + "sample_while_node", + "sample_switch_node", ] # Types with __eq__ support @@ -431,6 +505,7 @@ def other_callback(): "sample_object_code_cubin", "sample_kernel", "sample_graphdef", + "sample_condition", "sample_root_node", "sample_empty_node", "sample_alloc_node", @@ -442,6 +517,10 @@ def other_callback(): "sample_event_record_node", "sample_event_wait_node", "sample_host_callback_node", + "sample_if_node", + "sample_if_else_node", + "sample_while_node", + "sample_switch_node", ] # Types with __weakref__ support @@ -450,6 +529,7 @@ def other_callback(): "sample_stream", 
"sample_event", "sample_context", + "sample_condition", "sample_buffer", "sample_launch_config", "sample_object_code_cubin", @@ -467,6 +547,10 @@ def other_callback(): "sample_event_record_node", "sample_event_wait_node", "sample_host_callback_node", + "sample_if_node", + "sample_if_else_node", + "sample_while_node", + "sample_switch_node", ] # Pairs of distinct objects of the same type (for inequality testing) @@ -481,6 +565,7 @@ def other_callback(): ("sample_object_code_cubin", "sample_object_code_alt"), ("sample_kernel", "sample_kernel_alt"), ("sample_graphdef", "sample_graphdef_alt"), + ("sample_condition", "sample_condition_alt"), ("sample_root_node", "sample_root_node_alt"), ("sample_empty_node", "sample_empty_node_alt"), ("sample_alloc_node", "sample_alloc_node_alt"), @@ -492,6 +577,10 @@ def other_callback(): ("sample_event_record_node", "sample_event_record_node_alt"), ("sample_event_wait_node", "sample_event_wait_node_alt"), ("sample_host_callback_node", "sample_host_callback_node_alt"), + ("sample_if_node", "sample_if_node_alt"), + ("sample_if_else_node", "sample_if_else_node_alt"), + ("sample_while_node", "sample_while_node_alt"), + ("sample_switch_node", "sample_switch_node_alt"), ] # Types with public from_handle methods and how to create a copy @@ -529,6 +618,7 @@ def other_callback(): ("sample_program_nvvm", r""), # Graph types ("sample_graphdef", r""), + ("sample_condition", r""), ("sample_root_node", r""), ("sample_empty_node", r""), ("sample_alloc_node", r""), @@ -540,6 +630,10 @@ def other_callback(): ("sample_event_record_node", r""), ("sample_event_wait_node", r""), ("sample_host_callback_node", r""), + ("sample_if_node", r""), + ("sample_if_else_node", r""), + ("sample_while_node", r""), + ("sample_switch_node", r""), ] From 5cf85c4e49d7c0d29cd59e047971768917d4b112 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 Mar 2026 15:31:18 -0800 Subject: [PATCH 14/23] Reconstruct conditional node subtypes on CUDA 13.2+ drivers Use 
cuGraphNodeGetParams (Python driver API) to recover the exact ConditionalNode subclass (IfNode, IfElseNode, WhileNode, SwitchNode) when reconstructing from the driver. Falls back to the generic ConditionalNode base on pre-13.2 drivers. Made-with: Cursor --- cuda_core/cuda/core/_graph/_graphdef.pxd | 3 + cuda_core/cuda/core/_graph/_graphdef.pyx | 93 +++++++++++++++++++----- cuda_core/tests/graph/test_explicit.py | 19 +++-- 3 files changed, 91 insertions(+), 24 deletions(-) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index 73cb733d66..bf82c27a7c 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -199,6 +199,9 @@ cdef class ConditionalNode(Node): cydriver.CUgraphConditionalNodeType _cond_type tuple _branches # tuple of GraphDef (non-owning wrappers) + @staticmethod + cdef ConditionalNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef class IfNode(ConditionalNode): pass diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 03296d1877..d2888af2d8 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -59,7 +59,19 @@ from cuda.core._launch_config cimport LaunchConfig from cuda.core._kernel_arg_handler cimport ParamHolder from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value -from cuda.core._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver, handle_return + + +cdef bint _has_cuGraphNodeGetParams = False +cdef bint _version_checked = False + +cdef bint _check_node_get_params(): + global _has_cuGraphNodeGetParams, _version_checked + if not _version_checked: + ver = handle_return(driver.cuDriverGetVersion()) + _has_cuGraphNodeGetParams = ver >= 13020 + _version_checked = True + return _has_cuGraphNodeGetParams cdef extern from "Python.h": @@ -549,16 +561,7 @@ cdef class Node: elif node_type == 
cydriver.CU_GRAPH_NODE_TYPE_HOST: return HostCallbackNode._create_from_driver(h_graph, node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_CONDITIONAL: - # TODO(CUDA 13.2): Use cuGraphNodeGetParams to reconstruct - # ConditionalNode subtype (IfNode, IfElseNode, WhileNode, SwitchNode). - # Until then, falls through to ConditionalNode base with no params. - n = ConditionalNode.__new__(ConditionalNode) - (n)._h_graph = h_graph - (n)._node = node - (n)._condition = None - (n)._cond_type = cydriver.CU_GRAPH_COND_TYPE_IF - (n)._branches = () - return n + return ConditionalNode._create_from_driver(h_graph, node) else: n = Node.__new__(Node) (n)._h_graph = h_graph @@ -1869,20 +1872,72 @@ cdef class ConditionalNode(Node): When created via builder methods (if_cond, if_else, while_loop, switch), a specific subclass (IfNode, IfElseNode, WhileNode, SwitchNode) is - returned. When reconstructed from the driver (pre-CUDA 13.2), this - base class is used as a fallback since the driver does not yet expose - a getter for conditional node parameters. + returned. When reconstructed from the driver on CUDA 13.2+, the + correct subclass is determined via cuGraphNodeGetParams. On older + drivers, this base class is used as a fallback. Properties ---------- - condition : Condition - The condition variable controlling execution. - cond_type : str - The conditional type ("if", "while", or "switch"). + condition : Condition or None + The condition variable controlling execution (None pre-13.2). + cond_type : str or None + The conditional type ("if", "while", or "switch"; None pre-13.2). branches : tuple of GraphDef - The body graphs for each branch. + The body graphs for each branch (empty pre-13.2). 
""" + @staticmethod + cdef ConditionalNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef ConditionalNode n + if not _check_node_get_params(): + n = ConditionalNode.__new__(ConditionalNode) + n._h_graph = h_graph + n._node = node + n._condition = None + n._cond_type = cydriver.CU_GRAPH_COND_TYPE_IF + n._branches = () + return n + + params = handle_return(driver.cuGraphNodeGetParams( + node)) + cond_params = params.conditional + cdef int cond_type_int = int(cond_params.type) + cdef unsigned int size = int(cond_params.size) + + cdef Condition condition = Condition.__new__(Condition) + condition._c_handle = ( + int(cond_params.handle)) + + cdef list branch_list = [] + cdef unsigned int i + cdef GraphHandle h_branch + if cond_params.phGraph_out is not None: + for i in range(size): + h_branch = create_graph_handle_ref( + int(cond_params.phGraph_out[i]), + h_graph) + branch_list.append(GraphDef._from_handle(h_branch)) + cdef tuple branches = tuple(branch_list) + + cdef type cls + if cond_type_int == cydriver.CU_GRAPH_COND_TYPE_IF: + if size == 1: + cls = IfNode + else: + cls = IfElseNode + elif cond_type_int == cydriver.CU_GRAPH_COND_TYPE_WHILE: + cls = WhileNode + else: + cls = SwitchNode + + n = cls.__new__(cls) + n._h_graph = h_graph + n._node = node + n._condition = condition + n._cond_type = cond_type_int + n._branches = branches + return n + def __repr__(self): return "" diff --git a/cuda_core/tests/graph/test_explicit.py b/cuda_core/tests/graph/test_explicit.py index 854e2b4f7b..b9a16974b9 100644 --- a/cuda_core/tests/graph/test_explicit.py +++ b/cuda_core/tests/graph/test_explicit.py @@ -36,6 +36,15 @@ ALLOC_SIZE = 1024 +def _driver_has_node_get_params(): + from cuda.bindings import driver as drv + + return drv.cuDriverGetVersion()[1] >= 13020 + + +_HAS_NODE_GET_PARAMS = _driver_has_node_get_params() + + # ============================================================================= # GraphSpec — representative graph topologies # 
============================================================================= @@ -472,7 +481,7 @@ def _build_switch_node(g): IfNode, "CU_GRAPH_NODE_TYPE_CONDITIONAL", _build_if_cond_node, - reconstructed_class=ConditionalNode, + reconstructed_class=IfNode if _HAS_NODE_GET_PARAMS else ConditionalNode, ), id="if_cond", ), @@ -482,7 +491,7 @@ def _build_switch_node(g): IfElseNode, "CU_GRAPH_NODE_TYPE_CONDITIONAL", _build_if_else_node, - reconstructed_class=ConditionalNode, + reconstructed_class=IfElseNode if _HAS_NODE_GET_PARAMS else ConditionalNode, ), id="if_else", ), @@ -492,7 +501,7 @@ def _build_switch_node(g): WhileNode, "CU_GRAPH_NODE_TYPE_CONDITIONAL", _build_while_loop_node, - reconstructed_class=ConditionalNode, + reconstructed_class=WhileNode if _HAS_NODE_GET_PARAMS else ConditionalNode, ), id="while_loop", ), @@ -502,7 +511,7 @@ def _build_switch_node(g): SwitchNode, "CU_GRAPH_NODE_TYPE_CONDITIONAL", _build_switch_node, - reconstructed_class=ConditionalNode, + reconstructed_class=SwitchNode if _HAS_NODE_GET_PARAMS else ConditionalNode, ), id="switch", ), @@ -639,7 +648,7 @@ def test_node_attrs_preserved_by_nodes(node_spec): spec, g, node, expected_attrs = node_spec if not expected_attrs: pytest.skip("no type-specific attributes") - if spec.reconstructed_class is not None: + if spec.roundtrip_class != spec.expected_class: pytest.skip("reconstructed type differs — attrs not preserved") retrieved = next(n for n in g.nodes() if n == node) for attr in expected_attrs: From d993f9c3f05a2c30bde70cf8d790a150b37e825c Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 Mar 2026 15:54:05 -0800 Subject: [PATCH 15/23] Apply developer guide styling to _graphdef.pyx Add __all__, from __future__ import annotations (replacing TYPE_CHECKING), return type annotations on all public methods and properties, and reorder imports per the 5-group convention. 
Made-with: Cursor --- cuda_core/cuda/core/_graph/_graphdef.pyx | 231 ++++++++++++----------- 1 file changed, 126 insertions(+), 105 deletions(-) diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index d2888af2d8..473be558f2 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -28,11 +28,7 @@ Node hierarchy: └── SwitchNode (switch conditional, N branches) """ -from dataclasses import dataclass -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from cuda.core import Device +from __future__ import annotations from cpython.ref cimport Py_INCREF @@ -45,22 +41,47 @@ from libcpp.vector cimport vector from cuda.bindings cimport cydriver +from cuda.core._event cimport Event +from cuda.core._kernel_arg_handler cimport ParamHolder +from cuda.core._launch_config cimport LaunchConfig +from cuda.core._module cimport Kernel from cuda.core._resource_handles cimport ( GraphHandle, - create_graph_handle, - create_graph_handle_ref, as_cu, as_intptr, as_py, + create_graph_handle, + create_graph_handle_ref, ) -from cuda.core._event cimport Event -from cuda.core._module cimport Kernel -from cuda.core._launch_config cimport LaunchConfig -from cuda.core._kernel_arg_handler cimport ParamHolder from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value +from dataclasses import dataclass + +from cuda.core import Device from cuda.core._utils.cuda_utils import driver, handle_return +__all__ = [ + "Condition", + "GraphAllocOptions", + "GraphDef", + "Node", + "EmptyNode", + "KernelNode", + "AllocNode", + "FreeNode", + "MemsetNode", + "MemcpyNode", + "ChildGraphNode", + "EventRecordNode", + "EventWaitNode", + "HostCallbackNode", + "ConditionalNode", + "IfNode", + "IfElseNode", + "WhileNode", + "SwitchNode", +] + cdef bint _has_cuGraphNodeGetParams = False cdef bint _version_checked = False @@ -121,19 +142,19 @@ cdef class Condition: runtime by device code via 
``cudaGraphSetConditional``. """ - def __repr__(self): + def __repr__(self) -> str: return f"self._c_handle:x}>" - def __eq__(self, other): + def __eq__(self, other) -> bool: if not isinstance(other, Condition): return NotImplemented return self._c_handle == (other)._c_handle - def __hash__(self): + def __hash__(self) -> int: return hash(self._c_handle) @property - def handle(self): + def handle(self) -> int: """The raw CUgraphConditionalHandle as an int.""" return self._c_handle @@ -256,54 +277,54 @@ cdef class GraphDef: g._h_graph = h_graph return g - def __repr__(self): + def __repr__(self) -> str: return f"" - def __eq__(self, other): + def __eq__(self, other) -> bool: if not isinstance(other, GraphDef): return NotImplemented return as_intptr(self._h_graph) == as_intptr((other)._h_graph) - def __hash__(self): + def __hash__(self) -> int: return hash(as_intptr(self._h_graph)) @property - def _entry(self): + def _entry(self) -> Node: """Return the internal entry-point Node (no dependencies).""" cdef Node n = Node.__new__(Node) n._h_graph = self._h_graph n._node = NULL return n - def alloc(self, size_t size, options: GraphAllocOptions | None = None): + def alloc(self, size_t size, options: GraphAllocOptions | None = None) -> AllocNode: """Add an entry-point memory allocation node (no dependencies). See :meth:`Node.alloc` for full documentation. """ return self._entry.alloc(size, options) - def free(self, dptr): + def free(self, dptr) -> FreeNode: """Add an entry-point memory free node (no dependencies). See :meth:`Node.free` for full documentation. """ return self._entry.free(dptr) - def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0): + def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: """Add an entry-point memset node (no dependencies). See :meth:`Node.memset` for full documentation. 
""" return self._entry.memset(dst, value, width, height, pitch) - def launch(self, config, kernel, *args): + def launch(self, config, kernel, *args) -> KernelNode: """Add an entry-point kernel launch node (no dependencies). See :meth:`Node.launch` for full documentation. """ return self._entry.launch(config, kernel, *args) - def join(self, *nodes): + def join(self, *nodes) -> EmptyNode: """Create an empty node that depends on all given nodes. Parameters @@ -318,42 +339,42 @@ cdef class GraphDef: """ return self._entry.join(*nodes) - def memcpy(self, dst, src, size_t size): + def memcpy(self, dst, src, size_t size) -> MemcpyNode: """Add an entry-point memcpy node (no dependencies). See :meth:`Node.memcpy` for full documentation. """ return self._entry.memcpy(dst, src, size) - def embed(self, child): + def embed(self, child: GraphDef) -> ChildGraphNode: """Add an entry-point child graph node (no dependencies). See :meth:`Node.embed` for full documentation. """ return self._entry.embed(child) - def record_event(self, event): + def record_event(self, event: Event) -> EventRecordNode: """Add an entry-point event record node (no dependencies). See :meth:`Node.record_event` for full documentation. """ return self._entry.record_event(event) - def wait_event(self, event): + def wait_event(self, event: Event) -> EventWaitNode: """Add an entry-point event wait node (no dependencies). See :meth:`Node.wait_event` for full documentation. """ return self._entry.wait_event(event) - def callback(self, fn, *, user_data=None): + def callback(self, fn, *, user_data=None) -> HostCallbackNode: """Add an entry-point host callback node (no dependencies). See :meth:`Node.callback` for full documentation. """ return self._entry.callback(fn, user_data=user_data) - def create_condition(self, default_value=None): + def create_condition(self, default_value: int | None = None) -> Condition: """Create a condition variable for use with conditional nodes. 
The returned :class:`Condition` object is passed to conditional-node @@ -390,28 +411,28 @@ cdef class GraphDef: cond._c_handle = c_handle return cond - def if_cond(self, condition): + def if_cond(self, condition: Condition) -> IfNode: """Add an entry-point if-conditional node (no dependencies). See :meth:`Node.if_cond` for full documentation. """ return self._entry.if_cond(condition) - def if_else(self, condition): + def if_else(self, condition: Condition) -> IfElseNode: """Add an entry-point if-else conditional node (no dependencies). See :meth:`Node.if_else` for full documentation. """ return self._entry.if_else(condition) - def while_loop(self, condition): + def while_loop(self, condition: Condition) -> WhileNode: """Add an entry-point while-loop conditional node (no dependencies). See :meth:`Node.while_loop` for full documentation. """ return self._entry.while_loop(condition) - def switch(self, condition, unsigned int count): + def switch(self, condition: Condition, unsigned int count) -> SwitchNode: """Add an entry-point switch conditional node (no dependencies). See :meth:`Node.switch` for full documentation. @@ -433,7 +454,7 @@ cdef class GraphDef: driver.CUgraph(as_intptr(self._h_graph)), 0)) return Graph._init(graph_exec) - def debug_dot_print(self, path: str, options=None): + def debug_dot_print(self, path: str, options=None) -> None: """Write a GraphViz DOT representation of the graph to a file. Parameters @@ -457,7 +478,7 @@ cdef class GraphDef: with nogil: HANDLE_RETURN(cydriver.cuGraphDebugDotPrint(graph, c_path, flags)) - def nodes(self): + def nodes(self) -> tuple: """Return all nodes in the graph. Returns @@ -481,7 +502,7 @@ cdef class GraphDef: return tuple(Node._create(self._h_graph, nodes_vec[i]) for i in range(num_nodes)) - def edges(self): + def edges(self) -> tuple: """Return all edges in the graph as (from_node, to_node) pairs. 
Returns @@ -514,7 +535,7 @@ cdef class GraphDef: ) @property - def handle(self): + def handle(self) -> int: """Return the underlying CUgraph handle.""" return as_py(self._h_graph) @@ -568,19 +589,19 @@ cdef class Node: (n)._node = node return n - def __repr__(self): + def __repr__(self) -> str: if self._node == NULL: return "" return f"self._node:x}>" - def __eq__(self, other): + def __eq__(self, other) -> bool: if not isinstance(other, Node): return NotImplemented cdef Node o = other return (as_intptr(self._h_graph) == as_intptr(o._h_graph) and self._node == o._node) - def __hash__(self): + def __hash__(self) -> int: return hash((as_intptr(self._h_graph), self._node)) @property @@ -600,12 +621,12 @@ cdef class Node: return driver.CUgraphNodeType(node_type) @property - def graph(self): + def graph(self) -> GraphDef: """Return the GraphDef this node belongs to.""" return GraphDef._from_handle(self._h_graph) @property - def handle(self): + def handle(self) -> int | None: """Return the underlying CUgraphNode handle as an int. Returns None for the entry node. @@ -615,7 +636,7 @@ cdef class Node: return self._node @property - def pred(self): + def pred(self) -> tuple: """Return the predecessor nodes (dependencies) of this node. Results are cached since a node's dependencies are immutable @@ -652,7 +673,7 @@ cdef class Node: return self._pred_cache @property - def succ(self): + def succ(self) -> tuple: """Return the successor nodes (dependents) of this node. Results are cached and automatically invalidated when new @@ -688,7 +709,7 @@ cdef class Node: self._succ_cache = tuple(Node._create(self._h_graph, deps[i]) for i in range(num_deps)) return self._succ_cache - def launch(self, config, kernel, *args): + def launch(self, config: LaunchConfig, kernel: Kernel, *args) -> KernelNode: """Add a kernel launch node depending on this node. 
Parameters @@ -741,7 +762,7 @@ cdef class Node: conf.grid, conf.block, conf.shmem_size, node_params.kern) - def join(self, *nodes): + def join(self, *nodes: Node) -> EmptyNode: """Create an empty node that depends on this node and all given nodes. This is used to synchronize multiple branches of execution. @@ -781,7 +802,7 @@ cdef class Node: (other)._succ_cache = None return EmptyNode._create_impl(self._h_graph, new_node) - def alloc(self, size_t size, options: GraphAllocOptions | None = None): + def alloc(self, size_t size, options: GraphAllocOptions | None = None) -> AllocNode: """Add a memory allocation node depending on this node. Parameters @@ -869,7 +890,7 @@ cdef class Node: self._h_graph, new_node, alloc_params.dptr, size, device_id, memory_type, tuple(peer_ids)) - def free(self, dptr): + def free(self, dptr: int) -> FreeNode: """Add a memory free node depending on this node. Parameters @@ -898,7 +919,7 @@ cdef class Node: self._succ_cache = None return FreeNode._create_with_params(self._h_graph, new_node, c_dptr) - def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0): + def memset(self, dst: int, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: """Add a memset node depending on this node. Parameters @@ -957,7 +978,7 @@ cdef class Node: self._h_graph, new_node, c_dst, val, elem_size, width, height, pitch) - def memcpy(self, dst, src, size_t size): + def memcpy(self, dst: int, src: int, size_t size) -> MemcpyNode: """Add a memcpy node depending on this node. Copies ``size`` bytes from ``src`` to ``dst``. Memory types are @@ -1038,7 +1059,7 @@ cdef class Node: self._h_graph, new_node, c_dst, c_src, size, c_dst_type, c_src_type) - def embed(self, child): + def embed(self, child: GraphDef) -> ChildGraphNode: """Add a child graph node depending on this node. Embeds a clone of the given graph definition as a sub-graph node. 
@@ -1080,7 +1101,7 @@ cdef class Node: self._succ_cache = None return ChildGraphNode._create_with_params(self._h_graph, new_node, h_embedded) - def record_event(self, event): + def record_event(self, event: Event) -> EventRecordNode: """Add an event record node depending on this node. Parameters @@ -1111,7 +1132,7 @@ cdef class Node: self._succ_cache = None return EventRecordNode._create_with_params(self._h_graph, new_node, c_event) - def wait_event(self, event): + def wait_event(self, event: Event) -> EventWaitNode: """Add an event wait node depending on this node. Parameters @@ -1142,7 +1163,7 @@ cdef class Node: self._succ_cache = None return EventWaitNode._create_with_params(self._h_graph, new_node, c_event) - def callback(self, fn, *, user_data=None): + def callback(self, fn, *, user_data=None) -> HostCallbackNode: """Add a host callback node depending on this node. The callback runs on the host CPU when the graph reaches this node. @@ -1232,7 +1253,7 @@ cdef class Node: self._h_graph, new_node, callable_obj, node_params.fn, node_params.userData) - def if_cond(self, condition): + def if_cond(self, condition: Condition) -> IfNode: """Add an if-conditional node depending on this node. The body graph executes only when the condition evaluates to @@ -1252,7 +1273,7 @@ cdef class Node: self, condition, cydriver.CU_GRAPH_COND_TYPE_IF, 1, IfNode) - def if_else(self, condition): + def if_else(self, condition: Condition) -> IfElseNode: """Add an if-else conditional node depending on this node. Two body graphs: the first executes when the condition is @@ -1273,7 +1294,7 @@ cdef class Node: self, condition, cydriver.CU_GRAPH_COND_TYPE_IF, 2, IfElseNode) - def while_loop(self, condition): + def while_loop(self, condition: Condition) -> WhileNode: """Add a while-loop conditional node depending on this node. 
The body graph executes repeatedly while the condition @@ -1293,7 +1314,7 @@ cdef class Node: self, condition, cydriver.CU_GRAPH_COND_TYPE_WHILE, 1, WhileNode) - def switch(self, condition, unsigned int count): + def switch(self, condition: Condition, unsigned int count) -> SwitchNode: """Add a switch conditional node depending on this node. The condition value selects which branch to execute. If the @@ -1331,7 +1352,7 @@ cdef class EmptyNode(Node): n._node = node return n - def __repr__(self): + def __repr__(self) -> str: cdef Py_ssize_t n = len(self.pred) return f"" @@ -1380,31 +1401,31 @@ cdef class KernelNode(Node): params.sharedMemBytes, params.kern) - def __repr__(self): + def __repr__(self) -> str: return (f"") @property - def grid(self): + def grid(self) -> tuple: """Grid dimensions as a 3-tuple (gridDimX, gridDimY, gridDimZ).""" return self._grid @property - def block(self): + def block(self) -> tuple: """Block dimensions as a 3-tuple (blockDimX, blockDimY, blockDimZ).""" return self._block @property - def shmem_size(self): + def shmem_size(self) -> int: """Dynamic shared memory size in bytes.""" return self._shmem_size @property - def kernel(self): + def kernel(self) -> Kernel: """The Kernel object for this launch node.""" return Kernel.from_handle(self._kern) @property - def config(self): + def config(self) -> LaunchConfig: """A LaunchConfig reconstructed from this node's grid, block, and shmem_size. 
Note: cluster dimensions and cooperative_launch are not preserved @@ -1475,36 +1496,36 @@ cdef class AllocNode(Node): h_graph, node, params.dptr, params.bytesize, params.poolProps.location.id, memory_type, tuple(peer_ids)) - def __repr__(self): + def __repr__(self) -> str: return f"" @property - def dptr(self): + def dptr(self) -> int: """The device pointer for the allocation.""" return self._dptr @property - def bytesize(self): + def bytesize(self) -> int: """The number of bytes allocated.""" return self._bytesize @property - def device_id(self): + def device_id(self) -> int: """The device on which the allocation was made.""" return self._device_id @property - def memory_type(self): + def memory_type(self) -> str: """The type of memory: ``"device"``, ``"host"``, or ``"managed"``.""" return self._memory_type @property - def peer_access(self): + def peer_access(self) -> tuple: """Device IDs with read-write access to this allocation.""" return self._peer_access @property - def options(self): + def options(self) -> GraphAllocOptions: """A GraphAllocOptions reconstructed from this node's parameters.""" return GraphAllocOptions( device=self._device_id, @@ -1540,11 +1561,11 @@ cdef class FreeNode(Node): HANDLE_RETURN(cydriver.cuGraphMemFreeNodeGetParams(node, &dptr)) return FreeNode._create_with_params(h_graph, node, dptr) - def __repr__(self): + def __repr__(self) -> str: return f"" @property - def dptr(self): + def dptr(self) -> int: """The device pointer being freed.""" return self._dptr @@ -1595,37 +1616,37 @@ cdef class MemsetNode(Node): h_graph, node, params.dst, params.value, params.elementSize, params.width, params.height, params.pitch) - def __repr__(self): + def __repr__(self) -> str: return (f"") @property - def dptr(self): + def dptr(self) -> int: """The destination device pointer.""" return self._dptr @property - def value(self): + def value(self) -> int: """The fill value.""" return self._value @property - def element_size(self): + def element_size(self) -> 
int: """Element size in bytes (1, 2, or 4).""" return self._element_size @property - def width(self): + def width(self) -> int: """Width of the row in elements.""" return self._width @property - def height(self): + def height(self) -> int: """Number of rows.""" return self._height @property - def pitch(self): + def pitch(self) -> int: """Pitch in bytes (unused if height is 1).""" return self._pitch @@ -1681,24 +1702,24 @@ cdef class MemcpyNode(Node): h_graph, node, dst, src, params.WidthInBytes, params.dstMemoryType, params.srcMemoryType) - def __repr__(self): + def __repr__(self) -> str: cdef str dt = "H" if self._dst_type == cydriver.CU_MEMORYTYPE_HOST else "D" cdef str st = "H" if self._src_type == cydriver.CU_MEMORYTYPE_HOST else "D" return (f"") @property - def dst(self): + def dst(self) -> int: """The destination pointer.""" return self._dst @property - def src(self): + def src(self) -> int: """The source pointer.""" return self._src @property - def size(self): + def size(self) -> int: """The number of bytes copied.""" return self._size @@ -1731,7 +1752,7 @@ cdef class ChildGraphNode(Node): cdef GraphHandle h_child = create_graph_handle_ref(child_graph, h_graph) return ChildGraphNode._create_with_params(h_graph, node, h_child) - def __repr__(self): + def __repr__(self) -> str: cdef cydriver.CUgraph g = as_cu(self._h_child_graph) cdef size_t num_nodes = 0 with nogil: @@ -1740,7 +1761,7 @@ cdef class ChildGraphNode(Node): return f"" @property - def child_graph(self): + def child_graph(self) -> GraphDef: """The embedded graph definition (non-owning wrapper).""" return GraphDef._from_handle(self._h_child_graph) @@ -1772,11 +1793,11 @@ cdef class EventRecordNode(Node): HANDLE_RETURN(cydriver.cuGraphEventRecordNodeGetEvent(node, &event)) return EventRecordNode._create_with_params(h_graph, node, event) - def __repr__(self): + def __repr__(self) -> str: return f"self._event:x}>" @property - def event(self): + def event(self) -> Event: """The event being recorded 
(non-owning wrapper).""" return Event._from_handle(self._event) @@ -1808,11 +1829,11 @@ cdef class EventWaitNode(Node): HANDLE_RETURN(cydriver.cuGraphEventWaitNodeGetEvent(node, &event)) return EventWaitNode._create_with_params(h_graph, node, event) - def __repr__(self): + def __repr__(self) -> str: return f"self._event:x}>" @property - def event(self): + def event(self) -> Event: """The event being waited on (non-owning wrapper).""" return Event._from_handle(self._event) @@ -1855,7 +1876,7 @@ cdef class HostCallbackNode(Node): return HostCallbackNode._create_with_params( h_graph, node, callable_obj, params.fn, params.userData) - def __repr__(self): + def __repr__(self) -> str: if self._callable is not None: name = getattr(self._callable, '__name__', '?') return f"" @@ -1938,16 +1959,16 @@ cdef class ConditionalNode(Node): n._branches = branches return n - def __repr__(self): + def __repr__(self) -> str: return "" @property - def condition(self): + def condition(self) -> Condition | None: """The condition variable controlling execution.""" return self._condition @property - def cond_type(self): + def cond_type(self) -> str | None: """The conditional type as a string: 'if', 'while', or 'switch'. Returns None when reconstructed from the driver pre-CUDA 13.2, @@ -1963,7 +1984,7 @@ cdef class ConditionalNode(Node): return "switch" @property - def branches(self): + def branches(self) -> tuple: """The body graphs for each branch as a tuple of GraphDef. 
Returns an empty tuple when reconstructed from the driver @@ -1975,11 +1996,11 @@ cdef class ConditionalNode(Node): cdef class IfNode(ConditionalNode): """An if-conditional node (1 branch, executes when condition is non-zero).""" - def __repr__(self): + def __repr__(self) -> str: return f"self._condition._c_handle:x}>" @property - def then(self): + def then(self) -> GraphDef: """The 'then' branch graph.""" return self._branches[0] @@ -1987,16 +2008,16 @@ cdef class IfNode(ConditionalNode): cdef class IfElseNode(ConditionalNode): """An if-else conditional node (2 branches).""" - def __repr__(self): + def __repr__(self) -> str: return f"self._condition._c_handle:x}>" @property - def then(self): + def then(self) -> GraphDef: """The 'then' branch graph (executed when condition is non-zero).""" return self._branches[0] @property - def else_(self): + def else_(self) -> GraphDef: """The 'else' branch graph (executed when condition is zero).""" return self._branches[1] @@ -2004,11 +2025,11 @@ cdef class IfElseNode(ConditionalNode): cdef class WhileNode(ConditionalNode): """A while-loop conditional node (1 branch, repeats while condition is non-zero).""" - def __repr__(self): + def __repr__(self) -> str: return f"self._condition._c_handle:x}>" @property - def body(self): + def body(self) -> GraphDef: """The loop body graph.""" return self._branches[0] @@ -2016,7 +2037,7 @@ cdef class WhileNode(ConditionalNode): cdef class SwitchNode(ConditionalNode): """A switch conditional node (N branches, selected by condition value).""" - def __repr__(self): + def __repr__(self) -> str: cdef Py_ssize_t n = len(self._branches) return (f"self._condition._c_handle:x}" f" with {n} {'branch' if n == 1 else 'branches'}>") From da73536ecc1a073159d80d1d0de0ab35358bc6b8 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 5 Mar 2026 17:19:43 -0800 Subject: [PATCH 16/23] Fix conditional node body graphs and add integration tests cuGraphAddNode replaces the phGraph_out pointer with its own internal 
array rather than writing into the caller-provided buffer. Read body graph handles from params.conditional.phGraph_out[i] after the call instead of from a pre-allocated vector. Add three integration tests exercising all 14 explicit-graph node types: heat diffusion (WhileNode, ChildGraphNode, EventNodes, ...), bisection root finder (IfElseNode, IfNode), and switch dispatch (SwitchNode). Made-with: Cursor --- cuda_core/cuda/core/_graph/_graphdef.pyx | 7 +- .../tests/graph/test_explicit_integration.py | 468 ++++++++++++++++++ 2 files changed, 471 insertions(+), 4 deletions(-) create mode 100644 cuda_core/tests/graph/test_explicit_integration.py diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 473be558f2..2626a6a0e9 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -171,15 +171,12 @@ cdef ConditionalNode _make_conditional_node( f"GraphDef.create_condition()), got {type(condition).__name__}") cdef cydriver.CUgraphNodeParams params cdef cydriver.CUgraphNode new_node = NULL - cdef vector[cydriver.CUgraph] branch_graphs - branch_graphs.resize(size) c_memset(¶ms, 0, sizeof(params)) params.type = cydriver.CU_GRAPH_NODE_TYPE_CONDITIONAL params.conditional.handle = condition._c_handle params.conditional.type = cond_type params.conditional.size = size - params.conditional.phGraph_out = branch_graphs.data() cdef cydriver.CUcontext ctx = NULL cdef cydriver.CUgraph graph = as_cu(pred._h_graph) @@ -198,12 +195,14 @@ cdef ConditionalNode _make_conditional_node( HANDLE_RETURN(cydriver.cuGraphAddNode( &new_node, graph, deps, NULL, num_deps, ¶ms)) + # cuGraphAddNode sets phGraph_out to an internal array of body + # graphs (it replaces the pointer, not writing into a caller array). 
cdef list branch_list = [] cdef unsigned int i cdef cydriver.CUgraph bg cdef GraphHandle h_branch for i in range(size): - bg = branch_graphs[i] + bg = params.conditional.phGraph_out[i] h_branch = create_graph_handle_ref(bg, pred._h_graph) branch_list.append(GraphDef._from_handle(h_branch)) cdef tuple branches = tuple(branch_list) diff --git a/cuda_core/tests/graph/test_explicit_integration.py b/cuda_core/tests/graph/test_explicit_integration.py new file mode 100644 index 0000000000..91f442939e --- /dev/null +++ b/cuda_core/tests/graph/test_explicit_integration.py @@ -0,0 +1,468 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +"""Integration tests for explicit CUDA graph construction. + +Three test scenarios exercise complementary subsets of node types: + +test_heat_diffusion + 1D heat bar evolving toward steady state via finite differences. + Exercises: AllocNode, FreeNode, MemsetNode, ChildGraphNode, + EmptyNode, EventRecordNode, EventWaitNode, WhileNode, KernelNode, + MemcpyNode, HostCallbackNode. + +test_bisection_root + Find sqrt(2) by bisecting f(x) = x^2 - 2 on [0, 2], with an + optional Newton polish step. + Exercises: IfElseNode (interval halving), IfNode (refinement + guard), WhileNode, KernelNode, AllocNode, MemsetNode, MemcpyNode, + HostCallbackNode, FreeNode, EmptyNode. + +test_switch_dispatch + Apply one of four element-wise transforms selected at graph + creation time via a switch condition. + Exercises: SwitchNode, KernelNode, AllocNode, MemsetNode, + MemcpyNode, FreeNode. + +Together the three tests cover all 14 explicit-graph node types. 
+""" + +import ctypes + +import numpy as np +import pytest + +from cuda.core import Device, EventOptions, LaunchConfig, Program, ProgramOptions +from cuda.core._graph._graphdef import GraphDef +from cuda.core._utils.cuda_utils import driver, handle_return + +SIZEOF_FLOAT = 4 +SIZEOF_INT = 4 + +# =================================================================== +# Kernel sources +# =================================================================== + +_COND_PREAMBLE = r""" +extern "C" __device__ __cudart_builtin__ void CUDARTAPI +cudaGraphSetConditional(cudaGraphConditionalHandle handle, + unsigned int value); +""" + +_HEAT_KERNEL_SOURCE = ( + _COND_PREAMBLE + + r""" +extern "C" __global__ +void heat_step(float* u_next, const float* u_curr, int N, float alpha) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + if (i == 0 || i == N - 1) + u_next[i] = u_curr[i]; + else + u_next[i] = u_curr[i] + + alpha * (u_curr[i-1] - 2.0f * u_curr[i] + u_curr[i+1]); +} + +extern "C" __global__ +void countdown(cudaGraphConditionalHandle handle, int* counter) { + int c = atomicSub(counter, 1); + cudaGraphSetConditional(handle, (c > 1) ? 1u : 0u); +} +""" +) + +_BISECT_KERNEL_SOURCE = ( + _COND_PREAMBLE + + r""" +extern "C" __global__ +void bisect_eval(float* a, float* b, + cudaGraphConditionalHandle ie_cond) { + float mid = (*a + *b) * 0.5f; + float fm = mid * mid - 2.0f; + cudaGraphSetConditional(ie_cond, (fm > 0.0f) ? 1u : 0u); +} + +extern "C" __global__ +void update_hi(float* a, float* b) { + *b = (*a + *b) * 0.5f; +} + +extern "C" __global__ +void update_lo(float* a, float* b) { + *a = (*a + *b) * 0.5f; +} + +extern "C" __global__ +void countdown(cudaGraphConditionalHandle handle, int* counter) { + int c = atomicSub(counter, 1); + cudaGraphSetConditional(handle, (c > 1) ? 
1u : 0u); +} + +extern "C" __global__ +void check_refine(float* a, float* b, + cudaGraphConditionalHandle if_cond) { + float mid = (*a + *b) * 0.5f; + float fm = mid * mid - 2.0f; + float abs_fm = fm < 0.0f ? -fm : fm; + cudaGraphSetConditional(if_cond, (abs_fm > 1e-10f) ? 1u : 0u); +} + +extern "C" __global__ +void newton_refine(float* a, float* b) { + float mid = (*a + *b) * 0.5f; + float refined = mid - (mid * mid - 2.0f) / (2.0f * mid); + *a = refined; + *b = refined; +} +""" +) + +_SWITCH_KERNEL_SOURCE = r""" +extern "C" __global__ +void negate_it(int* x) { *x = -(*x); } + +extern "C" __global__ +void double_it(int* x) { *x = 2 * (*x); } + +extern "C" __global__ +void square_it(int* x) { *x = (*x) * (*x); } +""" + +# =================================================================== +# Compilation helpers +# =================================================================== + + +def _nvrtc_opts(): + arch = "".join(f"{i}" for i in Device().compute_capability) + return ProgramOptions(std="c++17", arch=f"sm_{arch}") + + +def _compile_heat_kernels(): + prog = Program(_HEAT_KERNEL_SOURCE, code_type="c++", options=_nvrtc_opts()) + try: + mod = prog.compile( + "cubin", + name_expressions=("heat_step", "countdown"), + ) + except Exception: + pytest.skip("NVRTC does not support cudaGraphConditionalHandle") + return mod.get_kernel("heat_step"), mod.get_kernel("countdown") + + +def _compile_bisect_kernels(): + names = ( + "bisect_eval", + "update_hi", + "update_lo", + "countdown", + "check_refine", + "newton_refine", + ) + prog = Program(_BISECT_KERNEL_SOURCE, code_type="c++", options=_nvrtc_opts()) + try: + mod = prog.compile("cubin", name_expressions=names) + except Exception: + pytest.skip("NVRTC does not support cudaGraphConditionalHandle") + return tuple(mod.get_kernel(n) for n in names) + + +def _compile_switch_kernels(): + names = ("negate_it", "double_it", "square_it") + prog = Program(_SWITCH_KERNEL_SOURCE, code_type="c++", options=_nvrtc_opts()) + mod = 
prog.compile("cubin", name_expressions=names) + return tuple(mod.get_kernel(n) for n in names) + + +# =================================================================== +# Test 1 — Heat diffusion (WhileNode, ChildGraphNode, EventNodes, …) +# +# alloc(curr) ─ memset(0) ──┐ +# alloc(next) ─ memset(0) ──┼─ join ─ embed(bc) ─ rec(start) ─ WHILE ──┐ +# alloc(ctr) ─ memset(50) ─┘ │ +# ┌─────────────────────────────────────────────────────────────────────┘ +# └─ wait(start) ─ rec(end) ─ memcpy(→host) ─ callback +# ─ free(curr) ─ free(next) ─ free(ctr) +# +# bc graph: memset(T_LEFT) ─ memset(T_RIGHT) +# while body: heat_step ─ memcpy(curr ← next) ─ countdown +# =================================================================== + +_HEAT_N = 32 +_HEAT_T_LEFT = np.float32(100.0) +_HEAT_T_RIGHT = np.float32(0.0) +_HEAT_ALPHA = np.float32(0.4) +_HEAT_ITERS = 50 + + +def _heat_reference(): + """Compute the reference heat solution on the host (NumPy).""" + u = np.zeros(_HEAT_N, dtype=np.float32) + u[0] = _HEAT_T_LEFT + u[-1] = _HEAT_T_RIGHT + u_next = np.empty_like(u) + for _ in range(_HEAT_ITERS): + u_next[0] = u[0] + u_next[-1] = u[-1] + u_next[1:-1] = u[1:-1] + _HEAT_ALPHA * (u[:-2] - 2.0 * u[1:-1] + u[2:]) + u, u_next = u_next, u + return u + + +def test_heat_diffusion(init_cuda): + """1D heat-bar simulation exercising most explicit-graph node types.""" + dev = Device() + + if dev.compute_capability < (9, 0): + pytest.skip("Conditional nodes require compute capability >= 9.0") + + k_heat, k_countdown = _compile_heat_kernels() + + host_ptr = handle_return(driver.cuMemAllocHost(_HEAT_N * SIZEOF_FLOAT)) + + try: + _run_heat_graph(dev, k_heat, k_countdown, host_ptr) + finally: + handle_return(driver.cuMemFreeHost(host_ptr)) + + +def _run_heat_graph(dev, k_heat, k_countdown, host_ptr): + """Build, instantiate, launch, and verify the heat-diffusion graph.""" + + # Definitions + g = GraphDef() + condition = g.create_condition(default_value=1) + event_start = 
dev.create_event(EventOptions(enable_timing=True)) + event_end = dev.create_event(EventOptions(enable_timing=True)) + results = {} + + def capture_result(): + arr = (ctypes.c_float * _HEAT_N).from_address(host_ptr) + results["data"] = np.array(arr, copy=True) + + block = min(_HEAT_N, 256) + grid = (_HEAT_N + block - 1) // block + heat_cfg = LaunchConfig(grid=grid, block=block) + tick_cfg = LaunchConfig(grid=1, block=1) + + # Phase 1 — Allocate device memory + a_curr = g.alloc(_HEAT_N * SIZEOF_FLOAT) + a_next = g.alloc(_HEAT_N * SIZEOF_FLOAT) + a_ctr = g.alloc(SIZEOF_INT) + + # Phase 2 — Initialise buffers + m_curr = a_curr.memset(a_curr.dptr, 0, _HEAT_N * SIZEOF_FLOAT) + m_next = a_next.memset(a_next.dptr, 0, _HEAT_N * SIZEOF_FLOAT) + m_ctr = a_ctr.memset(a_ctr.dptr, np.int32(_HEAT_ITERS), 1) + + # Phase 3 — Boundary conditions (child graph) + p = g.join(m_curr, m_next, m_ctr) \ + .embed(GraphDef() + .memset(a_curr.dptr, np.float32(_HEAT_T_LEFT), 1) + .memset( + a_curr.dptr + (_HEAT_N - 1) * SIZEOF_FLOAT, + np.float32(_HEAT_T_RIGHT), + 1, + ) + .graph + ) \ + .record_event(event_start) \ + + # Phase 4 — Iterate + loop = p.while_loop(condition) + loop.body.launch(heat_cfg, k_heat, a_next.dptr, a_curr.dptr, + np.int32(_HEAT_N), _HEAT_ALPHA) \ + .memcpy(a_curr.dptr, a_next.dptr, _HEAT_N * SIZEOF_FLOAT) \ + .launch(tick_cfg, k_countdown, condition.handle, a_ctr.dptr) + + # Phase 5 — After loop: timing end, readback, verify, free memory + loop.wait_event(event_start) \ + .record_event(event_end) \ + .memcpy(host_ptr, a_curr.dptr, _HEAT_N * SIZEOF_FLOAT) \ + .callback(capture_result) \ + .free(a_curr.dptr) \ + .free(a_next.dptr) \ + .free(a_ctr.dptr) + + # Phase 6 — Instantiate, launch, verify + graph = g.instantiate() + stream = dev.create_stream() + graph.launch(stream) + stream.sync() + + assert "data" in results, "Host callback did not execute" + np.testing.assert_allclose(results["data"], _heat_reference(), rtol=1e-5) + + +# 
=================================================================== +# Test 2 — Bisection root finder (IfElseNode, IfNode) +# +# Find sqrt(2) by bisecting f(x) = x^2 - 2 on [0, 2]. +# +# alloc(a) ─ memset(0.0) ──┐ +# alloc(b) ─ memset(2.0) ──┼─ join ─ WHILE(while_cond) ──────────────────┐ +# alloc(ctr) ─ memset(20) ─┘ │ +# ┌───────────────────────────────────────────────────────────────────────┘ +# └─ check_refine ─ IF(if_cond) ─ memcpy(→host) ─ callback +# └─ body: newton_refine +# ─ free(a) ─ free(b) ─ free(ctr) +# +# while body: +# bisect_eval ─ IF_ELSE(ie_cond) ─ countdown +# ├─ then: update_hi (b = mid) [f(mid) > 0] +# └─ else: update_lo (a = mid) [f(mid) ≤ 0] +# =================================================================== + +_BISECT_ITERS = 20 + + +def test_bisection_root(init_cuda): + """Bisection search for sqrt(2) with optional Newton refinement. + + Exercises IfElseNode (interval halving) and IfNode (refinement guard). + """ + dev = Device() + + if dev.compute_capability < (9, 0): + pytest.skip("Conditional nodes require compute capability >= 9.0") + + k_eval, k_hi, k_lo, k_cd, k_check, k_newton = _compile_bisect_kernels() + + host_ptr = handle_return(driver.cuMemAllocHost(SIZEOF_FLOAT)) + + try: + _run_bisection_graph(dev, k_eval, k_hi, k_lo, k_cd, k_check, k_newton, host_ptr) + finally: + handle_return(driver.cuMemFreeHost(host_ptr)) + + +def _run_bisection_graph(dev, k_eval, k_hi, k_lo, k_cd, k_check, k_newton, host_ptr): + """Build, instantiate, launch, and verify the bisection graph.""" + + # Definitions + g = GraphDef() + cfg = LaunchConfig(grid=1, block=1) + results = {} + + def capture_result(): + results["root"] = ctypes.c_float.from_address(host_ptr).value + + # Allocate and initialise: a = 0.0, b = 2.0, counter = ITERS + a = g.alloc(SIZEOF_FLOAT) + b = g.alloc(SIZEOF_FLOAT) + ctr = g.alloc(SIZEOF_INT) + + p = g.join( + a.memset(a.dptr, np.float32(0.0), 1), + b.memset(b.dptr, np.float32(2.0), 1), + ctr.memset(ctr.dptr, 
np.int32(_BISECT_ITERS), 1), + ) + + # While loop: bisection iterations + while_cond = g.create_condition(default_value=1) + ie_cond = g.create_condition(default_value=0) + loop = p.while_loop(while_cond) + + ie = loop.body.launch( + cfg, + k_eval, + a.dptr, + b.dptr, + ie_cond.handle, + ).if_else(ie_cond) + ie.then.launch(cfg, k_hi, a.dptr, b.dptr) + ie.else_.launch(cfg, k_lo, a.dptr, b.dptr) + ie.launch(cfg, k_cd, while_cond.handle, ctr.dptr) + + # Post-loop: Newton refinement (IfNode), readback, free + if_cond = g.create_condition(default_value=0) + if_node = loop.launch( + cfg, + k_check, + a.dptr, + b.dptr, + if_cond.handle, + ).if_cond(if_cond) + if_node.then.launch(cfg, k_newton, a.dptr, b.dptr) + + (if_node.memcpy(host_ptr, a.dptr, SIZEOF_FLOAT).callback(capture_result).free(a.dptr).free(b.dptr).free(ctr.dptr)) + + # Instantiate, launch, verify + graph = g.instantiate() + stream = dev.create_stream() + graph.launch(stream) + stream.sync() + + assert "root" in results, "Host callback did not execute" + np.testing.assert_allclose( + results["root"], + np.sqrt(np.float32(2.0)), + rtol=1e-6, + ) + + +# =================================================================== +# Test 3 — Switch dispatch (SwitchNode) +# +# A mode value (0-3) selects one of four transforms on a scalar: +# +# alloc(x) ─ memset(42) ─ SWITCH(mode, 4) +# ├─ 0: negate(x) +# ├─ 1: double(x) +# ├─ 2: square(x) +# └─ 3: (identity) +# ─ memcpy(→host) ─ free(x) +# =================================================================== + +_SWITCH_VALUE = 42 + + +@pytest.mark.parametrize( + "mode, expected", + [ + (0, -_SWITCH_VALUE), + (1, 2 * _SWITCH_VALUE), + (2, _SWITCH_VALUE * _SWITCH_VALUE), + (3, _SWITCH_VALUE), + ], +) +def test_switch_dispatch(init_cuda, mode, expected): + """Runtime kernel selection via SwitchNode.""" + dev = Device() + + if dev.compute_capability < (9, 0): + pytest.skip("Conditional nodes require compute capability >= 9.0") + + k_negate, k_double, k_square = 
_compile_switch_kernels() + + host_ptr = handle_return(driver.cuMemAllocHost(SIZEOF_INT)) + + try: + _run_switch_graph(dev, mode, k_negate, k_double, k_square, host_ptr) + + result = ctypes.c_int.from_address(host_ptr).value + assert result == expected + finally: + handle_return(driver.cuMemFreeHost(host_ptr)) + + +def _run_switch_graph(dev, mode, k_negate, k_double, k_square, host_ptr): + """Build, instantiate, launch, and verify the switch-dispatch graph.""" + g = GraphDef() + cfg = LaunchConfig(grid=1, block=1) + + x = g.alloc(SIZEOF_INT) + sw_cond = g.create_condition(default_value=mode) + sw = x.memset(x.dptr, np.int32(_SWITCH_VALUE), 1).switch(sw_cond, 4) + + sw.branches[0].launch(cfg, k_negate, x.dptr) + sw.branches[1].launch(cfg, k_double, x.dptr) + sw.branches[2].launch(cfg, k_square, x.dptr) + # branch 3: identity (no kernel — value unchanged) + + sw.memcpy(host_ptr, x.dptr, SIZEOF_INT).free(x.dptr) + + graph = g.instantiate() + stream = dev.create_stream() + graph.launch(stream) + stream.sync() From b463ccbc94fcc3bcbb9a69ac5a2f3b4bcf1eb47d Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 6 Mar 2026 07:45:11 -0800 Subject: [PATCH 17/23] Add lifetime and error/edge-case tests for explicit graph construction test_explicit_lifetime.py verifies the RAII parent-capture mechanism in create_graph_handle_ref prevents dangling references when parent GraphDef objects are deleted while child/body graph handles remain. test_explicit_errors.py covers input validation (type checks for conditional methods, invalid memset values, null free, cross-graph condition misuse), edge cases (join variants, multiple instantiation, unmatched alloc), and boundary condition execution (while-loop zero iterations, if-cond false, switch out-of-range). 
Made-with: Cursor --- cuda_core/tests/graph/test_explicit_errors.py | 242 ++++++++++++++++++ .../tests/graph/test_explicit_lifetime.py | 161 ++++++++++++ 2 files changed, 403 insertions(+) create mode 100644 cuda_core/tests/graph/test_explicit_errors.py create mode 100644 cuda_core/tests/graph/test_explicit_lifetime.py diff --git a/cuda_core/tests/graph/test_explicit_errors.py b/cuda_core/tests/graph/test_explicit_errors.py new file mode 100644 index 0000000000..e65dbe31d7 --- /dev/null +++ b/cuda_core/tests/graph/test_explicit_errors.py @@ -0,0 +1,242 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +"""Tests for error handling, input validation, and edge cases in explicit graphs. + +These tests verify that the explicit graph API properly validates inputs, +raises appropriate exceptions for misuse, and handles boundary conditions +correctly. +""" + +import ctypes + +import pytest +from helpers.graph_kernels import compile_common_kernels +from helpers.misc import try_create_condition + +from cuda.core import Device, LaunchConfig +from cuda.core._graph._graphdef import ( + Condition, + EmptyNode, + GraphDef, +) +from cuda.core._utils.cuda_utils import CUDAError + +SIZEOF_INT = ctypes.sizeof(ctypes.c_int) + + +# ============================================================================= +# Type validation — wrong types for conditional node methods +# ============================================================================= + + +@pytest.mark.parametrize( + "method, args", + [ + pytest.param("if_cond", (42,), id="if_cond_int"), + pytest.param("if_else", ("not a condition",), id="if_else_str"), + pytest.param("while_loop", (None,), id="while_loop_none"), + pytest.param("switch", ([1, 2, 3], 4), id="switch_list"), + ], +) +def test_conditional_rejects_non_condition(init_cuda, method, args): + """Conditional node methods reject non-Condition 
arguments.""" + g = GraphDef() + with pytest.raises(TypeError, match="Condition"): + getattr(g, method)(*args) + + +def test_embed_rejects_non_graphdef(init_cuda): + """embed() rejects non-GraphDef arguments.""" + g = GraphDef() + with pytest.raises((TypeError, AttributeError)): + g.embed("not a graph") + + +# ============================================================================= +# Value validation — invalid parameter values +# ============================================================================= + + +def test_free_null_pointer(init_cuda): + """free(0) raises a CUDA error.""" + g = GraphDef() + with pytest.raises(CUDAError): + g.free(0) + + +def test_memset_invalid_value_size(init_cuda): + """memset with 3-byte value (not 1, 2, or 4) raises ValueError.""" + g = GraphDef() + alloc = g.alloc(1024) + with pytest.raises(ValueError): + alloc.memset(alloc.dptr, b"\x01\x02\x03", 100) + + +def test_switch_zero_branches(init_cuda): + """switch with count=0 raises an error.""" + g = GraphDef() + condition = try_create_condition(g) + with pytest.raises(CUDAError): + g.switch(condition, 0) + + +# ============================================================================= +# Cross-graph misuse +# ============================================================================= + + +def test_condition_from_different_graph(init_cuda): + """Using a condition created for graph A in graph B raises an error.""" + g1 = GraphDef() + g2 = GraphDef() + condition = try_create_condition(g1) + with pytest.raises(CUDAError): + g2.if_cond(condition) + + +# ============================================================================= +# Edge cases — valid but unusual usage patterns +# ============================================================================= + + +def test_join_no_extra_nodes(init_cuda): + """join() from entry with no extra nodes creates a single empty node.""" + g = GraphDef() + joined = g.join() + assert isinstance(joined, EmptyNode) + assert len(g.nodes()) 
== 1 + + +def test_join_single_predecessor(init_cuda): + """node.join() with no extra args creates a single-dep empty node.""" + g = GraphDef() + a = g.alloc(1024) + joined = a.join() + assert isinstance(joined, EmptyNode) + assert set(joined.pred) == {a} + + +def test_multiple_instantiation(init_cuda): + """Same GraphDef can be instantiated multiple times independently.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + cfg = LaunchConfig(grid=1, block=1) + + g = GraphDef() + g.launch(cfg, kernel) + g1 = g.instantiate() + g2 = g.instantiate() + assert g1 is not g2 + + +def test_unmatched_alloc_succeeds(init_cuda): + """Alloc without corresponding free is valid (graph-scoped lifetime).""" + g = GraphDef() + g.alloc(1024) + graph = g.instantiate() + stream = Device().create_stream() + graph.launch(stream) + stream.sync() + + +def test_create_condition_no_default_value(init_cuda): + """create_condition with no default_value succeeds.""" + g = GraphDef() + try: + condition = g.create_condition() + except CUDAError: + pytest.skip("Conditional nodes not supported (requires CC >= 9.0)") + assert isinstance(condition, Condition) + + +# ============================================================================= +# Boundary condition execution — conditional nodes with extreme values +# ============================================================================= + + +def _skip_unless_cc_90(): + if Device(0).compute_capability < (9, 0): + pytest.skip("Conditional node execution requires CC >= 9.0") + + +def test_while_loop_zero_iterations(init_cuda): + """While loop with default_value=0 never executes its body.""" + _skip_unless_cc_90() + + mod = compile_common_kernels() + add_one = mod.get_kernel("add_one") + cfg = LaunchConfig(grid=1, block=1) + + g = GraphDef() + condition = g.create_condition(default_value=0) + alloc = g.alloc(SIZEOF_INT) + ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) + loop = ms.while_loop(condition) + 
loop.body.launch(cfg, add_one, alloc.dptr) + + graph = g.instantiate() + stream = Device().create_stream() + graph.launch(stream) + stream.sync() + + result = (ctypes.c_int * 1)() + from cuda.bindings import driver as drv + + drv.cuMemcpyDtoH(result, alloc.dptr, SIZEOF_INT) + assert result[0] == 0, "Body should not have executed" + + +def test_if_cond_false_skips_body(init_cuda): + """If conditional with default_value=0 does not execute its body.""" + _skip_unless_cc_90() + + mod = compile_common_kernels() + add_one = mod.get_kernel("add_one") + cfg = LaunchConfig(grid=1, block=1) + + g = GraphDef() + condition = g.create_condition(default_value=0) + alloc = g.alloc(SIZEOF_INT) + ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) + if_node = ms.if_cond(condition) + if_node.then.launch(cfg, add_one, alloc.dptr) + + graph = g.instantiate() + stream = Device().create_stream() + graph.launch(stream) + stream.sync() + + result = (ctypes.c_int * 1)() + from cuda.bindings import driver as drv + + drv.cuMemcpyDtoH(result, alloc.dptr, SIZEOF_INT) + assert result[0] == 0, "Body should not have executed" + + +def test_switch_oob_skips_all_branches(init_cuda): + """Switch with out-of-range condition value does not execute any branch.""" + _skip_unless_cc_90() + + mod = compile_common_kernels() + add_one = mod.get_kernel("add_one") + cfg = LaunchConfig(grid=1, block=1) + + g = GraphDef() + condition = g.create_condition(default_value=99) + alloc = g.alloc(SIZEOF_INT) + ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) + sw = ms.switch(condition, 3) + for branch in sw.branches: + branch.launch(cfg, add_one, alloc.dptr) + + graph = g.instantiate() + stream = Device().create_stream() + graph.launch(stream) + stream.sync() + + result = (ctypes.c_int * 1)() + from cuda.bindings import driver as drv + + drv.cuMemcpyDtoH(result, alloc.dptr, SIZEOF_INT) + assert result[0] == 0, "No branch should have executed" diff --git a/cuda_core/tests/graph/test_explicit_lifetime.py 
b/cuda_core/tests/graph/test_explicit_lifetime.py new file mode 100644 index 0000000000..f713e63ad1 --- /dev/null +++ b/cuda_core/tests/graph/test_explicit_lifetime.py @@ -0,0 +1,161 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +"""Tests for resource lifetime management in explicit CUDA graphs. + +These tests verify that the RAII mechanism in GraphHandle correctly +prevents dangling references when parent Python objects are deleted +while child/body graph references remain alive. +""" + +import gc + +import pytest +from helpers.graph_kernels import compile_common_kernels +from helpers.misc import try_create_condition + +from cuda.core import LaunchConfig +from cuda.core._graph._graphdef import ( + ChildGraphNode, + ConditionalNode, + GraphDef, +) + + +# ============================================================================= +# Conditional body graph lifetime +# ============================================================================= + + +def _make_if(g, cond): + node = g.if_cond(cond) + return [node.then] + + +def _make_if_else(g, cond): + node = g.if_else(cond) + return [node.then, node.else_] + + +def _make_while(g, cond): + node = g.while_loop(cond) + return [node.body] + + +def _make_switch(g, cond): + node = g.switch(cond, 4) + return list(node.branches) + + +_COND_BUILDERS = [ + pytest.param(_make_if, 1, id="if"), + pytest.param(_make_if_else, 2, id="if_else"), + pytest.param(_make_while, 1, id="while"), + pytest.param(_make_switch, 4, id="switch"), +] + + +@pytest.mark.parametrize("builder, expected_count", _COND_BUILDERS) +def test_branches_survive_parent_deletion(init_cuda, builder, expected_count): + """All branch graphs remain valid after parent GraphDef is deleted.""" + g = GraphDef() + condition = try_create_condition(g) + branches = builder(g, condition) + assert len(branches) == expected_count + + del g, condition + 
gc.collect() + + for branch in branches: + assert branch.nodes() == () + + +@pytest.mark.parametrize("builder, expected_count", _COND_BUILDERS) +def test_branches_usable_after_parent_deletion(init_cuda, builder, expected_count): + """Nodes can be added to branch graphs after parent GraphDef is deleted.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + + g = GraphDef() + condition = try_create_condition(g) + branches = builder(g, condition) + + del g, condition + gc.collect() + + for branch in branches: + branch.launch(config, kernel) + assert len(branch.nodes()) == 1 + + +def test_reconstructed_body_survives_parent_deletion(init_cuda): + """Body graph obtained via nodes() reconstruction survives parent deletion.""" + g = GraphDef() + condition = try_create_condition(g) + g.while_loop(condition) + + all_nodes = g.nodes() + cond_nodes = [n for n in all_nodes if isinstance(n, ConditionalNode)] + assert len(cond_nodes) == 1 + + branches = cond_nodes[0].branches + if not branches: + pytest.skip("Body reconstruction requires CUDA 13.2+") + body = branches[0] + + del g, condition, all_nodes, cond_nodes, branches + gc.collect() + + assert body.nodes() == () + + +# ============================================================================= +# Child graph (embed) lifetime +# ============================================================================= + + +def test_child_graph_survives_parent_deletion(init_cuda): + """Embedded child graph remains valid after parent GraphDef is deleted.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + + child_def = GraphDef() + child_def.launch(config, kernel) + child_def.launch(config, kernel) + + g = GraphDef() + node = g.embed(child_def) + child_ref = node.child_graph + + del g, node, child_def + gc.collect() + + assert len(child_ref.nodes()) == 2 + + +def 
test_nested_child_graph_lifetime(init_cuda): + """Grandchild graph keeps entire ancestor chain alive.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + + inner = GraphDef() + inner.launch(config, kernel) + + middle = GraphDef() + middle.embed(inner) + + outer = GraphDef() + outer_node = outer.embed(middle) + + middle_ref = outer_node.child_graph + middle_nodes = middle_ref.nodes() + child_node = [n for n in middle_nodes if isinstance(n, ChildGraphNode)][0] + grandchild = child_node.child_graph + + del outer, outer_node, middle, inner, middle_ref, middle_nodes, child_node + gc.collect() + + assert len(grandchild.nodes()) == 1 From f1cceb207c5f4d0d41652d8f146fdec5227bed89 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 6 Mar 2026 10:59:56 -0800 Subject: [PATCH 18/23] Add RAII NodeHandle, event/kernel lifetime via user objects, consolidate Event factories - Introduce NodeHandle (shared_ptr with NodeBox) to tie node lifetime to owning graph, replacing raw CUgraphNode in Node objects - Attach EventHandle/KernelHandle copies as CUDA user objects to graphs, preventing dangling references when Python wrappers are GC'd - Consolidate Event factories to _init and _from_handle(EventHandle) - Inline as_cu() calls throughout _graphdef.pyx - Add lifetime tests validating event, kernel, and child-graph survival Made-with: Cursor --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 23 + cuda_core/cuda/core/_cpp/resource_handles.hpp | 21 + cuda_core/cuda/core/_event.pxd | 2 +- cuda_core/cuda/core/_event.pyx | 27 +- cuda_core/cuda/core/_graph/_graphdef.pxd | 57 ++- cuda_core/cuda/core/_graph/_graphdef.pyx | 401 ++++++++++-------- cuda_core/cuda/core/_resource_handles.pxd | 7 + cuda_core/cuda/core/_resource_handles.pyx | 6 + .../tests/graph/test_explicit_lifetime.py | 156 ++++++- 9 files changed, 464 insertions(+), 236 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp 
b/cuda_core/cuda/core/_cpp/resource_handles.cpp index eade0d3e54..74ae71fd92 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -853,6 +853,29 @@ GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent) return GraphHandle(box, &box->resource); } +namespace { +struct NodeBox { + CUgraphNode resource; + GraphHandle h_graph; +}; +} // namespace + +static const NodeBox* get_box(const NodeHandle& h) { + const CUgraphNode* p = h.get(); + return reinterpret_cast( + reinterpret_cast(p) - offsetof(NodeBox, resource) + ); +} + +NodeHandle create_node_handle(CUgraphNode node, const GraphHandle& h_graph) { + auto box = std::make_shared(NodeBox{node, h_graph}); + return NodeHandle(box, &box->resource); +} + +GraphHandle node_get_graph(const NodeHandle& h) noexcept { + return h ? get_box(h)->h_graph : GraphHandle{}; +} + // ============================================================================ // Graphics Resource Handles // ============================================================================ diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 7a6ba8ed90..15f42f0dda 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -147,6 +147,7 @@ using MemoryPoolHandle = std::shared_ptr; using LibraryHandle = std::shared_ptr; using KernelHandle = std::shared_ptr; using GraphHandle = std::shared_ptr; +using NodeHandle = std::shared_ptr; using GraphicsResourceHandle = std::shared_ptr; using NvrtcProgramHandle = std::shared_ptr; using NvvmProgramHandle = std::shared_ptr; @@ -373,6 +374,18 @@ GraphHandle create_graph_handle(CUgraph graph); // but h_parent will be prevented from destruction while this handle exists. 
GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent); +// ============================================================================ +// Graph node handle functions +// ============================================================================ + +// Create a node handle. Nodes are owned by their parent graph (not +// independently destroyable). The GraphHandle dependency ensures the +// graph outlives any node reference. +NodeHandle create_node_handle(CUgraphNode node, const GraphHandle& h_graph); + +// Extract the owning graph handle from a node handle. +GraphHandle node_get_graph(const NodeHandle& h) noexcept; + // ============================================================================ // Graphics resource handle functions // ============================================================================ @@ -473,6 +486,10 @@ inline CUgraph as_cu(const GraphHandle& h) noexcept { return h ? *h : nullptr; } +inline CUgraphNode as_cu(const NodeHandle& h) noexcept { + return h ? *h : nullptr; +} + inline CUgraphicsResource as_cu(const GraphicsResourceHandle& h) noexcept { return h ? 
*h : nullptr; } @@ -527,6 +544,10 @@ inline std::intptr_t as_intptr(const GraphHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } +inline std::intptr_t as_intptr(const NodeHandle& h) noexcept { + return reinterpret_cast(as_cu(h)); +} + inline std::intptr_t as_intptr(const GraphicsResourceHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } diff --git a/cuda_core/cuda/core/_event.pxd b/cuda_core/cuda/core/_event.pxd index 7f60b8cbc3..b0cbb13c95 100644 --- a/cuda_core/cuda/core/_event.pxd +++ b/cuda_core/cuda/core/_event.pxd @@ -22,6 +22,6 @@ cdef class Event: cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free) @staticmethod - cdef Event _from_handle(cydriver.CUevent raw_event) + cdef Event _from_handle(EventHandle h_event) cpdef close(self) diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 25480a76e1..12d43b1e2b 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -12,7 +12,6 @@ from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, create_event_handle, - create_event_handle_ref, create_event_handle_ipc, as_intptr, as_cu, @@ -127,9 +126,8 @@ cdef class Event: return self @staticmethod - cdef Event _from_handle(cydriver.CUevent raw_event): - """Create a non-owning Event from a raw CUevent (internal use).""" - cdef EventHandle h_event = create_event_handle_ref(raw_event) + cdef Event _from_handle(EventHandle h_event): + """Create an Event wrapping an existing EventHandle.""" cdef Event self = Event.__new__(Event) self._h_event = h_event self._h_context = ContextHandle() @@ -140,27 +138,6 @@ cdef class Event: self._device_id = -1 return self - @staticmethod - def from_handle(handle) -> Event: - """Create a non-owning :obj:`Event` from a foreign event handle. - - Parameters - ---------- - handle : int - Event handle representing the address of a foreign - event object (CUevent). 
- - Notes - ----- - The returned Event does not own the underlying CUevent and will - not destroy it when garbage collected. This is intended for - wrapping events managed by other subsystems (e.g., CUDA graphs). - """ - if not isinstance(handle, int): - raise TypeError(f"handle must be an integer, got {type(handle).__name__}") - cdef cydriver.CUevent raw = handle - return Event._from_handle(raw) - cpdef close(self): """Destroy the event. diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index bf82c27a7c..bff91172f4 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -5,7 +5,7 @@ from libc.stddef cimport size_t from cuda.bindings cimport cydriver -from cuda.core._resource_handles cimport GraphHandle +from cuda.core._resource_handles cimport EventHandle, GraphHandle, KernelHandle, NodeHandle cdef class Condition @@ -45,8 +45,7 @@ cdef class GraphDef: cdef class Node: cdef: - GraphHandle _h_graph - cydriver.CUgraphNode _node # NULL for entry node + NodeHandle _h_node tuple _pred_cache tuple _succ_cache object __weakref__ @@ -57,7 +56,7 @@ cdef class Node: cdef class EmptyNode(Node): @staticmethod - cdef EmptyNode _create_impl(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef EmptyNode _create_impl(NodeHandle h_node) cdef class KernelNode(Node): @@ -65,15 +64,15 @@ cdef class KernelNode(Node): tuple _grid tuple _block unsigned int _shmem_size - cydriver.CUkernel _kern + KernelHandle _h_kernel @staticmethod - cdef KernelNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef KernelNode _create_with_params(NodeHandle h_node, tuple grid, tuple block, unsigned int shmem_size, - cydriver.CUkernel kern) + KernelHandle h_kernel) @staticmethod - cdef KernelNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef KernelNode _create_from_driver(NodeHandle h_node) cdef class AllocNode(Node): @@ -85,12 +84,12 @@ cdef class 
AllocNode(Node): tuple _peer_access @staticmethod - cdef AllocNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef AllocNode _create_with_params(NodeHandle h_node, cydriver.CUdeviceptr dptr, size_t bytesize, int device_id, str memory_type, tuple peer_access) @staticmethod - cdef AllocNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef AllocNode _create_from_driver(NodeHandle h_node) cdef class FreeNode(Node): @@ -98,11 +97,11 @@ cdef class FreeNode(Node): cydriver.CUdeviceptr _dptr @staticmethod - cdef FreeNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef FreeNode _create_with_params(NodeHandle h_node, cydriver.CUdeviceptr dptr) @staticmethod - cdef FreeNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef FreeNode _create_from_driver(NodeHandle h_node) cdef class MemsetNode(Node): @@ -115,13 +114,13 @@ cdef class MemsetNode(Node): size_t _pitch @staticmethod - cdef MemsetNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef MemsetNode _create_with_params(NodeHandle h_node, cydriver.CUdeviceptr dptr, unsigned int value, unsigned int element_size, size_t width, size_t height, size_t pitch) @staticmethod - cdef MemsetNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef MemsetNode _create_from_driver(NodeHandle h_node) cdef class MemcpyNode(Node): @@ -133,13 +132,13 @@ cdef class MemcpyNode(Node): cydriver.CUmemorytype _src_type @staticmethod - cdef MemcpyNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef MemcpyNode _create_with_params(NodeHandle h_node, cydriver.CUdeviceptr dst, cydriver.CUdeviceptr src, size_t size, cydriver.CUmemorytype dst_type, cydriver.CUmemorytype src_type) @staticmethod - cdef MemcpyNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef MemcpyNode _create_from_driver(NodeHandle h_node) cdef class ChildGraphNode(Node): @@ -147,35 
+146,35 @@ cdef class ChildGraphNode(Node): GraphHandle _h_child_graph @staticmethod - cdef ChildGraphNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef ChildGraphNode _create_with_params(NodeHandle h_node, GraphHandle h_child_graph) @staticmethod - cdef ChildGraphNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef ChildGraphNode _create_from_driver(NodeHandle h_node) cdef class EventRecordNode(Node): cdef: - cydriver.CUevent _event + EventHandle _h_event @staticmethod - cdef EventRecordNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, - cydriver.CUevent event) + cdef EventRecordNode _create_with_params(NodeHandle h_node, + EventHandle h_event) @staticmethod - cdef EventRecordNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef EventRecordNode _create_from_driver(NodeHandle h_node) cdef class EventWaitNode(Node): cdef: - cydriver.CUevent _event + EventHandle _h_event @staticmethod - cdef EventWaitNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, - cydriver.CUevent event) + cdef EventWaitNode _create_with_params(NodeHandle h_node, + EventHandle h_event) @staticmethod - cdef EventWaitNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef EventWaitNode _create_from_driver(NodeHandle h_node) cdef class HostCallbackNode(Node): @@ -185,12 +184,12 @@ cdef class HostCallbackNode(Node): void* _user_data @staticmethod - cdef HostCallbackNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef HostCallbackNode _create_with_params(NodeHandle h_node, object callable_obj, cydriver.CUhostFn fn, void* user_data) @staticmethod - cdef HostCallbackNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef HostCallbackNode _create_from_driver(NodeHandle h_node) cdef class ConditionalNode(Node): @@ -200,7 +199,7 @@ cdef class ConditionalNode(Node): tuple _branches # tuple of GraphDef 
(non-owning wrappers) @staticmethod - cdef ConditionalNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node) + cdef ConditionalNode _create_from_driver(NodeHandle h_node) cdef class IfNode(ConditionalNode): diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 2626a6a0e9..54f25dbce5 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -46,12 +46,20 @@ from cuda.core._kernel_arg_handler cimport ParamHolder from cuda.core._launch_config cimport LaunchConfig from cuda.core._module cimport Kernel from cuda.core._resource_handles cimport ( + EventHandle, GraphHandle, + KernelHandle, + LibraryHandle, + NodeHandle, as_cu, as_intptr, as_py, + create_event_handle_ref, create_graph_handle, create_graph_handle_ref, + create_kernel_handle_ref, + create_node_handle, + node_get_graph, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value @@ -107,6 +115,16 @@ cdef void _py_host_destructor(void* data) noexcept with gil: _py_decref(data) +cdef void _destroy_event_handle_copy(void* ptr) noexcept nogil: + cdef EventHandle* p = ptr + del p + + +cdef void _destroy_kernel_handle_copy(void* ptr) noexcept nogil: + cdef KernelHandle* p = ptr + del p + + cdef void _attach_user_object( cydriver.CUgraph graph, void* ptr, cydriver.CUhostFn destroy) except *: @@ -179,12 +197,13 @@ cdef ConditionalNode _make_conditional_node( params.conditional.size = size cdef cydriver.CUcontext ctx = NULL - cdef cydriver.CUgraph graph = as_cu(pred._h_graph) + cdef GraphHandle h_graph = node_get_graph(pred._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(pred._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 - if pred._node != NULL: - deps = &pred._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 with nogil: @@ -193,7 +212,7 @@ cdef ConditionalNode _make_conditional_node( with nogil: HANDLE_RETURN(cydriver.cuGraphAddNode( - 
&new_node, graph, deps, NULL, num_deps, ¶ms)) + &new_node, as_cu(h_graph), deps, NULL, num_deps, ¶ms)) # cuGraphAddNode sets phGraph_out to an internal array of body # graphs (it replaces the pointer, not writing into a caller array). @@ -203,13 +222,12 @@ cdef ConditionalNode _make_conditional_node( cdef GraphHandle h_branch for i in range(size): bg = params.conditional.phGraph_out[i] - h_branch = create_graph_handle_ref(bg, pred._h_graph) + h_branch = create_graph_handle_ref(bg, h_graph) branch_list.append(GraphDef._from_handle(h_branch)) cdef tuple branches = tuple(branch_list) cdef ConditionalNode n = node_cls.__new__(node_cls) - n._h_graph = pred._h_graph - n._node = new_node + n._h_node = create_node_handle(new_node, h_graph) n._condition = condition n._cond_type = cond_type n._branches = branches @@ -291,8 +309,7 @@ cdef class GraphDef: def _entry(self) -> Node: """Return the internal entry-point Node (no dependencies).""" cdef Node n = Node.__new__(Node) - n._h_graph = self._h_graph - n._node = NULL + n._h_node = create_node_handle(NULL, self._h_graph) return n def alloc(self, size_t size, options: GraphAllocOptions | None = None) -> AllocNode: @@ -399,12 +416,11 @@ cdef class GraphDef: default_val = default_value flags = cydriver.CU_GRAPH_COND_ASSIGN_DEFAULT - cdef cydriver.CUgraph graph = as_cu(self._h_graph) cdef cydriver.CUcontext ctx = NULL with nogil: HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) HANDLE_RETURN(cydriver.cuGraphConditionalHandleCreate( - &c_handle, graph, ctx, default_val, flags)) + &c_handle, as_cu(self._h_graph), ctx, default_val, flags)) cdef Condition cond = Condition.__new__(Condition) cond._c_handle = c_handle @@ -471,11 +487,10 @@ cdef class GraphDef: raise TypeError("options must be a GraphDebugPrintOptions instance") flags = options._to_flags() - cdef cydriver.CUgraph graph = as_cu(self._h_graph) cdef bytes path_bytes = path.encode('utf-8') cdef const char* c_path = path_bytes with nogil: - 
HANDLE_RETURN(cydriver.cuGraphDebugDotPrint(graph, c_path, flags)) + HANDLE_RETURN(cydriver.cuGraphDebugDotPrint(as_cu(self._h_graph), c_path, flags)) def nodes(self) -> tuple: """Return all nodes in the graph. @@ -485,11 +500,10 @@ cdef class GraphDef: tuple of Node All nodes in the graph. """ - cdef cydriver.CUgraph graph = as_cu(self._h_graph) cdef size_t num_nodes = 0 with nogil: - HANDLE_RETURN(cydriver.cuGraphGetNodes(graph, NULL, &num_nodes)) + HANDLE_RETURN(cydriver.cuGraphGetNodes(as_cu(self._h_graph), NULL, &num_nodes)) if num_nodes == 0: return () @@ -497,7 +511,7 @@ cdef class GraphDef: cdef vector[cydriver.CUgraphNode] nodes_vec nodes_vec.resize(num_nodes) with nogil: - HANDLE_RETURN(cydriver.cuGraphGetNodes(graph, nodes_vec.data(), &num_nodes)) + HANDLE_RETURN(cydriver.cuGraphGetNodes(as_cu(self._h_graph), nodes_vec.data(), &num_nodes)) return tuple(Node._create(self._h_graph, nodes_vec[i]) for i in range(num_nodes)) @@ -510,11 +524,10 @@ cdef class GraphDef: Each element is a (from_node, to_node) pair representing a dependency edge in the graph. 
""" - cdef cydriver.CUgraph graph = as_cu(self._h_graph) cdef size_t num_edges = 0 with nogil: - HANDLE_RETURN(cydriver.cuGraphGetEdges(graph, NULL, NULL, NULL, &num_edges)) + HANDLE_RETURN(cydriver.cuGraphGetEdges(as_cu(self._h_graph), NULL, NULL, NULL, &num_edges)) if num_edges == 0: return () @@ -525,7 +538,7 @@ cdef class GraphDef: to_nodes.resize(num_edges) with nogil: HANDLE_RETURN(cydriver.cuGraphGetEdges( - graph, from_nodes.data(), to_nodes.data(), NULL, &num_edges)) + as_cu(self._h_graph), from_nodes.data(), to_nodes.data(), NULL, &num_edges)) return tuple( (Node._create(self._h_graph, from_nodes[i]), @@ -552,56 +565,55 @@ cdef class Node: """Factory: dispatch to the right subclass based on node type.""" if node == NULL: n = Node.__new__(Node) - (n)._h_graph = h_graph - (n)._node = NULL + (n)._h_node = create_node_handle(node, h_graph) return n + cdef NodeHandle h_node = create_node_handle(node, h_graph) cdef cydriver.CUgraphNodeType node_type with nogil: HANDLE_RETURN(cydriver.cuGraphNodeGetType(node, &node_type)) if node_type == cydriver.CU_GRAPH_NODE_TYPE_EMPTY: - return EmptyNode._create_impl(h_graph, node) + return EmptyNode._create_impl(h_node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_KERNEL: - return KernelNode._create_from_driver(h_graph, node) + return KernelNode._create_from_driver(h_node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEM_ALLOC: - return AllocNode._create_from_driver(h_graph, node) + return AllocNode._create_from_driver(h_node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEM_FREE: - return FreeNode._create_from_driver(h_graph, node) + return FreeNode._create_from_driver(h_node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEMSET: - return MemsetNode._create_from_driver(h_graph, node) + return MemsetNode._create_from_driver(h_node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_MEMCPY: - return MemcpyNode._create_from_driver(h_graph, node) + return MemcpyNode._create_from_driver(h_node) elif node_type == 
cydriver.CU_GRAPH_NODE_TYPE_GRAPH: - return ChildGraphNode._create_from_driver(h_graph, node) + return ChildGraphNode._create_from_driver(h_node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_EVENT_RECORD: - return EventRecordNode._create_from_driver(h_graph, node) + return EventRecordNode._create_from_driver(h_node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_WAIT_EVENT: - return EventWaitNode._create_from_driver(h_graph, node) + return EventWaitNode._create_from_driver(h_node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_HOST: - return HostCallbackNode._create_from_driver(h_graph, node) + return HostCallbackNode._create_from_driver(h_node) elif node_type == cydriver.CU_GRAPH_NODE_TYPE_CONDITIONAL: - return ConditionalNode._create_from_driver(h_graph, node) + return ConditionalNode._create_from_driver(h_node) else: n = Node.__new__(Node) - (n)._h_graph = h_graph - (n)._node = node + (n)._h_node = h_node return n def __repr__(self) -> str: - if self._node == NULL: + cdef cydriver.CUgraphNode node = as_cu(self._h_node) + if node == NULL: return "" - return f"self._node:x}>" + return f"node:x}>" def __eq__(self, other) -> bool: if not isinstance(other, Node): return NotImplemented cdef Node o = other - return (as_intptr(self._h_graph) == as_intptr(o._h_graph) and - self._node == o._node) + return as_intptr(self._h_node) == as_intptr(o._h_node) def __hash__(self) -> int: - return hash((as_intptr(self._h_graph), self._node)) + return hash(as_intptr(self._h_node)) @property def type(self): @@ -612,17 +624,18 @@ cdef class Node: CUgraphNodeType or None The node type enum value, or None for the entry node. 
""" - if self._node == NULL: + cdef cydriver.CUgraphNode node = as_cu(self._h_node) + if node == NULL: return None cdef cydriver.CUgraphNodeType node_type with nogil: - HANDLE_RETURN(cydriver.cuGraphNodeGetType(self._node, &node_type)) + HANDLE_RETURN(cydriver.cuGraphNodeGetType(node, &node_type)) return driver.CUgraphNodeType(node_type) @property def graph(self) -> GraphDef: """Return the GraphDef this node belongs to.""" - return GraphDef._from_handle(self._h_graph) + return GraphDef._from_handle(node_get_graph(self._h_node)) @property def handle(self) -> int | None: @@ -630,9 +643,10 @@ cdef class Node: Returns None for the entry node. """ - if self._node == NULL: + cdef cydriver.CUgraphNode node = as_cu(self._h_node) + if node == NULL: return None - return self._node + return node @property def pred(self) -> tuple: @@ -649,12 +663,12 @@ cdef class Node: if self._pred_cache is not None: return self._pred_cache - if self._node == NULL: + cdef cydriver.CUgraphNode node = as_cu(self._h_node) + if node == NULL: self._pred_cache = () return self._pred_cache cdef size_t num_deps = 0 - cdef cydriver.CUgraphNode node = self._node with nogil: HANDLE_RETURN(cydriver.cuGraphNodeGetDependencies(node, NULL, NULL, &num_deps)) @@ -668,7 +682,8 @@ cdef class Node: with nogil: HANDLE_RETURN(cydriver.cuGraphNodeGetDependencies(node, deps.data(), NULL, &num_deps)) - self._pred_cache = tuple(Node._create(self._h_graph, deps[i]) for i in range(num_deps)) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + self._pred_cache = tuple(Node._create(h_graph, deps[i]) for i in range(num_deps)) return self._pred_cache @property @@ -686,12 +701,12 @@ cdef class Node: if self._succ_cache is not None: return self._succ_cache - if self._node == NULL: + cdef cydriver.CUgraphNode node = as_cu(self._h_node) + if node == NULL: self._succ_cache = () return self._succ_cache cdef size_t num_deps = 0 - cdef cydriver.CUgraphNode node = self._node with nogil: 
HANDLE_RETURN(cydriver.cuGraphNodeGetDependentNodes(node, NULL, NULL, &num_deps)) @@ -705,7 +720,8 @@ cdef class Node: with nogil: HANDLE_RETURN(cydriver.cuGraphNodeGetDependentNodes(node, deps.data(), NULL, &num_deps)) - self._succ_cache = tuple(Node._create(self._h_graph, deps[i]) for i in range(num_deps)) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + self._succ_cache = tuple(Node._create(h_graph, deps[i]) for i in range(num_deps)) return self._succ_cache def launch(self, config: LaunchConfig, kernel: Kernel, *args) -> KernelNode: @@ -731,12 +747,13 @@ cdef class Node: cdef cydriver.CUDA_KERNEL_NODE_PARAMS node_params cdef cydriver.CUgraphNode new_node = NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 - if self._node != NULL: - deps = &self._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 node_params.kern = as_cu(ker._h_kernel) @@ -753,13 +770,17 @@ cdef class Node: node_params.ctx = NULL with nogil: - HANDLE_RETURN(cydriver.cuGraphAddKernelNode(&new_node, graph, deps, num_deps, &node_params)) + HANDLE_RETURN(cydriver.cuGraphAddKernelNode( + &new_node, as_cu(h_graph), deps, num_deps, &node_params)) + + _attach_user_object(as_cu(h_graph), new KernelHandle(ker._h_kernel), + _destroy_kernel_handle_copy) self._succ_cache = None return KernelNode._create_with_params( - self._h_graph, new_node, + create_node_handle(new_node, h_graph), conf.grid, conf.block, conf.shmem_size, - node_params.kern) + ker._h_kernel) def join(self, *nodes: Node) -> EmptyNode: """Create an empty node that depends on this node and all given nodes. 
@@ -778,28 +799,30 @@ cdef class Node: """ cdef vector[cydriver.CUgraphNode] deps cdef cydriver.CUgraphNode new_node = NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) cdef Node other cdef cydriver.CUgraphNode* deps_ptr = NULL cdef size_t num_deps = 0 + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) - if self._node != NULL: - deps.push_back(self._node) + if pred_node != NULL: + deps.push_back(pred_node) for other in nodes: - if (other)._node != NULL: - deps.push_back((other)._node) + if as_cu((other)._h_node) != NULL: + deps.push_back(as_cu((other)._h_node)) num_deps = deps.size() if num_deps > 0: deps_ptr = deps.data() with nogil: - HANDLE_RETURN(cydriver.cuGraphAddEmptyNode(&new_node, graph, deps_ptr, num_deps)) + HANDLE_RETURN(cydriver.cuGraphAddEmptyNode( + &new_node, as_cu(h_graph), deps_ptr, num_deps)) self._succ_cache = None for other in nodes: (other)._succ_cache = None - return EmptyNode._create_impl(self._h_graph, new_node) + return EmptyNode._create_impl(create_node_handle(new_node, h_graph)) def alloc(self, size_t size, options: GraphAllocOptions | None = None) -> AllocNode: """Add a memory allocation node depending on this node. 
@@ -829,12 +852,13 @@ cdef class Node: cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS alloc_params cdef cydriver.CUgraphNode new_node = NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 - if self._node != NULL: - deps = &self._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 cdef vector[cydriver.CUmemAccessDesc] access_descs @@ -882,11 +906,12 @@ cdef class Node: alloc_params.accessDescCount = access_descs.size() with nogil: - HANDLE_RETURN(cydriver.cuGraphAddMemAllocNode(&new_node, graph, deps, num_deps, &alloc_params)) + HANDLE_RETURN(cydriver.cuGraphAddMemAllocNode( + &new_node, as_cu(h_graph), deps, num_deps, &alloc_params)) self._succ_cache = None return AllocNode._create_with_params( - self._h_graph, new_node, alloc_params.dptr, size, + create_node_handle(new_node, h_graph), alloc_params.dptr, size, device_id, memory_type, tuple(peer_ids)) def free(self, dptr: int) -> FreeNode: @@ -903,20 +928,22 @@ cdef class Node: A new FreeNode representing the free operation. 
""" cdef cydriver.CUgraphNode new_node = NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 cdef cydriver.CUdeviceptr c_dptr = dptr - if self._node != NULL: - deps = &self._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 with nogil: - HANDLE_RETURN(cydriver.cuGraphAddMemFreeNode(&new_node, graph, deps, num_deps, c_dptr)) + HANDLE_RETURN(cydriver.cuGraphAddMemFreeNode( + &new_node, as_cu(h_graph), deps, num_deps, c_dptr)) self._succ_cache = None - return FreeNode._create_with_params(self._h_graph, new_node, c_dptr) + return FreeNode._create_with_params(create_node_handle(new_node, h_graph), c_dptr) def memset(self, dst: int, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: """Add a memset node depending on this node. @@ -946,12 +973,13 @@ cdef class Node: cdef cydriver.CUDA_MEMSET_NODE_PARAMS memset_params cdef cydriver.CUgraphNode new_node = NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 - if self._node != NULL: - deps = &self._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 cdef cydriver.CUdeviceptr c_dst = dst @@ -969,12 +997,12 @@ cdef class Node: with nogil: HANDLE_RETURN(cydriver.cuGraphAddMemsetNode( - &new_node, graph, deps, num_deps, + &new_node, as_cu(h_graph), deps, num_deps, &memset_params, ctx)) self._succ_cache = None return MemsetNode._create_with_params( - self._h_graph, new_node, c_dst, + create_node_handle(new_node, h_graph), c_dst, val, elem_size, width, height, pitch) def memcpy(self, dst: int, src: int, size_t size) -> MemcpyNode: @@ -1039,23 +1067,24 @@ cdef class Node: params.Depth = 1 cdef cydriver.CUgraphNode new_node = 
NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 - if self._node != NULL: - deps = &self._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 cdef cydriver.CUcontext ctx = NULL with nogil: HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) HANDLE_RETURN(cydriver.cuGraphAddMemcpyNode( - &new_node, graph, deps, num_deps, &params, ctx)) + &new_node, as_cu(h_graph), deps, num_deps, &params, ctx)) self._succ_cache = None return MemcpyNode._create_with_params( - self._h_graph, new_node, c_dst, c_src, size, + create_node_handle(new_node, h_graph), c_dst, c_src, size, c_dst_type, c_src_type) def embed(self, child: GraphDef) -> ChildGraphNode: @@ -1076,29 +1105,30 @@ cdef class Node: A new ChildGraphNode representing the embedded sub-graph. """ cdef GraphDef child_def = child - cdef cydriver.CUgraph child_graph = as_cu(child_def._h_graph) cdef cydriver.CUgraphNode new_node = NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 - if self._node != NULL: - deps = &self._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 with nogil: HANDLE_RETURN(cydriver.cuGraphAddChildGraphNode( - &new_node, graph, deps, num_deps, child_graph)) + &new_node, as_cu(h_graph), deps, num_deps, as_cu(child_def._h_graph))) cdef cydriver.CUgraph embedded_graph = NULL with nogil: HANDLE_RETURN(cydriver.cuGraphChildGraphNodeGetGraph( new_node, &embedded_graph)) - cdef GraphHandle h_embedded = create_graph_handle_ref(embedded_graph, self._h_graph) + cdef GraphHandle h_embedded = create_graph_handle_ref(embedded_graph, h_graph) self._succ_cache = None - return ChildGraphNode._create_with_params(self._h_graph, new_node, 
h_embedded) + return ChildGraphNode._create_with_params( + create_node_handle(new_node, h_graph), h_embedded) def record_event(self, event: Event) -> EventRecordNode: """Add an event record node depending on this node. @@ -1114,22 +1144,26 @@ cdef class Node: A new EventRecordNode representing the event record operation. """ cdef Event ev = event - cdef cydriver.CUevent c_event = as_cu(ev._h_event) cdef cydriver.CUgraphNode new_node = NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 - if self._node != NULL: - deps = &self._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 with nogil: HANDLE_RETURN(cydriver.cuGraphAddEventRecordNode( - &new_node, graph, deps, num_deps, c_event)) + &new_node, as_cu(h_graph), deps, num_deps, as_cu(ev._h_event))) + + _attach_user_object(as_cu(h_graph), new EventHandle(ev._h_event), + _destroy_event_handle_copy) self._succ_cache = None - return EventRecordNode._create_with_params(self._h_graph, new_node, c_event) + return EventRecordNode._create_with_params( + create_node_handle(new_node, h_graph), ev._h_event) def wait_event(self, event: Event) -> EventWaitNode: """Add an event wait node depending on this node. @@ -1145,22 +1179,26 @@ cdef class Node: A new EventWaitNode representing the event wait operation. 
""" cdef Event ev = event - cdef cydriver.CUevent c_event = as_cu(ev._h_event) cdef cydriver.CUgraphNode new_node = NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 - if self._node != NULL: - deps = &self._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 with nogil: HANDLE_RETURN(cydriver.cuGraphAddEventWaitNode( - &new_node, graph, deps, num_deps, c_event)) + &new_node, as_cu(h_graph), deps, num_deps, as_cu(ev._h_event))) + + _attach_user_object(as_cu(h_graph), new EventHandle(ev._h_event), + _destroy_event_handle_copy) self._succ_cache = None - return EventWaitNode._create_with_params(self._h_graph, new_node, c_event) + return EventWaitNode._create_with_params( + create_node_handle(new_node, h_graph), ev._h_event) def callback(self, fn, *, user_data=None) -> HostCallbackNode: """Add a host callback node depending on this node. 
@@ -1199,15 +1237,16 @@ cdef class Node: cdef cydriver.CUDA_HOST_NODE_PARAMS node_params cdef cydriver.CUgraphNode new_node = NULL - cdef cydriver.CUgraph graph = as_cu(self._h_graph) + cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 cdef void* c_user_data = NULL cdef object callable_obj = None cdef void* fn_pyobj = NULL - if self._node != NULL: - deps = &self._node + if pred_node != NULL: + deps = &pred_node num_deps = 1 if isinstance(fn, ct._CFuncPtr): @@ -1225,7 +1264,7 @@ cdef class Node: "failed to allocate user_data buffer") c_memcpy(c_user_data, buf, len(buf)) _attach_user_object( - graph, c_user_data, + as_cu(h_graph), c_user_data, free) node_params.userData = c_user_data @@ -1240,16 +1279,16 @@ cdef class Node: node_params.fn = _py_host_trampoline node_params.userData = fn_pyobj _attach_user_object( - graph, fn_pyobj, + as_cu(h_graph), fn_pyobj, _py_host_destructor) with nogil: HANDLE_RETURN(cydriver.cuGraphAddHostNode( - &new_node, graph, deps, num_deps, &node_params)) + &new_node, as_cu(h_graph), deps, num_deps, &node_params)) self._succ_cache = None return HostCallbackNode._create_with_params( - self._h_graph, new_node, callable_obj, + create_node_handle(new_node, h_graph), callable_obj, node_params.fn, node_params.userData) def if_cond(self, condition: Condition) -> IfNode: @@ -1345,10 +1384,9 @@ cdef class EmptyNode(Node): """A synchronization / join node with no operation.""" @staticmethod - cdef EmptyNode _create_impl(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef EmptyNode _create_impl(NodeHandle h_node): cdef EmptyNode n = EmptyNode.__new__(EmptyNode) - n._h_graph = h_graph - n._node = node + n._h_node = h_node return n def __repr__(self) -> str: @@ -1374,31 +1412,33 @@ cdef class KernelNode(Node): """ @staticmethod - cdef KernelNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef 
KernelNode _create_with_params(NodeHandle h_node, tuple grid, tuple block, unsigned int shmem_size, - cydriver.CUkernel kern): + KernelHandle h_kernel): """Create from known params (called by launch() builder).""" cdef KernelNode n = KernelNode.__new__(KernelNode) - n._h_graph = h_graph - n._node = node + n._h_node = h_node n._grid = grid n._block = block n._shmem_size = shmem_size - n._kern = kern + n._h_kernel = h_kernel return n @staticmethod - cdef KernelNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef KernelNode _create_from_driver(NodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_KERNEL_NODE_PARAMS params with nogil: HANDLE_RETURN(cydriver.cuGraphKernelNodeGetParams(node, &params)) + cdef LibraryHandle empty_lib + cdef KernelHandle h_kernel = create_kernel_handle_ref(params.kern, empty_lib) return KernelNode._create_with_params( - h_graph, node, + h_node, (params.gridDimX, params.gridDimY, params.gridDimZ), (params.blockDimX, params.blockDimY, params.blockDimZ), params.sharedMemBytes, - params.kern) + h_kernel) def __repr__(self) -> str: return (f"") @@ -1421,7 +1461,7 @@ cdef class KernelNode(Node): @property def kernel(self) -> Kernel: """The Kernel object for this launch node.""" - return Kernel.from_handle(self._kern) + return Kernel._from_obj(self._h_kernel) @property def config(self) -> LaunchConfig: @@ -1454,13 +1494,12 @@ cdef class AllocNode(Node): """ @staticmethod - cdef AllocNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef AllocNode _create_with_params(NodeHandle h_node, cydriver.CUdeviceptr dptr, size_t bytesize, int device_id, str memory_type, tuple peer_access): """Create from known params (called by alloc() builder).""" cdef AllocNode n = AllocNode.__new__(AllocNode) - n._h_graph = h_graph - n._node = node + n._h_node = h_node n._dptr = dptr n._bytesize = bytesize 
n._device_id = device_id @@ -1469,8 +1508,9 @@ cdef class AllocNode(Node): return n @staticmethod - cdef AllocNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef AllocNode _create_from_driver(NodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS params with nogil: HANDLE_RETURN(cydriver.cuGraphMemAllocNodeGetParams(node, &params)) @@ -1492,7 +1532,7 @@ cdef class AllocNode(Node): peer_ids.append(params.accessDescs[i].location.id) return AllocNode._create_with_params( - h_graph, node, params.dptr, params.bytesize, + h_node, params.dptr, params.bytesize, params.poolProps.location.id, memory_type, tuple(peer_ids)) def __repr__(self) -> str: @@ -1543,22 +1583,22 @@ cdef class FreeNode(Node): """ @staticmethod - cdef FreeNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef FreeNode _create_with_params(NodeHandle h_node, cydriver.CUdeviceptr dptr): """Create from known params (called by free() builder).""" cdef FreeNode n = FreeNode.__new__(FreeNode) - n._h_graph = h_graph - n._node = node + n._h_node = h_node n._dptr = dptr return n @staticmethod - cdef FreeNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef FreeNode _create_from_driver(NodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUdeviceptr dptr with nogil: HANDLE_RETURN(cydriver.cuGraphMemFreeNodeGetParams(node, &dptr)) - return FreeNode._create_with_params(h_graph, node, dptr) + return FreeNode._create_with_params(h_node, dptr) def __repr__(self) -> str: return f"" @@ -1589,14 +1629,13 @@ cdef class MemsetNode(Node): """ @staticmethod - cdef MemsetNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef MemsetNode _create_with_params(NodeHandle h_node, 
cydriver.CUdeviceptr dptr, unsigned int value, unsigned int element_size, size_t width, size_t height, size_t pitch): """Create from known params (called by memset() builder).""" cdef MemsetNode n = MemsetNode.__new__(MemsetNode) - n._h_graph = h_graph - n._node = node + n._h_node = h_node n._dptr = dptr n._value = value n._element_size = element_size @@ -1606,13 +1645,14 @@ cdef class MemsetNode(Node): return n @staticmethod - cdef MemsetNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef MemsetNode _create_from_driver(NodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_MEMSET_NODE_PARAMS params with nogil: HANDLE_RETURN(cydriver.cuGraphMemsetNodeGetParams(node, &params)) return MemsetNode._create_with_params( - h_graph, node, params.dst, params.value, + h_node, params.dst, params.value, params.elementSize, params.width, params.height, params.pitch) def __repr__(self) -> str: @@ -1664,14 +1704,13 @@ cdef class MemcpyNode(Node): """ @staticmethod - cdef MemcpyNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef MemcpyNode _create_with_params(NodeHandle h_node, cydriver.CUdeviceptr dst, cydriver.CUdeviceptr src, size_t size, cydriver.CUmemorytype dst_type, cydriver.CUmemorytype src_type): """Create from known params (called by memcpy() builder).""" cdef MemcpyNode n = MemcpyNode.__new__(MemcpyNode) - n._h_graph = h_graph - n._node = node + n._h_node = h_node n._dst = dst n._src = src n._size = size @@ -1680,8 +1719,9 @@ cdef class MemcpyNode(Node): return n @staticmethod - cdef MemcpyNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef MemcpyNode _create_from_driver(NodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_MEMCPY3D params with nogil: 
HANDLE_RETURN(cydriver.cuGraphMemcpyNodeGetParams(node, &params)) @@ -1698,7 +1738,7 @@ cdef class MemcpyNode: src = params.srcDevice return MemcpyNode._create_with_params( - h_graph, node, dst, src, params.WidthInBytes, + h_node, dst, src, params.WidthInBytes, params.dstMemoryType, params.srcMemoryType) def __repr__(self) -> str: @@ -1733,23 +1773,24 @@ cdef class ChildGraphNode(Node): """ @staticmethod - cdef ChildGraphNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef ChildGraphNode _create_with_params(NodeHandle h_node, GraphHandle h_child_graph): """Create from known params (called by embed() builder).""" cdef ChildGraphNode n = ChildGraphNode.__new__(ChildGraphNode) - n._h_graph = h_graph - n._node = node + n._h_node = h_node n._h_child_graph = h_child_graph return n @staticmethod - cdef ChildGraphNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef ChildGraphNode _create_from_driver(NodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUgraph child_graph = NULL with nogil: HANDLE_RETURN(cydriver.cuGraphChildGraphNodeGetGraph(node, &child_graph)) + cdef GraphHandle h_graph = node_get_graph(h_node) cdef GraphHandle h_child = create_graph_handle_ref(child_graph, h_graph) - return ChildGraphNode._create_with_params(h_graph, node, h_child) + return ChildGraphNode._create_with_params(h_node, h_child) def __repr__(self) -> str: cdef cydriver.CUgraph g = as_cu(self._h_child_graph) @@ -1771,34 +1812,35 @@ cdef class EventRecordNode(Node): Properties ---------- event : Event - The event being recorded (non-owning wrapper). + The event being recorded. 
""" @staticmethod - cdef EventRecordNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, - cydriver.CUevent event): + cdef EventRecordNode _create_with_params(NodeHandle h_node, + EventHandle h_event): """Create from known params (called by record_event() builder).""" cdef EventRecordNode n = EventRecordNode.__new__(EventRecordNode) - n._h_graph = h_graph - n._node = node - n._event = event + n._h_node = h_node + n._h_event = h_event return n @staticmethod - cdef EventRecordNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef EventRecordNode _create_from_driver(NodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUevent event with nogil: HANDLE_RETURN(cydriver.cuGraphEventRecordNodeGetEvent(node, &event)) - return EventRecordNode._create_with_params(h_graph, node, event) + cdef EventHandle h_event = create_event_handle_ref(event) + return EventRecordNode._create_with_params(h_node, h_event) def __repr__(self) -> str: - return f"self._event:x}>" + return f"" @property def event(self) -> Event: - """The event being recorded (non-owning wrapper).""" - return Event._from_handle(self._event) + """The event being recorded.""" + return Event._from_handle(self._h_event) cdef class EventWaitNode(Node): @@ -1807,34 +1849,35 @@ cdef class EventWaitNode(Node): Properties ---------- event : Event - The event being waited on (non-owning wrapper). + The event being waited on. 
""" @staticmethod - cdef EventWaitNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, - cydriver.CUevent event): + cdef EventWaitNode _create_with_params(NodeHandle h_node, + EventHandle h_event): """Create from known params (called by wait_event() builder).""" cdef EventWaitNode n = EventWaitNode.__new__(EventWaitNode) - n._h_graph = h_graph - n._node = node - n._event = event + n._h_node = h_node + n._h_event = h_event return n @staticmethod - cdef EventWaitNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef EventWaitNode _create_from_driver(NodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUevent event with nogil: HANDLE_RETURN(cydriver.cuGraphEventWaitNodeGetEvent(node, &event)) - return EventWaitNode._create_with_params(h_graph, node, event) + cdef EventHandle h_event = create_event_handle_ref(event) + return EventWaitNode._create_with_params(h_node, h_event) def __repr__(self) -> str: - return f"self._event:x}>" + return f"" @property def event(self) -> Event: - """The event being waited on (non-owning wrapper).""" - return Event._from_handle(self._event) + """The event being waited on.""" + return Event._from_handle(self._h_event) cdef class HostCallbackNode(Node): @@ -1847,33 +1890,31 @@ cdef class HostCallbackNode(Node): """ @staticmethod - cdef HostCallbackNode _create_with_params(GraphHandle h_graph, cydriver.CUgraphNode node, + cdef HostCallbackNode _create_with_params(NodeHandle h_node, object callable_obj, cydriver.CUhostFn fn, void* user_data): """Create from known params (called by callback() builder).""" cdef HostCallbackNode n = HostCallbackNode.__new__(HostCallbackNode) - n._h_graph = h_graph - n._node = node + n._h_node = h_node n._callable = callable_obj n._fn = fn n._user_data = user_data return n @staticmethod - cdef HostCallbackNode _create_from_driver(GraphHandle h_graph, 
cydriver.CUgraphNode node): + cdef HostCallbackNode _create_from_driver(NodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" + cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_HOST_NODE_PARAMS params with nogil: HANDLE_RETURN(cydriver.cuGraphHostNodeGetParams(node, &params)) cdef object callable_obj = None if params.fn == _py_host_trampoline: - # cast Py_INCREFs — HostCallbackNode holds its own - # reference, independent of the user object's reference. callable_obj = params.userData return HostCallbackNode._create_with_params( - h_graph, node, callable_obj, params.fn, params.userData) + h_node, callable_obj, params.fn, params.userData) def __repr__(self) -> str: if self._callable is not None: @@ -1907,17 +1948,17 @@ cdef class ConditionalNode(Node): """ @staticmethod - cdef ConditionalNode _create_from_driver(GraphHandle h_graph, cydriver.CUgraphNode node): + cdef ConditionalNode _create_from_driver(NodeHandle h_node): cdef ConditionalNode n if not _check_node_get_params(): n = ConditionalNode.__new__(ConditionalNode) - n._h_graph = h_graph - n._node = node + n._h_node = h_node n._condition = None n._cond_type = cydriver.CU_GRAPH_COND_TYPE_IF n._branches = () return n + cdef cydriver.CUgraphNode node = as_cu(h_node) params = handle_return(driver.cuGraphNodeGetParams( node)) cond_params = params.conditional @@ -1928,6 +1969,7 @@ cdef class ConditionalNode(Node): condition._c_handle = ( int(cond_params.handle)) + cdef GraphHandle h_graph = node_get_graph(h_node) cdef list branch_list = [] cdef unsigned int i cdef GraphHandle h_branch @@ -1951,8 +1993,7 @@ cdef class ConditionalNode(Node): cls = SwitchNode n = cls.__new__(cls) - n._h_graph = h_graph - n._node = node + n._h_node = h_node n._condition = condition n._cond_type = cond_type_int n._branches = branches diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index cb622cd70a..f8497b61b1 100644 --- 
a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -27,6 +27,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUlibrary] LibraryHandle ctypedef shared_ptr[const cydriver.CUkernel] KernelHandle ctypedef shared_ptr[const cydriver.CUgraph] GraphHandle + ctypedef shared_ptr[const cydriver.CUgraphNode] NodeHandle ctypedef shared_ptr[const cydriver.CUgraphicsResource] GraphicsResourceHandle ctypedef shared_ptr[const cynvrtc.nvrtcProgram] NvrtcProgramHandle @@ -50,6 +51,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUlibrary as_cu(LibraryHandle h) noexcept nogil cydriver.CUkernel as_cu(KernelHandle h) noexcept nogil cydriver.CUgraph as_cu(GraphHandle h) noexcept nogil + cydriver.CUgraphNode as_cu(NodeHandle h) noexcept nogil cydriver.CUgraphicsResource as_cu(GraphicsResourceHandle h) noexcept nogil cynvrtc.nvrtcProgram as_cu(NvrtcProgramHandle h) noexcept nogil cynvvm.nvvmProgram as_cu(NvvmProgramHandle h) noexcept nogil @@ -65,6 +67,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": intptr_t as_intptr(LibraryHandle h) noexcept nogil intptr_t as_intptr(KernelHandle h) noexcept nogil intptr_t as_intptr(GraphHandle h) noexcept nogil + intptr_t as_intptr(NodeHandle h) noexcept nogil intptr_t as_intptr(GraphicsResourceHandle h) noexcept nogil intptr_t as_intptr(NvrtcProgramHandle h) noexcept nogil intptr_t as_intptr(NvvmProgramHandle h) noexcept nogil @@ -162,6 +165,10 @@ cdef KernelHandle create_kernel_handle_ref( cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil +# Graph node handles +cdef NodeHandle create_node_handle(cydriver.CUgraphNode node, const GraphHandle& h_graph) except+ nogil +cdef GraphHandle node_get_graph(const NodeHandle& h) noexcept nogil + # Graphics resource handles 
cdef GraphicsResourceHandle create_graphics_resource_handle( cydriver.CUgraphicsResource resource) except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 7bffcd542b..5e068a7f1e 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -137,6 +137,12 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": GraphHandle create_graph_handle_ref "cuda_core::create_graph_handle_ref" ( cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil + # Graph node handles + NodeHandle create_node_handle "cuda_core::create_node_handle" ( + cydriver.CUgraphNode node, const GraphHandle& h_graph) except+ nogil + GraphHandle node_get_graph "cuda_core::node_get_graph" ( + const NodeHandle& h) noexcept nogil + # Graphics resource handles GraphicsResourceHandle create_graphics_resource_handle "cuda_core::create_graphics_resource_handle" ( cydriver.CUgraphicsResource resource) except+ nogil diff --git a/cuda_core/tests/graph/test_explicit_lifetime.py b/cuda_core/tests/graph/test_explicit_lifetime.py index f713e63ad1..1dfbf34484 100644 --- a/cuda_core/tests/graph/test_explicit_lifetime.py +++ b/cuda_core/tests/graph/test_explicit_lifetime.py @@ -14,7 +14,7 @@ from helpers.graph_kernels import compile_common_kernels from helpers.misc import try_create_condition -from cuda.core import LaunchConfig +from cuda.core import Device, EventOptions, LaunchConfig from cuda.core._graph._graphdef import ( ChildGraphNode, ConditionalNode, @@ -159,3 +159,157 @@ def test_nested_child_graph_lifetime(init_cuda): gc.collect() assert len(grandchild.nodes()) == 1 + + +# ============================================================================= +# Event lifetime — event nodes should keep the Event alive +# ============================================================================= + + +def test_event_record_node_keeps_event_alive(init_cuda): + """EventRecordNode should 
keep the Event alive after original is deleted.""" + dev = Device() + g = GraphDef() + alloc = g.alloc(1024) + + event = dev.create_event(EventOptions(enable_timing=False)) + node = alloc.record_event(event) + + del event + gc.collect() + + retrieved = node.event + assert retrieved.is_done is True + + +def test_event_wait_node_keeps_event_alive(init_cuda): + """EventWaitNode should keep the Event alive after original is deleted.""" + dev = Device() + g = GraphDef() + alloc = g.alloc(1024) + + event = dev.create_event(EventOptions(enable_timing=False)) + node = alloc.wait_event(event) + + del event + gc.collect() + + retrieved = node.event + assert retrieved.is_done is True + + +def test_event_survives_graph_instantiation_and_execution(init_cuda): + """Graph with event nodes executes correctly after original Event is deleted.""" + dev = Device() + g = GraphDef() + + event = dev.create_event(EventOptions(enable_timing=False)) + rec = g.record_event(event) + rec.wait_event(event) + + del event + gc.collect() + + graph = g.instantiate() + stream = dev.create_stream() + graph.launch(stream) + stream.sync() + + +def test_event_survives_graph_clone_and_execution(init_cuda): + """Cloned graph with event nodes executes after original Event is deleted. + + This is the critical test for CUDA User Objects: a graph clone does + not inherit Python-level references, so only user objects (which + propagate through cuGraphClone) can keep the event alive. 
+ """ + from cuda.core._utils.cuda_utils import driver, handle_return + + dev = Device() + g = GraphDef() + + event = dev.create_event(EventOptions(enable_timing=False)) + rec = g.record_event(event) + rec.wait_event(event) + + cloned_cu_graph = handle_return( + driver.cuGraphClone(driver.CUgraph(g.handle))) + + del event, g, rec + gc.collect() + + graph_exec = handle_return(driver.cuGraphInstantiate(cloned_cu_graph, 0)) + stream = dev.create_stream() + handle_return( + driver.cuGraphLaunch( + graph_exec, driver.CUstream(int(stream.handle)))) + stream.sync() + + +# ============================================================================= +# Kernel lifetime — kernel nodes should keep the Kernel/Module alive +# ============================================================================= + + +def test_kernel_node_keeps_kernel_alive(init_cuda): + """KernelNode should keep the Kernel alive after original is deleted.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + + g = GraphDef() + node = g.launch(config, kernel) + + del kernel, mod + gc.collect() + + retrieved = node.kernel + assert retrieved.attributes.max_threads_per_block() > 0 + + +def test_kernel_survives_graph_instantiation_and_execution(init_cuda): + """Graph with kernel node executes correctly after Kernel/Module is deleted.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + + g = GraphDef() + g.launch(config, kernel) + + del kernel, mod + gc.collect() + + graph = g.instantiate() + stream = Device().create_stream() + graph.launch(stream) + stream.sync() + + +def test_kernel_survives_graph_clone_and_execution(init_cuda): + """Cloned graph with kernel node executes after Kernel/Module is deleted. + + Validates that CUDA User Objects keep the kernel's library alive + through graph cloning (where Python-level references are lost). 
+ """ + from cuda.core._utils.cuda_utils import driver, handle_return + + dev = Device() + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + + g = GraphDef() + g.launch(config, kernel) + + cloned_cu_graph = handle_return( + driver.cuGraphClone(driver.CUgraph(g.handle))) + + del kernel, mod, g + gc.collect() + + graph_exec = handle_return(driver.cuGraphInstantiate(cloned_cu_graph, 0)) + stream = dev.create_stream() + handle_return( + driver.cuGraphLaunch( + graph_exec, driver.CUstream(int(stream.handle)))) + stream.sync() From 133719b917167aa3e799353f7c29ea3edfd471b7 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 6 Mar 2026 11:23:54 -0800 Subject: [PATCH 19/23] Move Event metadata fields to EventBox, access via get_box() pointer arithmetic Event no longer caches timing_disabled, busy_waited, ipc_enabled, device_id, or h_context as Python-side fields. All metadata lives on EventBox (C++ anonymous namespace) and is accessed through overloaded get_box() + getter functions (get_event_timing_disabled, etc.). The Event class now holds only _h_event and _ipc_descriptor. 
Made-with: Cursor --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 47 ++++++++++++-- cuda_core/cuda/core/_cpp/resource_handles.hpp | 16 ++++- cuda_core/cuda/core/_event.pxd | 5 -- cuda_core/cuda/core/_event.pyx | 65 +++++++++---------- cuda_core/cuda/core/_resource_handles.pxd | 14 +++- cuda_core/cuda/core/_resource_handles.pyx | 18 ++++- 6 files changed, 115 insertions(+), 50 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 74ae71fd92..69dea7e4df 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -321,10 +321,44 @@ StreamHandle get_per_thread_stream() { namespace { struct EventBox { CUevent resource; + bool timing_disabled; + bool busy_waited; + bool ipc_enabled; + int device_id; + ContextHandle h_context; }; } // namespace -EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) { +static const EventBox* get_box(const EventHandle& h) { + const CUevent* p = h.get(); + return reinterpret_cast( + reinterpret_cast(p) - offsetof(EventBox, resource) + ); +} + +bool get_event_timing_disabled(const EventHandle& h) noexcept { + return h ? get_box(h)->timing_disabled : true; +} + +bool get_event_busy_waited(const EventHandle& h) noexcept { + return h ? get_box(h)->busy_waited : false; +} + +bool get_event_ipc_enabled(const EventHandle& h) noexcept { + return h ? get_box(h)->ipc_enabled : false; +} + +int get_event_device_id(const EventHandle& h) noexcept { + return h ? get_box(h)->device_id : -1; +} + +ContextHandle get_event_context(const EventHandle& h) noexcept { + return h ? 
get_box(h)->h_context : ContextHandle{}; +} + +EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags, + bool timing_disabled, bool busy_waited, + bool ipc_enabled, int device_id) { GILReleaseGuard gil; CUevent event; if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) { @@ -332,7 +366,7 @@ EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) } auto box = std::shared_ptr( - new EventBox{event}, + new EventBox{event, timing_disabled, busy_waited, ipc_enabled, device_id, h_ctx}, [h_ctx](const EventBox* b) { GILReleaseGuard gil; p_cuEventDestroy(b->resource); @@ -343,15 +377,16 @@ EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) } EventHandle create_event_handle_noctx(unsigned int flags) { - return create_event_handle(ContextHandle{}, flags); + return create_event_handle(ContextHandle{}, flags, true, false, false, -1); } EventHandle create_event_handle_ref(CUevent event) { - auto box = std::make_shared(EventBox{event}); + auto box = std::make_shared(EventBox{event, true, false, false, -1, {}}); return EventHandle(box, &box->resource); } -EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle, + bool busy_waited) { GILReleaseGuard gil; CUevent event; if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) { @@ -359,7 +394,7 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { } auto box = std::shared_ptr( - new EventBox{event}, + new EventBox{event, true, busy_waited, true, -1, {}}, [](const EventBox* b) { GILReleaseGuard gil; p_cuEventDestroy(b->resource); diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 15f42f0dda..9015ef9065 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -205,9 +205,12 @@ StreamHandle 
get_per_thread_stream(); // Create an owning event handle by calling cuEventCreate. // The event structurally depends on the provided context handle. +// Metadata fields are stored in the EventBox for later retrieval. // When the last reference is released, cuEventDestroy is called automatically. // Returns empty handle on error (caller must check). -EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags); +EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags, + bool timing_disabled, bool busy_waited, + bool ipc_enabled, int device_id); // Create an owning event handle without context dependency. // Use for temporary events that are created and destroyed in the same scope. @@ -219,13 +222,22 @@ EventHandle create_event_handle_noctx(unsigned int flags); // The originating process owns the event and its context. // When the last reference is released, cuEventDestroy is called automatically. // Returns empty handle on error (caller must check). -EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle); +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle, + bool busy_waited); // Create a non-owning event handle (references existing event). // Use for events that are managed by the CUDA graph or another owner. // The event will NOT be destroyed when the handle is released. +// Metadata defaults to unknown (timing_disabled=true, device_id=-1). 
EventHandle create_event_handle_ref(CUevent event); +// Event metadata accessors (read from EventBox via pointer arithmetic) +bool get_event_timing_disabled(const EventHandle& h) noexcept; +bool get_event_busy_waited(const EventHandle& h) noexcept; +bool get_event_ipc_enabled(const EventHandle& h) noexcept; +int get_event_device_id(const EventHandle& h) noexcept; +ContextHandle get_event_context(const EventHandle& h) noexcept; + // ============================================================================ // Memory pool handle functions // ============================================================================ diff --git a/cuda_core/cuda/core/_event.pxd b/cuda_core/cuda/core/_event.pxd index b0cbb13c95..5710b13699 100644 --- a/cuda_core/cuda/core/_event.pxd +++ b/cuda_core/cuda/core/_event.pxd @@ -10,12 +10,7 @@ cdef class Event: cdef: EventHandle _h_event - ContextHandle _h_context - bint _timing_disabled - bint _busy_waited - bint _ipc_enabled object _ipc_descriptor - int _device_id object __weakref__ @staticmethod diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 12d43b1e2b..4a0491d865 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -13,6 +13,11 @@ from cuda.core._resource_handles cimport ( EventHandle, create_event_handle, create_event_handle_ipc, + get_event_timing_disabled, + get_event_busy_waited, + get_event_ipc_enabled, + get_event_device_id, + get_event_context, as_intptr, as_cu, as_py, @@ -95,47 +100,44 @@ cdef class Event: cdef Event self = cls.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") cdef unsigned int flags = 0x0 - self._timing_disabled = False - self._busy_waited = False - self._ipc_enabled = False + cdef bint timing_disabled = False + cdef bint busy_waited = False + cdef bint ipc_enabled = False self._ipc_descriptor = None if not opts.enable_timing: flags |= cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING - 
self._timing_disabled = True + timing_disabled = True if opts.busy_waited_sync: flags |= cydriver.CUevent_flags.CU_EVENT_BLOCKING_SYNC - self._busy_waited = True + busy_waited = True if opts.ipc_enabled: if is_free: raise TypeError( "IPC-enabled events must be bound; use Stream.record for creation." ) flags |= cydriver.CUevent_flags.CU_EVENT_INTERPROCESS - self._ipc_enabled = True - if not self._timing_disabled: + ipc_enabled = True + if not timing_disabled: raise TypeError("IPC-enabled events cannot use timing.") - # C++ creates the event and returns owning handle with context dependency - cdef EventHandle h_event = create_event_handle(h_context, flags) + cdef EventHandle h_event = create_event_handle( + h_context, flags, timing_disabled, busy_waited, ipc_enabled, device_id) if not h_event: raise RuntimeError("Failed to create CUDA event") self._h_event = h_event - self._h_context = h_context - self._device_id = device_id - if opts.ipc_enabled: + if ipc_enabled: self.get_ipc_descriptor() return self @staticmethod cdef Event _from_handle(EventHandle h_event): - """Create an Event wrapping an existing EventHandle.""" + """Create an Event wrapping an existing EventHandle. + + Metadata (timing, busy_waited, ipc, device_id) is read from the + EventBox via pointer arithmetic — no fields are cached on Event. 
+ """ cdef Event self = Event.__new__(Event) self._h_event = h_event - self._h_context = ContextHandle() - self._timing_disabled = True - self._busy_waited = False - self._ipc_enabled = False self._ipc_descriptor = None - self._device_id = -1 return self cpdef close(self): @@ -204,7 +206,7 @@ cdef class Event: with nogil: HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, as_cu(self._h_event))) cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) - self._ipc_descriptor = IPCEventDescriptor._init(data_b, self._busy_waited) + self._ipc_descriptor = IPCEventDescriptor._init(data_b, get_event_busy_waited(self._h_event)) return self._ipc_descriptor @classmethod @@ -213,33 +215,27 @@ cdef class Event: cdef cydriver.CUipcEventHandle data memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) - # IPC events: the originating process owns the event and its context - cdef EventHandle h_event = create_event_handle_ipc(data) + cdef EventHandle h_event = create_event_handle_ipc(data, ipc_descriptor._busy_waited) if not h_event: raise RuntimeError("Failed to open IPC event handle") self._h_event = h_event - self._h_context = ContextHandle() - self._timing_disabled = True - self._busy_waited = ipc_descriptor._busy_waited - self._ipc_enabled = True self._ipc_descriptor = ipc_descriptor - self._device_id = -1 return self @property def is_ipc_enabled(self) -> bool: """Return True if the event can be shared across process boundaries, otherwise False.""" - return self._ipc_enabled + return get_event_ipc_enabled(self._h_event) @property def is_timing_disabled(self) -> bool: """Return True if the event does not record timing data, otherwise False.""" - return self._timing_disabled + return get_event_timing_disabled(self._h_event) @property def is_sync_busy_waited(self) -> bool: """Return True if the event synchronization would keep the CPU busy-waiting, otherwise False.""" - return 
self._busy_waited + return get_event_busy_waited(self._h_event) def sync(self): """Synchronize until the event completes. @@ -287,15 +283,18 @@ cdef class Event: context is set current after a event is created. """ - if self._device_id >= 0: + cdef int dev_id = get_event_device_id(self._h_event) + if dev_id >= 0: from ._device import Device # avoid circular import - return Device(self._device_id) + return Device(dev_id) @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this event.""" - if self._h_context and self._device_id >= 0: - return Context._from_handle(Context, self._h_context, self._device_id) + cdef ContextHandle h_ctx = get_event_context(self._h_event) + cdef int dev_id = get_event_device_id(self._h_event) + if h_ctx and dev_id >= 0: + return Context._from_handle(Context, h_ctx, dev_id) cdef class IPCEventDescriptor: diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index f8497b61b1..5e4ae1789d 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -115,11 +115,21 @@ cdef StreamHandle get_legacy_stream() except+ nogil cdef StreamHandle get_per_thread_stream() except+ nogil # Event handles -cdef EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) except+ nogil +cdef EventHandle create_event_handle( + const ContextHandle& h_ctx, unsigned int flags, + bint timing_disabled, bint busy_waited, + bint ipc_enabled, int device_id) except+ nogil cdef EventHandle create_event_handle_noctx(unsigned int flags) except+ nogil cdef EventHandle create_event_handle_ref(cydriver.CUevent event) except+ nogil cdef EventHandle create_event_handle_ipc( - const cydriver.CUipcEventHandle& ipc_handle) except+ nogil + const cydriver.CUipcEventHandle& ipc_handle, bint busy_waited) except+ nogil + +# Event metadata getters +cdef bint get_event_timing_disabled(const EventHandle& h) noexcept nogil +cdef bint 
get_event_busy_waited(const EventHandle& h) noexcept nogil +cdef bint get_event_ipc_enabled(const EventHandle& h) noexcept nogil +cdef int get_event_device_id(const EventHandle& h) noexcept nogil +cdef ContextHandle get_event_context(const EventHandle& h) noexcept nogil # Memory pool handles cdef MemoryPoolHandle create_mempool_handle( diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 5e068a7f1e..928406f5cf 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -71,13 +71,27 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Event handles (note: _create_event_handle* are internal due to C++ overloading) EventHandle create_event_handle "cuda_core::create_event_handle" ( - const ContextHandle& h_ctx, unsigned int flags) except+ nogil + const ContextHandle& h_ctx, unsigned int flags, + bint timing_disabled, bint busy_waited, + bint ipc_enabled, int device_id) except+ nogil EventHandle create_event_handle_noctx "cuda_core::create_event_handle_noctx" ( unsigned int flags) except+ nogil EventHandle create_event_handle_ref "cuda_core::create_event_handle_ref" ( cydriver.CUevent event) except+ nogil EventHandle create_event_handle_ipc "cuda_core::create_event_handle_ipc" ( - const cydriver.CUipcEventHandle& ipc_handle) except+ nogil + const cydriver.CUipcEventHandle& ipc_handle, bint busy_waited) except+ nogil + + # Event metadata getters + bint get_event_timing_disabled "cuda_core::get_event_timing_disabled" ( + const EventHandle& h) noexcept nogil + bint get_event_busy_waited "cuda_core::get_event_busy_waited" ( + const EventHandle& h) noexcept nogil + bint get_event_ipc_enabled "cuda_core::get_event_ipc_enabled" ( + const EventHandle& h) noexcept nogil + int get_event_device_id "cuda_core::get_event_device_id" ( + const EventHandle& h) noexcept nogil + ContextHandle get_event_context "cuda_core::get_event_context" ( + const EventHandle& h) 
noexcept nogil # Memory pool handles MemoryPoolHandle create_mempool_handle "cuda_core::create_mempool_handle" ( From e7ebe534b0b97eb43139dfc5a78098c61dc53846 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 6 Mar 2026 11:35:57 -0800 Subject: [PATCH 20/23] Add HandleRegistry template and event reverse-lookup Introduce HandleRegistry class template for mapping raw CUDA handles back to their owning shared_ptr. create_event_handle_ref now checks the registry first, recovering full metadata when the event is already managed. Add tests verifying metadata preservation through reconstruction and GC. Made-with: Cursor --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 58 ++++++++++++++++++- .../tests/graph/test_explicit_lifetime.py | 46 +++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 69dea7e4df..e178596e51 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -163,6 +163,49 @@ class GILAcquireGuard { } // namespace +// ============================================================================ +// Handle reverse-lookup registry +// +// Maps raw CUDA handles (CUevent, CUkernel, etc.) back to their owning +// shared_ptr so that _ref constructors can recover full metadata. +// Uses weak_ptr to avoid preventing destruction. +// ============================================================================ + +template +class HandleRegistry { +public: + void register_handle(Key key, const Handle& h) { + std::lock_guard lock(mutex_); + map_[key] = h; + } + + void unregister_handle(Key key) noexcept { + try { + std::lock_guard lock(mutex_); + auto it = map_.find(key); + if (it != map_.end() && it->second.expired()) { + map_.erase(it); + } + } catch (...) 
{} + } + + Handle lookup(Key key) { + std::lock_guard lock(mutex_); + auto it = map_.find(key); + if (it != map_.end()) { + if (auto h = it->second.lock()) { + return h; + } + map_.erase(it); + } + return {}; + } + +private: + std::mutex mutex_; + std::unordered_map> map_; +}; + // ============================================================================ // Thread-local error handling // ============================================================================ @@ -356,6 +399,8 @@ ContextHandle get_event_context(const EventHandle& h) noexcept { return h ? get_box(h)->h_context : ContextHandle{}; } +static HandleRegistry event_registry; + EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags, bool timing_disabled, bool busy_waited, bool ipc_enabled, int device_id) { @@ -368,12 +413,15 @@ EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags, auto box = std::shared_ptr( new EventBox{event, timing_disabled, busy_waited, ipc_enabled, device_id, h_ctx}, [h_ctx](const EventBox* b) { + event_registry.unregister_handle(b->resource); GILReleaseGuard gil; p_cuEventDestroy(b->resource); delete b; } ); - return EventHandle(box, &box->resource); + EventHandle h(box, &box->resource); + event_registry.register_handle(event, h); + return h; } EventHandle create_event_handle_noctx(unsigned int flags) { @@ -381,6 +429,9 @@ EventHandle create_event_handle_noctx(unsigned int flags) { } EventHandle create_event_handle_ref(CUevent event) { + if (auto h = event_registry.lookup(event)) { + return h; + } auto box = std::make_shared(EventBox{event, true, false, false, -1, {}}); return EventHandle(box, &box->resource); } @@ -396,12 +447,15 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle, auto box = std::shared_ptr( new EventBox{event, true, busy_waited, true, -1, {}}, [](const EventBox* b) { + event_registry.unregister_handle(b->resource); GILReleaseGuard gil; p_cuEventDestroy(b->resource); delete b; } ); - 
return EventHandle(box, &box->resource); + EventHandle h(box, &box->resource); + event_registry.register_handle(event, h); + return h; } // ============================================================================ diff --git a/cuda_core/tests/graph/test_explicit_lifetime.py b/cuda_core/tests/graph/test_explicit_lifetime.py index 1dfbf34484..7fe574204d 100644 --- a/cuda_core/tests/graph/test_explicit_lifetime.py +++ b/cuda_core/tests/graph/test_explicit_lifetime.py @@ -198,6 +198,52 @@ def test_event_wait_node_keeps_event_alive(init_cuda): assert retrieved.is_done is True +def test_event_record_node_preserves_metadata(init_cuda): + """Reconstructed EventRecordNode recovers full Event metadata via reverse lookup.""" + dev = Device() + g = GraphDef() + + event = dev.create_event(EventOptions(enable_timing=True, busy_waited_sync=True)) + node = g.record_event(event) + + reconstructed = node.event + assert reconstructed.is_timing_disabled is False + assert reconstructed.is_sync_busy_waited is True + assert reconstructed.is_ipc_enabled is False + assert reconstructed.device is not None + + +def test_event_wait_node_preserves_metadata(init_cuda): + """Reconstructed EventWaitNode recovers full Event metadata via reverse lookup.""" + dev = Device() + g = GraphDef() + + event = dev.create_event(EventOptions(enable_timing=False)) + node = g.wait_event(event) + + reconstructed = node.event + assert reconstructed.is_timing_disabled is True + assert reconstructed.is_sync_busy_waited is False + assert reconstructed.device is not None + + +def test_event_metadata_survives_gc(init_cuda): + """Event metadata is preserved through reverse lookup even after original is GC'd.""" + dev = Device() + g = GraphDef() + + event = dev.create_event(EventOptions(enable_timing=True, busy_waited_sync=True)) + node = g.record_event(event) + + del event + gc.collect() + + retrieved = node.event + assert retrieved.is_timing_disabled is False + assert retrieved.is_sync_busy_waited is True + assert 
retrieved.is_done is True + + def test_event_survives_graph_instantiation_and_execution(init_cuda): """Graph with event nodes executes correctly after original Event is deleted.""" dev = Device() From f0bbf664eb0c4546945871096e921ec9be594f80 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 6 Mar 2026 12:23:04 -0800 Subject: [PATCH 21/23] Add HandleRegistry template, event reverse-lookup, refactor IPC cache Introduce HandleRegistry class template for mapping raw CUDA handles back to their owning shared_ptr. Event registry enables create_event_handle_ref to recover full metadata when the driver returns a CUevent we already manage. Refactor IPC pointer cache to use the same template with a separate mutex for atomic check-then-import. Add tests verifying event metadata preservation through reconstruction and GC. Made-with: Cursor --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 46 ++++++------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index e178596e51..bb0659d94d 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -171,15 +171,15 @@ class GILAcquireGuard { // Uses weak_ptr to avoid preventing destruction. // ============================================================================ -template +template> class HandleRegistry { public: - void register_handle(Key key, const Handle& h) { + void register_handle(const Key& key, const Handle& h) { std::lock_guard lock(mutex_); map_[key] = h; } - void unregister_handle(Key key) noexcept { + void unregister_handle(const Key& key) noexcept { try { std::lock_guard lock(mutex_); auto it = map_.find(key); @@ -189,7 +189,7 @@ class HandleRegistry { } catch (...) 
{} } - Handle lookup(Key key) { + Handle lookup(const Key& key) { std::lock_guard lock(mutex_); auto it = map_.find(key); if (it != map_.end()) { @@ -203,7 +203,7 @@ class HandleRegistry { private: std::mutex mutex_; - std::unordered_map> map_; + std::unordered_map, Hash> map_; }; // ============================================================================ @@ -762,61 +762,43 @@ struct ExportDataKeyHash { } -static std::mutex ipc_ptr_cache_mutex; -static std::unordered_map, ExportDataKeyHash> ipc_ptr_cache; +static HandleRegistry ipc_ptr_cache; +static std::mutex ipc_import_mutex; DevicePtrHandle deviceptr_import_ipc(const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) { auto data = const_cast( reinterpret_cast(export_data)); if (use_ipc_ptr_cache()) { - // Check cache before calling cuMemPoolImportPointer ExportDataKey key; std::memcpy(&key.data, data, sizeof(key.data)); - std::lock_guard lock(ipc_ptr_cache_mutex); + std::lock_guard lock(ipc_import_mutex); - auto it = ipc_ptr_cache.find(key); - if (it != ipc_ptr_cache.end()) { - if (auto box = it->second.lock()) { - // Cache hit - return existing handle - return DevicePtrHandle(box, &box->resource); - } - ipc_ptr_cache.erase(it); // Expired entry + if (auto h = ipc_ptr_cache.lookup(key)) { + return h; } - // Cache miss - import the pointer GILReleaseGuard gil; CUdeviceptr ptr; if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) { return {}; } - // Create new handle with cache-clearing deleter auto box = std::shared_ptr( new DevicePtrBox{ptr, h_stream}, [h_pool, key](DevicePtrBox* b) { + ipc_ptr_cache.unregister_handle(key); GILReleaseGuard gil; - try { - std::lock_guard lock(ipc_ptr_cache_mutex); - // Only erase if expired - avoids race where another thread - // replaced the entry with a new import before we acquired the lock. 
- auto it = ipc_ptr_cache.find(key); - if (it != ipc_ptr_cache.end() && it->second.expired()) { - ipc_ptr_cache.erase(it); - } - } catch (...) { - // Cache cleanup is best-effort - swallow exceptions in destructor context - } p_cuMemFreeAsync(b->resource, as_cu(b->h_stream)); delete b; } ); - ipc_ptr_cache[key] = box; - return DevicePtrHandle(box, &box->resource); + DevicePtrHandle h(box, &box->resource); + ipc_ptr_cache.register_handle(key, h); + return h; } else { - // No caching - simple handle creation GILReleaseGuard gil; CUdeviceptr ptr; if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) { From b830e6e20e12a5ac4567411111ad7c1ac2e661b8 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 6 Mar 2026 13:47:59 -0800 Subject: [PATCH 22/23] Add kernel reverse-lookup registry, fix create_kernel_handle_ref semantics Restructure create_kernel_handle to register directly in a HandleRegistry, and simplify create_kernel_handle_ref to lookup-or-ref (dropping the LibraryHandle parameter). Add get_kernel_library accessor for KernelBox metadata. Kernel.from_handle now recovers the owning handle automatically for cuda.core-created kernels, cross-checks caller-supplied ObjectCode on mismatch, and retains foreign ObjectCode via _keepalive. Rename Kernel._from_obj to _from_handle for consistency with the project. 
Made-with: Cursor --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 25 +++++++-- cuda_core/cuda/core/_cpp/resource_handles.hpp | 11 ++-- cuda_core/cuda/core/_graph/_graphdef.pyx | 6 +-- cuda_core/cuda/core/_module.pxd | 3 +- cuda_core/cuda/core/_module.pyx | 53 +++++++++---------- cuda_core/cuda/core/_resource_handles.pxd | 4 +- cuda_core/cuda/core/_resource_handles.pyx | 4 +- .../tests/graph/test_explicit_lifetime.py | 52 +++++++++++++++++- cuda_core/tests/test_module.py | 38 +++++++++++++ 9 files changed, 153 insertions(+), 43 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index bb0659d94d..e7339122ac 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -877,10 +877,12 @@ LibraryHandle create_library_handle_ref(CUlibrary library) { namespace { struct KernelBox { CUkernel resource; - LibraryHandle h_library; // Keeps library alive + LibraryHandle h_library; }; } // namespace +static HandleRegistry kernel_registry; + KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name) { GILReleaseGuard gil; CUkernel kernel; @@ -888,14 +890,29 @@ KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* na return {}; } - return create_kernel_handle_ref(kernel, h_library); + auto box = std::make_shared(KernelBox{kernel, h_library}); + KernelHandle h(box, &box->resource); + kernel_registry.register_handle(kernel, h); + return h; } -KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_library) { - auto box = std::make_shared(KernelBox{kernel, h_library}); +KernelHandle create_kernel_handle_ref(CUkernel kernel) { + if (auto h = kernel_registry.lookup(kernel)) { + return h; + } + auto box = std::make_shared(KernelBox{kernel, {}}); return KernelHandle(box, &box->resource); } +LibraryHandle get_kernel_library(const KernelHandle& h) noexcept { + if (!h) return {}; + const CUkernel* 
p = h.get(); + auto* box = reinterpret_cast( + reinterpret_cast(p) - offsetof(KernelBox, resource) + ); + return box->h_library; +} + // ============================================================================ // Graph Handles // ============================================================================ diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 9015ef9065..c306345b17 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -367,9 +367,14 @@ LibraryHandle create_library_handle_ref(CUlibrary library); // Returns empty handle on error (caller must check). KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name); -// Create a non-owning kernel handle with library dependency. -// Use for borrowed kernels. The library handle keeps the library alive. -KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_library); +// Create a kernel handle from a raw CUkernel. +// If the kernel is already managed (in the registry), returns the owning +// handle with library dependency. Otherwise returns a non-owning ref. +KernelHandle create_kernel_handle_ref(CUkernel kernel); + +// Get the library handle associated with a kernel (from KernelBox). +// Returns empty handle if the kernel has no library dependency. 
+LibraryHandle get_kernel_library(const KernelHandle& h) noexcept; // ============================================================================ // Graph handle functions diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 54f25dbce5..053e39e886 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -49,7 +49,6 @@ from cuda.core._resource_handles cimport ( EventHandle, GraphHandle, KernelHandle, - LibraryHandle, NodeHandle, as_cu, as_intptr, @@ -1431,8 +1430,7 @@ cdef class KernelNode(Node): cdef cydriver.CUDA_KERNEL_NODE_PARAMS params with nogil: HANDLE_RETURN(cydriver.cuGraphKernelNodeGetParams(node, ¶ms)) - cdef LibraryHandle empty_lib - cdef KernelHandle h_kernel = create_kernel_handle_ref(params.kern, empty_lib) + cdef KernelHandle h_kernel = create_kernel_handle_ref(params.kern) return KernelNode._create_with_params( h_node, (params.gridDimX, params.gridDimY, params.gridDimZ), @@ -1461,7 +1459,7 @@ cdef class KernelNode(Node): @property def kernel(self) -> Kernel: """The Kernel object for this launch node.""" - return Kernel._from_obj(self._h_kernel) + return Kernel._from_handle(self._h_kernel) @property def config(self) -> LaunchConfig: diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index 9468de3dff..1d3a0772c3 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -16,10 +16,11 @@ cdef class Kernel: KernelHandle _h_kernel KernelAttributes _attributes # lazy KernelOccupancy _occupancy # lazy + object _keepalive object __weakref__ @staticmethod - cdef Kernel _from_obj(KernelHandle h_kernel) + cdef Kernel _from_handle(KernelHandle h_kernel) cdef tuple _get_arguments_info(self, bint param_info=*) diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index ca5562f990..f34b24c096 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -22,6 +22,7 
@@ from cuda.core._resource_handles cimport ( create_library_handle_ref, create_kernel_handle, create_kernel_handle_ref, + get_kernel_library, get_last_error, as_cu, as_py, @@ -493,7 +494,7 @@ cdef class Kernel: raise RuntimeError("Kernel objects cannot be instantiated directly. Please use ObjectCode APIs.") @staticmethod - cdef Kernel _from_obj(KernelHandle h_kernel): + cdef Kernel _from_handle(KernelHandle h_kernel): cdef Kernel ker = Kernel.__new__(Kernel) ker._h_kernel = h_kernel ker._attributes = None @@ -567,9 +568,7 @@ cdef class Kernel: @staticmethod def from_handle(handle, mod: ObjectCode = None) -> Kernel: - """Creates a new :obj:`Kernel` object from a foreign kernel handle. - - Uses a CUkernel pointer address to create a new :obj:`Kernel` object. + """Creates a new :obj:`Kernel` object from a kernel handle. Parameters ---------- @@ -577,37 +576,37 @@ cdef class Kernel: Kernel handle representing the address of a foreign kernel object (CUkernel). mod : :obj:`ObjectCode`, optional - The ObjectCode object associated with this kernel. If not provided, - a placeholder ObjectCode will be created. Note that without a proper - ObjectCode, certain operations may be limited. + The ObjectCode object associated with this kernel. Provides + library lifetime for foreign kernels not created by + cuda.core. 
""" - # Validate that handle is an integer if not isinstance(handle, int): raise TypeError(f"handle must be an integer, got {type(handle).__name__}") - # Convert the integer handle to CUkernel cdef cydriver.CUkernel cu_kernel = handle - cdef KernelHandle h_kernel - cdef cydriver.CUlibrary cu_library - cdef cydriver.CUresult err - - # If no module provided, create a placeholder and try to get the library - if mod is None: - mod = ObjectCode._init(b"", "cubin") - if _is_cukernel_get_library_supported(): - # Try to get the owning library via cuKernelGetLibrary - with nogil: - err = cydriver.cuKernelGetLibrary(&cu_library, cu_kernel) - if err == cydriver.CUDA_SUCCESS: - mod._h_library = create_library_handle_ref(cu_library) - - # Create kernel handle with library dependency - h_kernel = create_kernel_handle_ref(cu_kernel, mod._h_library) + cdef KernelHandle h_kernel = create_kernel_handle_ref(cu_kernel) if not h_kernel: HANDLE_RETURN(get_last_error()) - return Kernel._from_obj(h_kernel) + cdef LibraryHandle h_existing_lib = get_kernel_library(h_kernel) + cdef LibraryHandle h_caller_lib + + if mod is not None: + h_caller_lib = (mod)._h_library + if h_existing_lib and h_caller_lib: + if as_cu(h_existing_lib) != as_cu(h_caller_lib): + import warnings + warnings.warn( + "The library from the provided ObjectCode does not match " + "the library associated with this kernel.", + stacklevel=2, + ) + + cdef Kernel k = Kernel._from_handle(h_kernel) + if mod is not None and not h_existing_lib: + k._keepalive = mod + return k def __eq__(self, other) -> bool: if not isinstance(other, Kernel): @@ -825,7 +824,7 @@ cdef class ObjectCode: cdef KernelHandle h_kernel = create_kernel_handle(self._h_library, name) if not h_kernel: HANDLE_RETURN(get_last_error()) - return Kernel._from_obj(h_kernel) + return Kernel._from_handle(h_kernel) @property def code(self) -> CodeTypeT: diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 
5e4ae1789d..7eca9d1221 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -168,8 +168,8 @@ cdef LibraryHandle create_library_handle_ref(cydriver.CUlibrary library) except+ # Kernel handles cdef KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name) except+ nogil -cdef KernelHandle create_kernel_handle_ref( - cydriver.CUkernel kernel, const LibraryHandle& h_library) except+ nogil +cdef KernelHandle create_kernel_handle_ref(cydriver.CUkernel kernel) except+ nogil +cdef LibraryHandle get_kernel_library(const KernelHandle& h) noexcept nogil # Graph handles cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 928406f5cf..be8955ce92 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -143,7 +143,9 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": KernelHandle create_kernel_handle "cuda_core::create_kernel_handle" ( const LibraryHandle& h_library, const char* name) except+ nogil KernelHandle create_kernel_handle_ref "cuda_core::create_kernel_handle_ref" ( - cydriver.CUkernel kernel, const LibraryHandle& h_library) except+ nogil + cydriver.CUkernel kernel) except+ nogil + LibraryHandle get_kernel_library "cuda_core::get_kernel_library" ( + const KernelHandle& h) noexcept nogil # Graph handles GraphHandle create_graph_handle "cuda_core::create_graph_handle" ( diff --git a/cuda_core/tests/graph/test_explicit_lifetime.py b/cuda_core/tests/graph/test_explicit_lifetime.py index 7fe574204d..d99590e355 100644 --- a/cuda_core/tests/graph/test_explicit_lifetime.py +++ b/cuda_core/tests/graph/test_explicit_lifetime.py @@ -14,11 +14,12 @@ from helpers.graph_kernels import compile_common_kernels from helpers.misc import try_create_condition -from cuda.core import Device, EventOptions, LaunchConfig +from 
cuda.core import Device, EventOptions, Kernel, LaunchConfig from cuda.core._graph._graphdef import ( ChildGraphNode, ConditionalNode, GraphDef, + KernelNode, ) @@ -359,3 +360,52 @@ def test_kernel_survives_graph_clone_and_execution(init_cuda): driver.cuGraphLaunch( graph_exec, driver.CUstream(int(stream.handle)))) stream.sync() + + +# ============================================================================= +# Kernel handle recovery — from_handle and graph node reconstruction +# ============================================================================= + + +def test_kernel_from_handle_recovers_library(init_cuda): + """Kernel.from_handle on a cuda.core-created kernel recovers the library + dependency, keeping it alive after the original objects are deleted.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + handle = int(kernel.handle) + + reconstructed = Kernel.from_handle(handle) + + del kernel, mod + gc.collect() + + assert reconstructed.attributes.max_threads_per_block() > 0 + + +def test_kernel_node_reconstruction_preserves_validity(init_cuda): + """A KernelNode reconstructed via DAG traversal has a valid kernel, + kept alive by user objects and existing node references.""" + mod = compile_common_kernels() + kernel = mod.get_kernel("empty_kernel") + config = LaunchConfig(grid=1, block=1) + + g = GraphDef() + kernel_node = g.launch(config, kernel) + # Chain a second node so we can reconstruct the kernel node via pred + event = Device().create_event() + successor = kernel_node.record_event(event) + + del kernel, mod + gc.collect() + + # Reconstruct the kernel node through DAG traversal + # successor.pred -> Node._create -> KernelNode._create_from_driver + # -> create_kernel_handle_ref -> handle recovery + reconstructed = successor.pred[0] + assert isinstance(reconstructed, KernelNode) + assert reconstructed.kernel.attributes.max_threads_per_block() > 0 + + graph = g.instantiate() + stream = Device().create_stream() + 
graph.launch(stream) + stream.sync() diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index e74b1fc672..cc88f6b19a 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -511,6 +511,44 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): assert int(kernel1.handle) == int(kernel2.handle) == int(kernel3.handle) == handle +def test_kernel_from_handle_library_mismatch_warning(init_cuda): + """Kernel.from_handle warns when caller-supplied module differs from the kernel's library.""" + prog1 = Program(SAXPY_KERNEL, code_type="c++") + mod1 = prog1.compile("cubin", name_expressions=("saxpy",)) + kernel = mod1.get_kernel("saxpy") + handle = int(kernel.handle) + + prog2 = Program(SAXPY_KERNEL, code_type="c++") + mod2 = prog2.compile("cubin", name_expressions=("saxpy",)) + mod2.get_kernel("saxpy") + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + k = Kernel.from_handle(handle, mod2) + assert len(w) == 1 + assert "does not match" in str(w[0].message) + + assert k.attributes.max_threads_per_block() > 0 + + +def test_kernel_from_handle_foreign_kernel(init_cuda): + """Kernel.from_handle with a driver-level kernel not created by cuda.core.""" + prog = Program(SAXPY_KERNEL, code_type="c++") + mod = prog.compile("cubin", name_expressions=("saxpy",)) + cubin = mod.code + sym_map = mod.symbol_mapping + + cu_lib = handle_return( + driver.cuLibraryLoadData(cubin, [], [], 0, [], [], 0)) + mangled = sym_map["saxpy"] + cu_kernel = handle_return( + driver.cuLibraryGetKernel(cu_lib, mangled)) + handle = int(cu_kernel) + + k = Kernel.from_handle(handle) + assert k.attributes.max_threads_per_block() > 0 + + def test_kernel_keeps_library_alive(init_cuda): """Test that a Kernel keeps its underlying library alive after ObjectCode goes out of scope.""" import gc From b55782a613497dfb85d53333371bfd51559465c0 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 6 Mar 2026 14:10:33 
-0800 Subject: [PATCH 23/23] Rename NodeHandle to GraphNodeHandle for consistency with driver terminology Avoids ambiguity with potential future node types in other domains. Renames NodeBox, create_node_handle, and node_get_graph accordingly. Made-with: Cursor --- .github/actions/fetch_ctk/action.yml | 2 +- .github/workflows/bandit.yml | 2 +- .github/workflows/build-wheel.yml | 2 +- .github/workflows/codeql.yml | 4 +- .github/workflows/coverage.yml | 3 +- .github/workflows/test-wheel-linux.yml | 2 +- .github/workflows/test-wheel-windows.yml | 2 +- .../0_Introduction/clock_nvrtc_test.py | 92 ++--- .../simpleCubemapTexture_test.py | 180 ++++----- .../examples/0_Introduction/simpleP2P_test.py | 168 +++++---- .../0_Introduction/simpleZeroCopy_test.py | 122 +++--- .../0_Introduction/systemWideAtomics_test.py | 98 ++--- .../0_Introduction/vectorAddDrv_test.py | 136 +++---- .../0_Introduction/vectorAddMMAP_test.py | 210 +++++------ .../streamOrderedAllocation_test.py | 178 ++++----- .../globalToShmemAsyncCopy_test.py | 352 +++++++++--------- .../3_CUDA_Features/simpleCudaGraphs_test.py | 328 ++++++++-------- .../conjugateGradientMultiBlockCG_test.py | 270 +++++++------- cuda_bindings/examples/common/common.py | 78 ++-- cuda_bindings/examples/common/helper_cuda.py | 34 +- .../examples/common/helper_string.py | 8 +- .../examples/extra/isoFDModelling_test.py | 326 ++++++++-------- .../examples/extra/jit_program_test.py | 134 ++++--- cuda_core/cuda/core/_cpp/resource_handles.cpp | 29 +- cuda_core/cuda/core/_cpp/resource_handles.hpp | 17 +- cuda_core/cuda/core/_graph/_graphdef.pxd | 44 +-- cuda_core/cuda/core/_graph/_graphdef.pyx | 111 +++--- cuda_core/cuda/core/_module.pyx | 1 - cuda_core/cuda/core/_resource_handles.pxd | 11 +- cuda_core/cuda/core/_resource_handles.pyx | 6 +- cuda_core/examples/cuda_graphs.py | 6 +- cuda_core/examples/gl_interop_plasma.py | 4 +- cuda_core/examples/pytorch_example.py | 16 +- cuda_core/examples/saxpy.py | 30 +- 
.../examples/simple_multi_gpu_example.py | 14 +- cuda_core/examples/strided_memory_view_gpu.py | 16 +- cuda_core/examples/thread_block_cluster.py | 4 +- cuda_core/examples/vector_add.py | 14 +- .../tests/graph/test_explicit_integration.py | 94 ++--- .../tests/graph/test_explicit_lifetime.py | 17 +- cuda_core/tests/test_module.py | 6 +- .../_dynamic_libs/descriptor_catalog.py | 23 ++ .../_dynamic_libs/search_platform.py | 12 +- .../_headers/find_nvidia_headers.py | 36 ++ cuda_pathfinder/docs/nv-versions.json | 4 + .../docs/source/release/1.4.1-notes.rst | 49 +++ cuda_pathfinder/pyproject.toml | 4 +- .../tests/test_find_nvidia_headers.py | 126 ++++++- ...st_load_nvidia_dynamic_lib_using_mocker.py | 173 +++++++++ ruff.toml | 4 +- 50 files changed, 2006 insertions(+), 1596 deletions(-) create mode 100644 cuda_pathfinder/docs/source/release/1.4.1-notes.rst create mode 100644 cuda_pathfinder/tests/test_load_nvidia_dynamic_lib_using_mocker.py diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 001e3a84d8..e938fcc5b3 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -14,7 +14,7 @@ inputs: cuda-components: description: "A list of the CTK components to install as a comma-separated list. e.g. 
'cuda_nvcc,cuda_nvrtc,cuda_cudart'" required: false - default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,libnvfatbin" + default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin" cuda-path: description: "where the CTK components will be installed to, relative to $PWD" required: false diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml index b7ed18b696..7ecbcdd1a1 100644 --- a/.github/workflows/bandit.yml +++ b/.github/workflows/bandit.yml @@ -42,6 +42,6 @@ jobs: with: args: "check --select S --ignore ${{ steps.ignore-codes.outputs.codes }} --output-format sarif --output-file results.sarif" - name: Upload SARIF file - uses: github/codeql-action/upload-sarif@v4.32.4 + uses: github/codeql-action/upload-sarif@v4.32.5 with: sarif_file: results.sarif diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index dd2ede5c67..2a227d4ee9 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -369,7 +369,7 @@ jobs: OLD_BRANCH=$(yq '.backport_branch' ci/versions.yml) OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*" - LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId') + LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId') if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then echo "LATEST_PRIOR_RUN_ID not found!" 
exit 1 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 8f02dcbd6a..eea2466f7d 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -31,13 +31,13 @@ jobs: uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Initialize CodeQL - uses: github/codeql-action/init@0ec47d036c68ae0cf94c629009b1029407111281 # v3.31.8 + uses: github/codeql-action/init@40f0fa95c41fede7b43f035cb47aac899ee0ba0a # v3.31.8 with: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} queries: security-extended - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@0ec47d036c68ae0cf94c629009b1029407111281 # v3.31.8 + uses: github/codeql-action/analyze@40f0fa95c41fede7b43f035cb47aac899ee0ba0a # v3.31.8 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 765b623b3a..e65439a77e 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -98,7 +98,8 @@ jobs: - name: Build cuda-pathfinder run: | - .venv/bin/pip install -v ./cuda_pathfinder --group test + cd cuda_pathfinder + ../.venv/bin/pip install -v . 
--group test - name: Build cuda-bindings run: | diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 3c80128bb1..c5061a16eb 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -151,7 +151,7 @@ jobs: OLD_BRANCH=${{ needs.compute-matrix.outputs.OLD_BRANCH }} OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*" - LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId') + LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId') if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then echo "LATEST_PRIOR_RUN_ID not found!" exit 1 diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index dc9a31719f..478826c525 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -137,7 +137,7 @@ jobs: run: | $OLD_BRANCH = yq '.backport_branch' ci/versions.yml $OLD_BASENAME = "cuda-bindings-python${env:PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*" - $runData = gh run list -b $OLD_BRANCH -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | ConvertFrom-Json + $runData = gh run list -b $OLD_BRANCH -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | ConvertFrom-Json if (-not $runData -or $runData.Length -eq 0 -or -not $runData[0].databaseId -or [string]::IsNullOrEmpty($runData[0].databaseId)) { Write-Host "LATEST_PRIOR_RUN_ID not found!" 
exit 1 diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index dc1084bea8..d67f180fe0 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -5,7 +5,7 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda @@ -50,8 +50,8 @@ } """ -NUM_BLOCKS = 64 -NUM_THREADS = 256 +num_blocks = 64 +num_threads = 256 def elems_to_bytes(nelems, dt): @@ -64,52 +64,52 @@ def main(): if platform.machine() == "armv7l": pytest.skip("clock_nvrtc is not supported on ARMv7") - timer = np.empty(NUM_BLOCKS * 2, dtype="int64") - hinput = np.empty(NUM_THREADS * 2, dtype="float32") + timer = np.empty(num_blocks * 2, dtype="int64") + hinput = np.empty(num_threads * 2, dtype="float32") - for i in range(NUM_THREADS * 2): + for i in range(num_threads * 2): hinput[i] = i - devID = findCudaDevice() - with common.KernelHelper(clock_nvrtc, devID) as kernelHelper: - kernel_addr = kernelHelper.getFunction(b"timedReduction") - - dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes)) - doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32))) - dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes)) - checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes)) - - args = ((dinput, doutput, dtimer), (None, None, None)) - shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32) - - grid_dims = (NUM_BLOCKS, 1, 1) - block_dims = (NUM_THREADS, 1, 1) - - checkCudaErrors( - cuda.cuLaunchKernel( - kernel_addr, - *grid_dims, # grid dim - *block_dims, # block dim - shared_memory_nbytes, - 0, # shared mem, stream - args, - 0, - ) - ) # arguments - - checkCudaErrors(cuda.cuCtxSynchronize()) - checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, 
timer.nbytes)) - checkCudaErrors(cuda.cuMemFree(dinput)) - checkCudaErrors(cuda.cuMemFree(doutput)) - checkCudaErrors(cuda.cuMemFree(dtimer)) - - avgElapsedClocks = 0.0 - - for i in range(NUM_BLOCKS): - avgElapsedClocks += timer[i + NUM_BLOCKS] - timer[i] - - avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS - print(f"Average clocks/block = {avgElapsedClocks}") + dev_id = find_cuda_device() + kernel_helper = common.KernelHelper(clock_nvrtc, dev_id) + kernel_addr = kernel_helper.get_function(b"timedReduction") + + dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes)) + doutput = check_cuda_errors(cuda.cuMemAlloc(elems_to_bytes(num_blocks, np.float32))) + dtimer = check_cuda_errors(cuda.cuMemAlloc(timer.nbytes)) + check_cuda_errors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes)) + + args = ((dinput, doutput, dtimer), (None, None, None)) + shared_memory_nbytes = elems_to_bytes(2 * num_threads, np.float32) + + grid_dims = (num_blocks, 1, 1) + block_dims = (num_threads, 1, 1) + + check_cuda_errors( + cuda.cuLaunchKernel( + kernel_addr, + *grid_dims, # grid dim + *block_dims, # block dim + shared_memory_nbytes, + 0, # shared mem, stream + args, + 0, + ) + ) # arguments + + check_cuda_errors(cuda.cuCtxSynchronize()) + check_cuda_errors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes)) + check_cuda_errors(cuda.cuMemFree(dinput)) + check_cuda_errors(cuda.cuMemFree(doutput)) + check_cuda_errors(cuda.cuMemFree(dtimer)) + + avg_elapsed_clocks = 0.0 + + for i in range(num_blocks): + avg_elapsed_clocks += timer[i + num_blocks] - timer[i] + + avg_elapsed_clocks = avg_elapsed_clocks / num_blocks + print(f"Average clocks/block = {avg_elapsed_clocks}") if __name__ == "__main__": diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index 75f1b0800d..5d764509ce 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ 
b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -7,12 +7,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -simpleCubemapTexture = """\ +simple_cubemap_texture = """\ extern "C" __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex) { @@ -83,14 +83,14 @@ def main(): # Use command-line specified CUDA device, otherwise use device with highest Gflops/s - devID = findCudaDevice() + dev_id = find_cuda_device() # Get number of SMs on this GPU - deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID)) + device_props = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) print( - f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}" + f"CUDA device [{device_props.name}] has {device_props.multiProcessorCount} Multi-Processors SM {device_props.major}.{device_props.minor}" ) - if deviceProps.major < 2: + if device_props.major < 2: import pytest pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.") @@ -107,15 +107,15 @@ def main(): h_data_ref = np.repeat(np.arange(num_layers, dtype=h_data.dtype), cubemap_size) - h_data # Allocate device memory for result - d_data = checkCudaErrors(cudart.cudaMalloc(size)) + d_data = check_cuda_errors(cudart.cudaMalloc(size)) # Allocate array and copy image data - channelDesc = checkCudaErrors( + channel_desc = check_cuda_errors( cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat) ) - cu_3darray = checkCudaErrors( + cu_3darray = check_cuda_errors( cudart.cudaMalloc3DArray( - channelDesc, + channel_desc, cudart.make_cudaExtent(width, width, num_faces), cudart.cudaArrayCubemap, ) @@ -128,90 +128,90 @@ def main(): 
myparms.dstArray = cu_3darray myparms.extent = cudart.make_cudaExtent(width, width, num_faces) myparms.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - checkCudaErrors(cudart.cudaMemcpy3D(myparms)) - - texRes = cudart.cudaResourceDesc() - texRes.resType = cudart.cudaResourceType.cudaResourceTypeArray - texRes.res.array.array = cu_3darray - - texDescr = cudart.cudaTextureDesc() - texDescr.normalizedCoords = True - texDescr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear - texDescr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap - texDescr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap - texDescr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap - texDescr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType - - tex = checkCudaErrors(cudart.cudaCreateTextureObject(texRes, texDescr, None)) - dimBlock = cudart.dim3() - dimBlock.x = 8 - dimBlock.y = 8 - dimBlock.z = 1 - dimGrid = cudart.dim3() - dimGrid.x = width / dimBlock.x - dimGrid.y = width / dimBlock.y - dimGrid.z = 1 + check_cuda_errors(cudart.cudaMemcpy3D(myparms)) + + tex_res = cudart.cudaResourceDesc() + tex_res.resType = cudart.cudaResourceType.cudaResourceTypeArray + tex_res.res.array.array = cu_3darray + + tex_descr = cudart.cudaTextureDesc() + tex_descr.normalizedCoords = True + tex_descr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear + tex_descr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap + tex_descr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap + tex_descr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap + tex_descr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType + + tex = check_cuda_errors(cudart.cudaCreateTextureObject(tex_res, tex_descr, None)) + dim_block = cudart.dim3() + dim_block.x = 8 + dim_block.y = 8 + dim_block.z = 1 + dim_grid = cudart.dim3() + dim_grid.x = width / dim_block.x + dim_grid.y = width / dim_block.y + 
dim_grid.z = 1 print( - f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads" + f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dim_grid.x} x {dim_grid.y}, each block has 8 x 8 threads" ) - with common.KernelHelper(simpleCubemapTexture, devID) as kernelHelper: - _transformKernel = kernelHelper.getFunction(b"transformKernel") - kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None)) - checkCudaErrors( - cuda.cuLaunchKernel( - _transformKernel, - dimGrid.x, - dimGrid.y, - dimGrid.z, # grid dim - dimBlock.x, - dimBlock.y, - dimBlock.z, # block dim - 0, - 0, # shared mem and stream - kernelArgs, - 0, - ) - ) # arguments - - checkCudaErrors(cudart.cudaDeviceSynchronize()) - - start = time.time() - - # Execute the kernel - checkCudaErrors( - cuda.cuLaunchKernel( - _transformKernel, - dimGrid.x, - dimGrid.y, - dimGrid.z, # grid dim - dimBlock.x, - dimBlock.y, - dimBlock.z, # block dim - 0, - 0, # shared mem and stream - kernelArgs, - 0, - ) - ) # arguments - - checkCudaErrors(cudart.cudaDeviceSynchronize()) - stop = time.time() - print(f"Processing time: {stop - start:.3f} msec") - print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec") - - # Allocate mem for the result on host side - h_odata = np.empty_like(h_data) - # Copy result from device to host - checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)) - - checkCudaErrors(cudart.cudaDestroyTextureObject(tex)) - checkCudaErrors(cudart.cudaFree(d_data)) - checkCudaErrors(cudart.cudaFreeArray(cu_3darray)) - - MIN_EPSILON_ERROR = 5.0e-3 - if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR: + kernel_helper = common.KernelHelper(simple_cubemap_texture, dev_id) + _transform_kernel = kernel_helper.get_function(b"transformKernel") + kernel_args = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None)) + 
check_cuda_errors( + cuda.cuLaunchKernel( + _transform_kernel, + dim_grid.x, + dim_grid.y, + dim_grid.z, # grid dim + dim_block.x, + dim_block.y, + dim_block.z, # block dim + 0, + 0, # shared mem and stream + kernel_args, + 0, + ) + ) # arguments + + check_cuda_errors(cudart.cudaDeviceSynchronize()) + + start = time.time() + + # Execute the kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _transform_kernel, + dim_grid.x, + dim_grid.y, + dim_grid.z, # grid dim + dim_block.x, + dim_block.y, + dim_block.z, # block dim + 0, + 0, # shared mem and stream + kernel_args, + 0, + ) + ) # arguments + + check_cuda_errors(cudart.cudaDeviceSynchronize()) + stop = time.time() + print(f"Processing time: {stop - start:.3f} msec") + print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec") + + # Allocate mem for the result on host side + h_odata = np.empty_like(h_data) + # Copy result from device to host + check_cuda_errors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)) + + check_cuda_errors(cudart.cudaDestroyTextureObject(tex)) + check_cuda_errors(cudart.cudaFree(d_data)) + check_cuda_errors(cudart.cudaFreeArray(cu_3darray)) + + min_epsilon_error = 5.0e-3 + if np.max(np.abs(h_odata - h_data_ref)) > min_epsilon_error: print("Failed", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index a60dbac5bc..09dafa1be1 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -7,7 +7,7 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors +from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart @@ -41,24 +41,24 @@ def main(): # Number of GPUs print("Checking for multiple GPUs...") - gpu_n = 
checkCudaErrors(cudart.cudaGetDeviceCount()) + gpu_n = check_cuda_errors(cudart.cudaGetDeviceCount()) print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: pytest.skip("Two or more GPUs with Peer-to-Peer access capability are required") - prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] + prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access print("\nChecking GPU(s) for support of peer to peer memory access...") - p2pCapableGPUs = [-1, -1] + p2p_capable_gp_us = [-1, -1] for i in range(gpu_n): - p2pCapableGPUs[0] = i + p2p_capable_gp_us[0] = i for j in range(gpu_n): if i == j: continue - i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j)) - j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i)) + i_access_j = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(i, j)) + j_access_i = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(j, i)) print( "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No" @@ -70,54 +70,54 @@ def main(): ) ) if i_access_j and j_access_i: - p2pCapableGPUs[1] = j + p2p_capable_gp_us[1] = j break - if p2pCapableGPUs[1] != -1: + if p2p_capable_gp_us[1] != -1: break - if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1: + if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1: pytest.skip("Peer to Peer access is not available amongst GPUs in the system") # Use first pair of p2p capable GPUs detected - gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]] + gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]] # Enable peer access print(f"Enabling peer access between GPU{gpuid[0]} and GPU{gpuid[1]}...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[0], 0)) + 
check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + check_cuda_errors(cudart.cudaDeviceEnablePeerAccess(gpuid[0], 0)) # Allocate buffers buf_size = 1024 * 1024 * 16 * np.dtype(np.float32).itemsize print(f"Allocating buffers ({int(buf_size / 1024 / 1024)}MB on GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - g0 = checkCudaErrors(cudart.cudaMalloc(buf_size)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - g1 = checkCudaErrors(cudart.cudaMalloc(buf_size)) - h0 = checkCudaErrors(cudart.cudaMallocHost(buf_size)) # Automatically portable with UVA + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + g0 = check_cuda_errors(cudart.cudaMalloc(buf_size)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + g1 = check_cuda_errors(cudart.cudaMalloc(buf_size)) + h0 = check_cuda_errors(cudart.cudaMallocHost(buf_size)) # Automatically portable with UVA # Create CUDA event handles print("Creating event handles...") eventflags = cudart.cudaEventBlockingSync - start_event = checkCudaErrors(cudart.cudaEventCreateWithFlags(eventflags)) - stop_event = checkCudaErrors(cudart.cudaEventCreateWithFlags(eventflags)) + start_event = check_cuda_errors(cudart.cudaEventCreateWithFlags(eventflags)) + stop_event = check_cuda_errors(cudart.cudaEventCreateWithFlags(eventflags)) # P2P memcopy() benchmark - checkCudaErrors(cudart.cudaEventRecord(start_event, cudart.cudaStream_t(0))) + check_cuda_errors(cudart.cudaEventRecord(start_event, cudart.cudaStream_t(0))) for i in range(100): # With UVA we don't need to specify source and target devices, the # runtime figures this out by itself from the pointers # Ping-pong copy between GPUs if i % 2 == 0: - checkCudaErrors(cudart.cudaMemcpy(g1, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + check_cuda_errors(cudart.cudaMemcpy(g1, g0, buf_size, 
cudart.cudaMemcpyKind.cudaMemcpyDefault)) else: - checkCudaErrors(cudart.cudaMemcpy(g0, g1, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + check_cuda_errors(cudart.cudaMemcpy(g0, g1, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) - checkCudaErrors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0))) - checkCudaErrors(cudart.cudaEventSynchronize(stop_event)) - time_memcpy = checkCudaErrors(cudart.cudaEventElapsedTime(start_event, stop_event)) + check_cuda_errors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0))) + check_cuda_errors(cudart.cudaEventSynchronize(stop_event)) + time_memcpy = check_cuda_errors(cudart.cudaEventElapsedTime(start_event, stop_event)) print( f"cudaMemcpyPeer / cudaMemcpy between GPU{gpuid[0]} and GPU{gpuid[1]}: {(1.0 / (time_memcpy / 1000.0)) * (100.0 * buf_size) / 1024.0 / 1024.0 / 1024.0:.2f}GB/s" ) @@ -129,8 +129,8 @@ def main(): for i in range(int(buf_size / np.dtype(np.float32).itemsize)): h0_local[i] = i % 4096 - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaMemcpy(g0, h0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaMemcpy(g0, h0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) # Kernel launch configuration threads = cudart.dim3() @@ -145,57 +145,61 @@ def main(): # Run kernel on GPU 1, reading input from the GPU 0 buffer, writing # output to the GPU 1 buffer print(f"Run kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - - with common.KernelHelper(simplep2p, gpuid[1]) as kernelHelper: - simple_kernel_1 = kernelHelper.getFunction(b"SimpleKernel") - kernel_args_1 = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p)) - checkCudaErrors( - cuda.cuLaunchKernel( - simple_kernel_1, - blocks.x, - blocks.y, - blocks.z, - threads.x, - threads.y, - threads.z, - 0, - 0, - kernel_args_1, - 0, - ) + 
check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + + kernel_helper = [None] * 2 + _simple_kernel = [None] * 2 + kernel_args = [None] * 2 + + kernel_helper[1] = common.KernelHelper(simplep2p, gpuid[1]) + _simple_kernel[1] = kernel_helper[1].get_function(b"SimpleKernel") + kernel_args[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p)) + check_cuda_errors( + cuda.cuLaunchKernel( + _simple_kernel[1], + blocks.x, + blocks.y, + blocks.z, + threads.x, + threads.y, + threads.z, + 0, + 0, + kernel_args[1], + 0, ) + ) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + check_cuda_errors(cudart.cudaDeviceSynchronize()) # Run kernel on GPU 0, reading input from the GPU 1 buffer, writing # output to the GPU 0 buffer print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - with common.KernelHelper(simplep2p, gpuid[0]) as kernelHelper: - simple_kernel_0 = kernelHelper.getFunction(b"SimpleKernel") - kernel_args_0 = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p)) - checkCudaErrors( - cuda.cuLaunchKernel( - simple_kernel_0, - blocks.x, - blocks.y, - blocks.z, - threads.x, - threads.y, - threads.z, - 0, - 0, - kernel_args_0, - 0, - ) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + kernel_helper[0] = common.KernelHelper(simplep2p, gpuid[0]) + _simple_kernel[0] = kernel_helper[0].get_function(b"SimpleKernel") + kernel_args[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p)) + check_cuda_errors( + cuda.cuLaunchKernel( + _simple_kernel[0], + blocks.x, + blocks.y, + blocks.z, + threads.x, + threads.y, + threads.z, + 0, + 0, + kernel_args[0], + 0, ) + ) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + check_cuda_errors(cudart.cudaDeviceSynchronize()) # Copy data back to host and verify print(f"Copy data back to host from GPU{gpuid[0]} and verify results...") - checkCudaErrors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) + 
check_cuda_errors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) error_count = 0 @@ -210,23 +214,23 @@ def main(): # Disable peer access (also unregisters memory for non-UVA cases) print("Disabling peer access...") - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaDeviceDisablePeerAccess(gpuid[1])) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - checkCudaErrors(cudart.cudaDeviceDisablePeerAccess(gpuid[0])) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaDeviceDisablePeerAccess(gpuid[1])) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + check_cuda_errors(cudart.cudaDeviceDisablePeerAccess(gpuid[0])) # Cleanup and shutdown print("Shutting down...") - checkCudaErrors(cudart.cudaEventDestroy(start_event)) - checkCudaErrors(cudart.cudaEventDestroy(stop_event)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) - checkCudaErrors(cudart.cudaFree(g0)) - checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - checkCudaErrors(cudart.cudaFree(g1)) - checkCudaErrors(cudart.cudaFreeHost(h0)) + check_cuda_errors(cudart.cudaEventDestroy(start_event)) + check_cuda_errors(cudart.cudaEventDestroy(stop_event)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) + check_cuda_errors(cudart.cudaFree(g0)) + check_cuda_errors(cudart.cudaSetDevice(gpuid[1])) + check_cuda_errors(cudart.cudaFree(g1)) + check_cuda_errors(cudart.cudaFreeHost(h0)) for i in range(gpu_n): - checkCudaErrors(cudart.cudaSetDevice(i)) + check_cuda_errors(cudart.cudaSetDevice(i)) if error_count != 0: print("Test failed!", file=sys.stderr) diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index ea64017b95..d4bf44e19a 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -9,13 +9,13 @@ import numpy as np from common import common -from 
common.helper_cuda import checkCudaErrors -from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from common.helper_cuda import check_cuda_errors +from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -simpleZeroCopy = """\ +simple_zero_copy = """\ extern "C" __global__ void vectorAddGPU(float *a, float *b, float *c, int N) { @@ -31,7 +31,7 @@ def main(): idev = 0 - bPinGenericMemory = False + b_pin_generic_memory = False import pytest @@ -47,7 +47,7 @@ def main(): if platform.machine() == "sbsa": pytest.skip("simpleZeroCopy is not supported on sbsa") - if checkCmdLineFlag("help"): + if check_cmd_line_flag("help"): print("Usage: simpleZeroCopy [OPTION]\n", file=sys.stderr) print("Options:", file=sys.stderr) print(" device=[device #] Specify the device to be used", file=sys.stderr) @@ -55,50 +55,50 @@ def main(): sys.exit(1) # Get the device selected by the user or default to 0, and then set it. - if checkCmdLineFlag("device="): - deviceCount = cudart.cudaGetDeviceCount() - idev = int(getCmdLineArgumentInt("device=")) + if check_cmd_line_flag("device="): + device_count = cudart.cudaGetDeviceCount() + idev = int(get_cmd_line_argument_int("device=")) - if idev >= deviceCount or idev < 0: + if idev >= device_count or idev < 0: print(f"Device number {idev} is invalid, will use default CUDA device 0.") idev = 0 - if checkCmdLineFlag("use_generic_memory"): - bPinGenericMemory = True + if check_cmd_line_flag("use_generic_memory"): + b_pin_generic_memory = True - if bPinGenericMemory: + if b_pin_generic_memory: print("> Using Generic System Paged Memory (malloc)") else: print("> Using CUDA Host Allocated (cudaHostAlloc)") - checkCudaErrors(cudart.cudaSetDevice(idev)) + check_cuda_errors(cudart.cudaSetDevice(idev)) # Verify the selected device supports mapped memory and set the device flags for mapping host memory. 
- deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(idev)) + device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(idev)) - if not deviceProp.canMapHostMemory: + if not device_prop.canMapHostMemory: pytest.skip(f"Device {idev} does not support mapping CPU host memory!") - checkCudaErrors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) + check_cuda_errors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) # Allocate mapped CPU memory nelem = 1048576 num_bytes = nelem * np.dtype(np.float32).itemsize - if bPinGenericMemory: + if b_pin_generic_memory: a = np.empty(nelem, dtype=np.float32) b = np.empty(nelem, dtype=np.float32) c = np.empty(nelem, dtype=np.float32) - checkCudaErrors(cudart.cudaHostRegister(a, num_bytes, cudart.cudaHostRegisterMapped)) - checkCudaErrors(cudart.cudaHostRegister(b, num_bytes, cudart.cudaHostRegisterMapped)) - checkCudaErrors(cudart.cudaHostRegister(c, num_bytes, cudart.cudaHostRegisterMapped)) + check_cuda_errors(cudart.cudaHostRegister(a, num_bytes, cudart.cudaHostRegisterMapped)) + check_cuda_errors(cudart.cudaHostRegister(b, num_bytes, cudart.cudaHostRegisterMapped)) + check_cuda_errors(cudart.cudaHostRegister(c, num_bytes, cudart.cudaHostRegisterMapped)) else: flags = cudart.cudaHostAllocMapped - a_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags)) - b_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags)) - c_ptr = checkCudaErrors(cudart.cudaHostAlloc(num_bytes, flags)) + a_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags)) + b_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags)) + c_ptr = check_cuda_errors(cudart.cudaHostAlloc(num_bytes, flags)) a = (ctypes.c_float * nelem).from_address(a_ptr) b = (ctypes.c_float * nelem).from_address(b_ptr) @@ -110,9 +110,9 @@ def main(): b[n] = rnd.random() # Get the device pointers for the pinned CPU memory mapped into the GPU memory space - d_a = checkCudaErrors(cudart.cudaHostGetDevicePointer(a, 0)) - d_b = 
checkCudaErrors(cudart.cudaHostGetDevicePointer(b, 0)) - d_c = checkCudaErrors(cudart.cudaHostGetDevicePointer(c, 0)) + d_a = check_cuda_errors(cudart.cudaHostGetDevicePointer(a, 0)) + d_b = check_cuda_errors(cudart.cudaHostGetDevicePointer(b, 0)) + d_c = check_cuda_errors(cudart.cudaHostGetDevicePointer(c, 0)) # Call the GPU kernel using the CPU pointers residing in CPU mapped memory print("> vectorAddGPU kernel will add vectors using mapped CPU memory...") @@ -124,57 +124,57 @@ def main(): grid.x = math.ceil(nelem / float(block.x)) grid.y = 1 grid.z = 1 - with common.KernelHelper(simpleZeroCopy, idev) as kernelHelper: - _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU") - kernelArgs = ( - (d_a, d_b, d_c, nelem), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), + kernel_helper = common.KernelHelper(simple_zero_copy, idev) + _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") + kernel_args = ( + (d_a, d_b, d_c, nelem), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), + ) + check_cuda_errors( + cuda.cuLaunchKernel( + _vector_add_gpu, + grid.x, + grid.y, + grid.z, + block.x, + block.y, + block.z, + 0, + cuda.CU_STREAM_LEGACY, + kernel_args, + 0, ) - checkCudaErrors( - cuda.cuLaunchKernel( - _vectorAddGPU, - grid.x, - grid.y, - grid.z, - block.x, - block.y, - block.z, - 0, - cuda.CU_STREAM_LEGACY, - kernelArgs, - 0, - ) - ) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + ) + check_cuda_errors(cudart.cudaDeviceSynchronize()) print("> Checking the results from vectorAddGPU() ...") # Compare the results - errorNorm = 0.0 - refNorm = 0.0 + error_norm = 0.0 + ref_norm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff * diff - refNorm += ref * ref + error_norm += diff * diff + ref_norm += ref * ref - errorNorm = math.sqrt(errorNorm) - refNorm = math.sqrt(refNorm) + error_norm = math.sqrt(error_norm) + ref_norm = math.sqrt(ref_norm) # Memory clean up print("Releasing CPU 
memory...") - if bPinGenericMemory: - checkCudaErrors(cudart.cudaHostUnregister(a)) - checkCudaErrors(cudart.cudaHostUnregister(b)) - checkCudaErrors(cudart.cudaHostUnregister(c)) + if b_pin_generic_memory: + check_cuda_errors(cudart.cudaHostUnregister(a)) + check_cuda_errors(cudart.cudaHostUnregister(b)) + check_cuda_errors(cudart.cudaHostUnregister(c)) else: - checkCudaErrors(cudart.cudaFreeHost(a)) - checkCudaErrors(cudart.cudaFreeHost(b)) - checkCudaErrors(cudart.cudaFreeHost(c)) + check_cuda_errors(cudart.cudaFreeHost(a)) + check_cuda_errors(cudart.cudaFreeHost(b)) + check_cuda_errors(cudart.cudaFreeHost(c)) - if errorNorm / refNorm >= 1.0e-7: + if error_norm / ref_norm >= 1.0e-7: print("FAILED", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index df52462854..94a356101f 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -7,12 +7,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -systemWideAtomics = """\ +system_wide_atomics = """\ #define LOOP_NUM 50 extern "C" @@ -63,21 +63,21 @@ #! @param reference reference data, computed but preallocated #! @param idata input data as provided to device #! 
@param len number of elements in reference / idata -def verify(testData, length): +def verify(test_data, length): val = 0 for i in range(length * LOOP_NUM): val += 10 - if val != testData[0]: - print(f"atomicAdd failed val = {val} testData = {testData[0]}") + if val != test_data[0]: + print(f"atomicAdd failed val = {val} test_data = {test_data[0]}") return False val = 0 found = False for i in range(length): # second element should be a member of [0, len) - if i == testData[1]: + if i == test_data[1]: found = True break @@ -91,7 +91,7 @@ def verify(testData, length): # third element should be len-1 val = max(val, i) - if val != testData[2]: + if val != test_data[2]: print("atomicMax failed") return False @@ -100,7 +100,7 @@ def verify(testData, length): for i in range(length): val = min(val, i) - if val != testData[3]: + if val != test_data[3]: print("atomicMin failed") return False @@ -110,7 +110,7 @@ def verify(testData, length): for i in range(length * LOOP_NUM): val = 0 if val >= limit else val + 1 - if val != testData[4]: + if val != test_data[4]: print("atomicInc failed") return False @@ -120,7 +120,7 @@ def verify(testData, length): for i in range(length * LOOP_NUM): val = limit if (val == 0) or (val > limit) else val - 1 - if val != testData[5]: + if val != test_data[5]: print("atomicDec failed") return False @@ -128,7 +128,7 @@ def verify(testData, length): for i in range(length): # seventh element should be a member of [0, len) - if i == testData[6]: + if i == test_data[6]: found = True break @@ -142,13 +142,13 @@ def verify(testData, length): # 8th element should be 1 val &= 2 * i + 7 - if val != testData[7]: + if val != test_data[7]: print("atomicAnd failed") return False # 9th element should be 0xff val = -1 - if val != testData[8]: + if val != test_data[8]: print("atomicOr failed") return False @@ -158,7 +158,7 @@ def verify(testData, length): # 11th element should be 0xff val ^= i - if val != testData[9]: + if val != test_data[9]: print("atomicXor 
failed") return False @@ -172,72 +172,74 @@ def main(): pytest.skip("Atomics not supported on Windows") # set device - dev_id = findCudaDevice() - device_prop = checkCudaErrors(cudart.cudaGetDeviceProperties(dev_id)) + dev_id = find_cuda_device() + device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) if not device_prop.managedMemory: pytest.skip("Unified Memory not supported on this device") - computeMode = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id)) - if computeMode == cudart.cudaComputeMode.cudaComputeModeProhibited: + compute_mode = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id) + ) + if compute_mode == cudart.cudaComputeMode.cudaComputeModeProhibited: pytest.skip("This sample requires a device in either default or process exclusive mode") if device_prop.major < 6: pytest.skip("Requires a minimum CUDA compute 6.0 capability") - numThreads = 256 - numBlocks = 64 - numData = 10 + num_threads = 256 + num_blocks = 64 + num_data = 10 if device_prop.pageableMemoryAccess: print("CAN access pageable memory") - atom_arr_h = (ctypes.c_int * numData)(0) + atom_arr_h = (ctypes.c_int * num_data)(0) atom_arr = ctypes.addressof(atom_arr_h) else: print("CANNOT access pageable memory") - atom_arr = checkCudaErrors( - cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * numData, cudart.cudaMemAttachGlobal) + atom_arr = check_cuda_errors( + cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * num_data, cudart.cudaMemAttachGlobal) ) - atom_arr_h = (ctypes.c_int * numData).from_address(atom_arr) + atom_arr_h = (ctypes.c_int * num_data).from_address(atom_arr) - for i in range(numData): + for i in range(num_data): atom_arr_h[i] = 0 # To make the AND and XOR tests generate something other than 0... 
atom_arr_h[7] = atom_arr_h[9] = 0xFF - with common.KernelHelper(systemWideAtomics, dev_id) as kernelHelper: - _atomicKernel = kernelHelper.getFunction(b"atomicKernel") - kernelArgs = ((atom_arr,), (ctypes.c_void_p,)) - checkCudaErrors( - cuda.cuLaunchKernel( - _atomicKernel, - numBlocks, - 1, - 1, # grid dim - numThreads, - 1, - 1, # block dim - 0, - cuda.CU_STREAM_LEGACY, # shared mem and stream - kernelArgs, - 0, - ) - ) # arguments + kernel_helper = common.KernelHelper(system_wide_atomics, dev_id) + _atomic_kernel = kernel_helper.get_function(b"atomicKernel") + kernel_args = ((atom_arr,), (ctypes.c_void_p,)) + check_cuda_errors( + cuda.cuLaunchKernel( + _atomic_kernel, + num_blocks, + 1, + 1, # grid dim + num_threads, + 1, + 1, # block dim + 0, + cuda.CU_STREAM_LEGACY, # shared mem and stream + kernel_args, + 0, + ) + ) # arguments # NOTE: Python doesn't have an equivalent system atomic operations # atomicKernel_CPU(atom_arr_h, numBlocks * numThreads) - checkCudaErrors(cudart.cudaDeviceSynchronize()) + check_cuda_errors(cudart.cudaDeviceSynchronize()) # Compute & verify reference solution - testResult = verify(atom_arr_h, numThreads * numBlocks) + test_result = verify(atom_arr_h, num_threads * num_blocks) if device_prop.pageableMemoryAccess: pass else: - checkCudaErrors(cudart.cudaFree(atom_arr)) + check_cuda_errors(cudart.cudaFree(atom_arr)) - if not testResult: + if not test_result: print("systemWideAtomics completed with errors", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 8ee238e36b..8c70aadd3a 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -7,11 +7,11 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV +from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from 
cuda.bindings import driver as cuda -vectorAddDrv = """\ +vector_add_drv = """\ /* Vector addition: C = A + B. * * This sample is a very basic sample that implements element by element @@ -32,82 +32,82 @@ def main(): - N = 50000 - nbytes = N * np.dtype(np.float32).itemsize + n = 50000 + nbytes = n * np.dtype(np.float32).itemsize # Initialize - checkCudaErrors(cuda.cuInit(0)) - cuDevice = findCudaDeviceDRV() + check_cuda_errors(cuda.cuInit(0)) + cu_device = find_cuda_device_drv() # Create context - cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice)) + cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device)) - uvaSupported = checkCudaErrors( - cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) + uva_supported = check_cuda_errors( + cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cu_device) ) - if not uvaSupported: + if not uva_supported: import pytest pytest.skip("Accessing pageable memory directly requires UVA") - with common.KernelHelper(vectorAddDrv, int(cuDevice)) as kernelHelper: - _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") - - # Allocate input vectors h_A and h_B in host memory - h_A = np.random.rand(N).astype(dtype=np.float32) - h_B = np.random.rand(N).astype(dtype=np.float32) - h_C = np.random.rand(N).astype(dtype=np.float32) - - # Allocate vectors in device memory - d_A = checkCudaErrors(cuda.cuMemAlloc(nbytes)) - d_B = checkCudaErrors(cuda.cuMemAlloc(nbytes)) - d_C = checkCudaErrors(cuda.cuMemAlloc(nbytes)) - - # Copy vectors from host memory to device memory - checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, nbytes)) - checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, nbytes)) - - if True: - # Grid/Block configuration - threadsPerBlock = 256 - blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock - - kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int)) - - # Launch the CUDA kernel - checkCudaErrors( - 
cuda.cuLaunchKernel( - _VecAdd_kernel, - blocksPerGrid, - 1, - 1, - threadsPerBlock, - 1, - 1, - 0, - 0, - kernelArgs, - 0, - ) + kernel_helper = common.KernelHelper(vector_add_drv, int(cu_device)) + _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") + + # Allocate input vectors h_A and h_B in host memory + h_a = np.random.rand(n).astype(dtype=np.float32) + h_b = np.random.rand(n).astype(dtype=np.float32) + h_c = np.random.rand(n).astype(dtype=np.float32) + + # Allocate vectors in device memory + d_a = check_cuda_errors(cuda.cuMemAlloc(nbytes)) + d_b = check_cuda_errors(cuda.cuMemAlloc(nbytes)) + d_c = check_cuda_errors(cuda.cuMemAlloc(nbytes)) + + # Copy vectors from host memory to device memory + check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, nbytes)) + check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, nbytes)) + + if True: + # Grid/Block configuration + threads_per_block = 256 + blocks_per_grid = (n + threads_per_block - 1) / threads_per_block + + kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) + + # Launch the CUDA kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _vec_add_kernel, + blocks_per_grid, + 1, + 1, + threads_per_block, + 1, + 1, + 0, + 0, + kernel_args, + 0, ) - else: - pass - - # Copy result from device memory to host memory - # h_C contains the result in host memory - checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, nbytes)) - - for i in range(N): - sum_all = h_A[i] + h_B[i] - if math.fabs(h_C[i] - sum_all) > 1e-7: - break - - # Free device memory - checkCudaErrors(cuda.cuMemFree(d_A)) - checkCudaErrors(cuda.cuMemFree(d_B)) - checkCudaErrors(cuda.cuMemFree(d_C)) - - checkCudaErrors(cuda.cuCtxDestroy(cuContext)) - if i + 1 != N: + ) + else: + pass + + # Copy result from device memory to host memory + # h_C contains the result in host memory + check_cuda_errors(cuda.cuMemcpyDtoH(h_c, d_c, nbytes)) + + for i in range(n): + sum_all = h_a[i] + h_b[i] + if math.fabs(h_c[i] - sum_all) > 1e-7: + break + + # Free device memory + 
check_cuda_errors(cuda.cuMemFree(d_a)) + check_cuda_errors(cuda.cuMemFree(d_b)) + check_cuda_errors(cuda.cuMemFree(d_c)) + + check_cuda_errors(cuda.cuCtxDestroy(cu_context)) + if i + 1 != n: print("Result = FAIL", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index c7f9e6275b..d5e2e3d26f 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -8,11 +8,11 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV +from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from cuda.bindings import driver as cuda -vectorAddMMAP = """\ +vector_add_mmap = """\ /* Vector addition: C = A + B. * * This sample is a very basic sample that implements element by element @@ -36,35 +36,35 @@ def round_up(x, y): return int((x - 1) / y + 1) * y -def getBackingDevices(cuDevice): - num_devices = checkCudaErrors(cuda.cuDeviceGetCount()) +def get_backing_devices(cu_device): + num_devices = check_cuda_errors(cuda.cuDeviceGetCount()) - backingDevices = [cuDevice] + backing_devices = [cu_device] for dev in range(num_devices): # The mapping device is already in the backingDevices vector - if int(dev) == int(cuDevice): + if int(dev) == int(cu_device): continue # Only peer capable devices can map each others memory - capable = checkCudaErrors(cuda.cuDeviceCanAccessPeer(cuDevice, dev)) + capable = check_cuda_errors(cuda.cuDeviceCanAccessPeer(cu_device, dev)) if not capable: continue # The device needs to support virtual address management for the required apis to work - attributeVal = checkCudaErrors( + attribute_val = check_cuda_errors( cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice, + cu_device, ) ) - if attributeVal == 0: + if attribute_val == 0: 
continue - backingDevices.append(cuda.CUdevice(dev)) - return backingDevices + backing_devices.append(cuda.CUdevice(dev)) + return backing_devices -def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): +def simple_malloc_multi_device_mmap(size, resident_devices, mapping_devices, align=0): min_granularity = 0 # Setup the properties common for all the chunks @@ -77,7 +77,7 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Get the minimum granularity needed for the resident devices # (the max of the minimum granularity of each participating device) - for device in residentDevices: + for device in resident_devices: prop.location.id = device status, granularity = cuda.cuMemGetAllocationGranularity( prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM @@ -89,7 +89,7 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Get the minimum granularity needed for the accessing devices # (the max of the minimum granularity of each participating device) - for device in mappingDevices: + for device in mapping_devices: prop.location.id = device status, granularity = cuda.cuMemGetAllocationGranularity( prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM @@ -103,28 +103,28 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Essentially size = N * residentDevices.size() * min_granularity is the requirement, # since each piece of the allocation will be stripeSize = N * min_granularity # and the min_granularity requirement applies to each stripeSize piece of the allocation. 
- size = round_up(size, len(residentDevices) * min_granularity) - stripeSize = size / len(residentDevices) + size = round_up(size, len(resident_devices) * min_granularity) + stripe_size = size / len(resident_devices) # Return the rounded up size to the caller for use in the free - allocationSize = size + allocation_size = size # Reserve the required contiguous VA space for the allocations status, dptr = cuda.cuMemAddressReserve(size, align, cuda.CUdeviceptr(0), 0) if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None # Create and map the backings on each gpu # note: reusing CUmemAllocationProp prop from earlier with prop.type & prop.location.type already specified. - for idx in range(len(residentDevices)): + for idx in range(len(resident_devices)): # Set the location for this chunk to this device - prop.location.id = residentDevices[idx] + prop.location.id = resident_devices[idx] # Create the allocation as a pinned allocation on this device - status, allocationHandle = cuda.cuMemCreate(stripeSize, prop, 0) + status, allocation_handle = cuda.cuMemCreate(stripe_size, prop, 0) if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None # Assign the chunk to the appropriate VA range and release the handle. @@ -132,10 +132,10 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Since we do not need to make any other mappings of this memory or export it, # we no longer need and can release the allocationHandle. # The allocation will be kept live until it is unmapped. - (status,) = cuda.cuMemMap(int(dptr) + (stripeSize * idx), stripeSize, 0, allocationHandle, 0) + (status,) = cuda.cuMemMap(int(dptr) + (stripe_size * idx), stripe_size, 0, allocation_handle, 0) # the handle needs to be released even if the mapping failed. 
- (status2,) = cuda.cuMemRelease(allocationHandle) + (status2,) = cuda.cuMemRelease(allocation_handle) if status != cuda.CUresult.CUDA_SUCCESS: # cuMemRelease should not have failed here # as the handle was just allocated successfully @@ -144,31 +144,31 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): # Cleanup in case of any mapping failures. if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None # Each accessDescriptor will describe the mapping requirement for a single device - accessDescriptors = [cuda.CUmemAccessDesc()] * len(mappingDevices) + access_descriptors = [cuda.CUmemAccessDesc()] * len(mapping_devices) # Prepare the access descriptor array indicating where and how the backings should be visible. - for idx in range(len(mappingDevices)): + for idx in range(len(mapping_devices)): # Specify which device we are adding mappings for. - accessDescriptors[idx].location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - accessDescriptors[idx].location.id = mappingDevices[idx] + access_descriptors[idx].location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + access_descriptors[idx].location.id = mapping_devices[idx] # Specify both read and write access. - accessDescriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE + access_descriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE # Apply the access descriptors to the whole VA range. 
- (status,) = cuda.cuMemSetAccess(dptr, size, accessDescriptors, len(accessDescriptors)) + (status,) = cuda.cuMemSetAccess(dptr, size, access_descriptors, len(access_descriptors)) if status != cuda.CUresult.CUDA_SUCCESS: - simpleFreeMultiDeviceMmap(dptr, size) + simple_free_multi_device_mmap(dptr, size) return status, None, None - return (status, dptr, allocationSize) + return (status, dptr, allocation_size) -def simpleFreeMultiDeviceMmap(dptr, size): +def simple_free_multi_device_mmap(dptr, size): # Unmap the mapped virtual memory region # Since the handles to the mapped backing stores have already been released # by cuMemRelease, and these are the only/last mappings referencing them, @@ -204,97 +204,97 @@ def main(): if platform.machine() == "sbsa": pytest.skip("vectorAddMMAP is not supported on sbsa") - N = 50000 - size = N * np.dtype(np.float32).itemsize + n = 50000 + size = n * np.dtype(np.float32).itemsize # Initialize - checkCudaErrors(cuda.cuInit(0)) + check_cuda_errors(cuda.cuInit(0)) - cuDevice = findCudaDeviceDRV() + cu_device = find_cuda_device_drv() # Check that the selected device supports virtual address management - attributeVal = checkCudaErrors( + attribute_val = check_cuda_errors( cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice, + cu_device, ) ) - print(f"Device {cuDevice} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attributeVal}.") - if not attributeVal: - pytest.skip(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.") + print(f"Device {cu_device} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attribute_val}.") + if not attribute_val: + pytest.skip(f"Device {cu_device} doesn't support VIRTUAL ADDRESS MANAGEMENT.") # The vector addition happens on cuDevice, so the allocations need to be mapped there. - mappingDevices = [cuDevice] + mapping_devices = [cu_device] # Collect devices accessible by the mapping device (cuDevice) into the backingDevices vector. 
- backingDevices = getBackingDevices(cuDevice) + backing_devices = get_backing_devices(cu_device) # Create context - cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice)) - - with common.KernelHelper(vectorAddMMAP, int(cuDevice)) as kernelHelper: - _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") - - # Allocate input vectors h_A and h_B in host memory - h_A = np.random.rand(size).astype(dtype=np.float32) - h_B = np.random.rand(size).astype(dtype=np.float32) - h_C = np.random.rand(size).astype(dtype=np.float32) - - # Allocate vectors in device memory - # note that a call to cuCtxEnablePeerAccess is not needed even though - # the backing devices and mapping device are not the same. - # This is because the cuMemSetAccess call explicitly specifies - # the cross device mapping. - # cuMemSetAccess is still subject to the constraints of cuDeviceCanAccessPeer - # for cross device mappings (hence why we checked cuDeviceCanAccessPeer earlier). - d_A, allocationSize = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices)) - d_B, _ = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices)) - d_C, _ = checkCudaErrors(simpleMallocMultiDeviceMmap(size, backingDevices, mappingDevices)) - - # Copy vectors from host memory to device memory - checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, size)) - checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, size)) - - # Grid/Block configuration - threadsPerBlock = 256 - blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock - - kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int)) - - # Launch the CUDA kernel - checkCudaErrors( - cuda.cuLaunchKernel( - _VecAdd_kernel, - blocksPerGrid, - 1, - 1, - threadsPerBlock, - 1, - 1, - 0, - 0, - kernelArgs, - 0, - ) + cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device)) + + kernel_helper = common.KernelHelper(vector_add_mmap, int(cu_device)) + _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") 
+ + # Allocate input vectors h_A and h_B in host memory + h_a = np.random.rand(size).astype(dtype=np.float32) + h_b = np.random.rand(size).astype(dtype=np.float32) + h_c = np.random.rand(size).astype(dtype=np.float32) + + # Allocate vectors in device memory + # note that a call to cuCtxEnablePeerAccess is not needed even though + # the backing devices and mapping device are not the same. + # This is because the cuMemSetAccess call explicitly specifies + # the cross device mapping. + # cuMemSetAccess is still subject to the constraints of cuDeviceCanAccessPeer + # for cross device mappings (hence why we checked cuDeviceCanAccessPeer earlier). + d_a, allocation_size = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices)) + d_b, _ = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices)) + d_c, _ = check_cuda_errors(simple_malloc_multi_device_mmap(size, backing_devices, mapping_devices)) + + # Copy vectors from host memory to device memory + check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, size)) + check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, size)) + + # Grid/Block configuration + threads_per_block = 256 + blocks_per_grid = (n + threads_per_block - 1) / threads_per_block + + kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) + + # Launch the CUDA kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _vec_add_kernel, + blocks_per_grid, + 1, + 1, + threads_per_block, + 1, + 1, + 0, + 0, + kernel_args, + 0, ) + ) - # Copy result from device memory to host memory - # h_C contains the result in host memory - checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, size)) + # Copy result from device memory to host memory + # h_C contains the result in host memory + check_cuda_errors(cuda.cuMemcpyDtoH(h_c, d_c, size)) - # Verify result - for i in range(N): - sum_all = h_A[i] + h_B[i] - if math.fabs(h_C[i] - sum_all) > 1e-7: - break + # Verify result + for i in range(n): + sum_all = h_a[i] + h_b[i] + 
if math.fabs(h_c[i] - sum_all) > 1e-7: + break - checkCudaErrors(simpleFreeMultiDeviceMmap(d_A, allocationSize)) - checkCudaErrors(simpleFreeMultiDeviceMmap(d_B, allocationSize)) - checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize)) + check_cuda_errors(simple_free_multi_device_mmap(d_a, allocation_size)) + check_cuda_errors(simple_free_multi_device_mmap(d_b, allocation_size)) + check_cuda_errors(simple_free_multi_device_mmap(d_c, allocation_size)) - checkCudaErrors(cuda.cuCtxDestroy(cuContext)) + check_cuda_errors(cuda.cuCtxDestroy(cu_context)) - if i + 1 != N: + if i + 1 != n: print("Result = FAIL", file=sys.stderr) sys.exit(1) diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index afe769ca15..f26dd2dabe 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -9,13 +9,13 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice -from common.helper_string import checkCmdLineFlag +from common.helper_cuda import check_cuda_errors, find_cuda_device +from common.helper_string import check_cmd_line_flag from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -streamOrderedAllocation = """\ +stream_ordered_allocation = """\ /* Add two vectors on the GPU */ extern "C" __global__ void vectorAddGPU(const float *a, const float *b, float *c, int N) @@ -31,18 +31,18 @@ MAX_ITER = 20 -def basicStreamOrderedAllocation(dev, nelem, a, b, c): +def basic_stream_ordered_allocation(dev, nelem, a, b, c): num_bytes = nelem * np.dtype(np.float32).itemsize print("Starting basicStreamOrderedAllocation()") - checkCudaErrors(cudart.cudaSetDevice(dev)) - stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) + 
check_cuda_errors(cudart.cudaSetDevice(dev)) + stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) - d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) + d_a = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_b = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_c = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + check_cuda_errors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) + check_cuda_errors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) block = cudart.dim3() block.x = 256 @@ -53,13 +53,13 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c): grid.y = 1 grid.z = 1 - kernelArgs = ( + kernel_args = ( (d_a, d_b, d_c, nelem), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), ) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - _vectorAddGPU, + _vector_add_gpu, grid.x, grid.y, grid.z, # grid dim @@ -68,68 +68,72 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c): block.z, # block dim 0, stream, # shared mem and stream - kernelArgs, + kernel_args, 0, ) ) # arguments - checkCudaErrors(cudart.cudaFreeAsync(d_a, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_b, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_c, stream)) - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_a, stream)) + 
check_cuda_errors(cudart.cudaFreeAsync(d_b, stream)) + check_cuda_errors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_c, stream)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) # Compare the results print("> Checking the results from vectorAddGPU() ...") - errorNorm = 0.0 - refNorm = 0.0 + error_norm = 0.0 + ref_norm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff * diff - refNorm += ref * ref + error_norm += diff * diff + ref_norm += ref * ref - errorNorm = math.sqrt(errorNorm) - refNorm = math.sqrt(refNorm) + error_norm = math.sqrt(error_norm) + ref_norm = math.sqrt(ref_norm) - checkCudaErrors(cudart.cudaStreamDestroy(stream)) + check_cuda_errors(cudart.cudaStreamDestroy(stream)) - return errorNorm / refNorm < 1.0e-6 + return error_norm / ref_norm < 1.0e-6 # streamOrderedAllocationPostSync(): demonstrates If the application wants the memory to persist in the pool beyond # synchronization, then it sets the release threshold on the pool. This way, when the application reaches the "steady state", # it is no longer allocating/freeing memory from the OS. 
-def streamOrderedAllocationPostSync(dev, nelem, a, b, c): +def stream_ordered_allocation_post_sync(dev, nelem, a, b, c): num_bytes = nelem * np.dtype(np.float32).itemsize print("Starting streamOrderedAllocationPostSync()") - checkCudaErrors(cudart.cudaSetDevice(dev)) - stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) - start = checkCudaErrors(cudart.cudaEventCreate()) - end = checkCudaErrors(cudart.cudaEventCreate()) + check_cuda_errors(cudart.cudaSetDevice(dev)) + stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) + start = check_cuda_errors(cudart.cudaEventCreate()) + end = check_cuda_errors(cudart.cudaEventCreate()) - memPool = checkCudaErrors(cudart.cudaDeviceGetDefaultMemPool(dev)) - thresholdVal = cuda.cuuint64_t(ctypes.c_uint64(-1).value) + mem_pool = check_cuda_errors(cudart.cudaDeviceGetDefaultMemPool(dev)) + threshold_val = cuda.cuuint64_t(ctypes.c_uint64(-1).value) # Set high release threshold on the default pool so that cudaFreeAsync will not actually release memory to the system. # By default, the release threshold for a memory pool is set to zero. This implies that the CUDA driver is # allowed to release a memory chunk back to the system as long as it does not contain any active suballocations. 
- checkCudaErrors( + check_cuda_errors( cudart.cudaMemPoolSetAttribute( - memPool, + mem_pool, cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold, - thresholdVal, + threshold_val, ) ) # Record teh start event - checkCudaErrors(cudart.cudaEventRecord(start, stream)) + check_cuda_errors(cudart.cudaEventRecord(start, stream)) for _i in range(MAX_ITER): - d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) + d_a = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_b = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + d_c = check_cuda_errors(cudart.cudaMallocAsync(num_bytes, stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_a, a, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_b, b, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) block = cudart.dim3() block.x = 256 @@ -140,13 +144,13 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c): grid.y = 1 grid.z = 1 - kernelArgs = ( + kernel_args = ( (d_a, d_b, d_c, nelem), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), ) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - _vectorAddGPU, + _vector_add_gpu, grid.x, grid.y, grid.z, # grid dim @@ -155,40 +159,42 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c): block.z, # block dim 0, stream, # shared mem and stream - kernelArgs, + kernel_args, 0, ) ) # arguments - checkCudaErrors(cudart.cudaFreeAsync(d_a, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_b, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(c, d_c, num_bytes, 
cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) - checkCudaErrors(cudart.cudaFreeAsync(d_c, stream)) - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) - checkCudaErrors(cudart.cudaEventRecord(end, stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_a, stream)) + check_cuda_errors(cudart.cudaFreeAsync(d_b, stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(c, d_c, num_bytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream) + ) + check_cuda_errors(cudart.cudaFreeAsync(d_c, stream)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors(cudart.cudaEventRecord(end, stream)) # Wait for the end event to complete - checkCudaErrors(cudart.cudaEventSynchronize(end)) + check_cuda_errors(cudart.cudaEventSynchronize(end)) - msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, end)) - print(f"Total elapsed time = {msecTotal} ms over {MAX_ITER} iterations") + msec_total = check_cuda_errors(cudart.cudaEventElapsedTime(start, end)) + print(f"Total elapsed time = {msec_total} ms over {MAX_ITER} iterations") # Compare the results print("> Checking the results from vectorAddGPU() ...") - errorNorm = 0.0 - refNorm = 0.0 + error_norm = 0.0 + ref_norm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff * diff - refNorm += ref * ref + error_norm += diff * diff + ref_norm += ref * ref - errorNorm = math.sqrt(errorNorm) - refNorm = math.sqrt(refNorm) + error_norm = math.sqrt(error_norm) + ref_norm = math.sqrt(ref_norm) - checkCudaErrors(cudart.cudaStreamDestroy(stream)) + check_cuda_errors(cudart.cudaStreamDestroy(stream)) - return errorNorm / refNorm < 1.0e-6 + return error_norm / ref_norm < 1.0e-6 def main(): @@ -198,42 +204,42 @@ def main(): pytest.skip("streamOrderedAllocation is not supported on Mac OSX") cuda.cuInit(0) - if checkCmdLineFlag("help"): + if check_cmd_line_flag("help"): print("Usage: streamOrderedAllocation [OPTION]\n", file=sys.stderr) print("Options:", file=sys.stderr) print(" 
device=[device #] Specify the device to be used", file=sys.stderr) sys.exit(1) - dev = findCudaDevice() + dev = find_cuda_device() - version = checkCudaErrors(cudart.cudaDriverGetVersion()) + version = check_cuda_errors(cudart.cudaDriverGetVersion()) if version < 11030: - isMemPoolSupported = False + is_mem_pool_supported = False else: - isMemPoolSupported = checkCudaErrors( + is_mem_pool_supported = check_cuda_errors( cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev) ) - if not isMemPoolSupported: + if not is_mem_pool_supported: pytest.skip("Waiving execution as device does not support Memory Pools") - global _vectorAddGPU - with common.KernelHelper(streamOrderedAllocation, dev) as kernelHelper: - _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU") + global _vector_add_gpu + kernel_helper = common.KernelHelper(stream_ordered_allocation, dev) + _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") - # Allocate CPU memory - nelem = 1048576 - nelem * np.dtype(np.float32).itemsize + # Allocate CPU memory + nelem = 1048576 + nelem * np.dtype(np.float32).itemsize - a = np.zeros(nelem, dtype="float32") - b = np.zeros(nelem, dtype="float32") - c = np.zeros(nelem, dtype="float32") - # Initialize the vectors - for i in range(nelem): - a[i] = rnd.random() - b[i] = rnd.random() + a = np.zeros(nelem, dtype="float32") + b = np.zeros(nelem, dtype="float32") + c = np.zeros(nelem, dtype="float32") + # Initialize the vectors + for i in range(nelem): + a[i] = rnd.random() + b[i] = rnd.random() - ret1 = basicStreamOrderedAllocation(dev, nelem, a, b, c) - ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c) + ret1 = basic_stream_ordered_allocation(dev, nelem, a, b, c) + ret2 = stream_ordered_allocation_post_sync(dev, nelem, a, b, c) if not ret1 or not ret2: sys.exit(1) diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py 
b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index aaa03e446a..722d19dcb5 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -9,16 +9,16 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice -from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from common.helper_cuda import check_cuda_errors, find_cuda_device +from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -blockSize = 16 +block_size = 16 -class kernels(Enum): +class Kernels(Enum): AsyncCopyMultiStageLargeChunk = 0 AsyncCopyLargeChunk = 1 AsyncCopyLargeChunkAWBarrier = 2 @@ -29,7 +29,7 @@ class kernels(Enum): NaiveLargeChunk = 7 -kernelNames = [ +kernel_names = [ "AsyncCopyMultiStageLargeChunk", "AsyncCopyLargeChunk", "AsyncCopyLargeChunkAWBarrier", @@ -40,7 +40,7 @@ class kernels(Enum): "NaiveLargeChunk", ] -globalToShmemAsyncCopy = """\ +global_to_shmem_async_copy = """\ #line __LINE__ #if __CUDA_ARCH__ >= 700 #include @@ -709,7 +709,7 @@ class kernels(Enum): """ -def ConstantInit(data, size, val): +def constant_init(data, size, val): p_data = (ctypes.c_float * size).from_address(data) for i in range(size): p_data[i] = val @@ -718,78 +718,82 @@ def ConstantInit(data, size, val): # # Run matrix multiplication using CUDA # -def MatrixMultiply(dimsA, dimsB, kernel_number): +def matrix_multiply(dims_a, dims_b, kernel_number): # Allocate host memory for matricies A and B - size_A = dimsA.x * dimsA.y - mem_size_A = np.dtype(np.float32).itemsize * size_A - h_A = checkCudaErrors(cudart.cudaMallocHost(mem_size_A)) - size_B = dimsB.x * dimsB.y - mem_size_B = np.dtype(np.float32).itemsize * size_B - h_B = checkCudaErrors(cudart.cudaMallocHost(mem_size_B)) + size_a = dims_a.x * dims_a.y + 
mem_size_a = np.dtype(np.float32).itemsize * size_a + h_a = check_cuda_errors(cudart.cudaMallocHost(mem_size_a)) + size_b = dims_b.x * dims_b.y + mem_size_b = np.dtype(np.float32).itemsize * size_b + h_b = check_cuda_errors(cudart.cudaMallocHost(mem_size_b)) # Initialize host memory - valB = 2.10 - ConstantInit(h_A, size_A, 1.0) - ConstantInit(h_B, size_B, valB) + val_b = 2.10 + constant_init(h_a, size_a, 1.0) + constant_init(h_b, size_b, val_b) # Allocate Device Memory # Allocate host matrix C - dimsC = cudart.dim3() - dimsC.x = dimsB.x - dimsC.y = dimsA.y - dimsC.z = 1 - mem_size_C = dimsC.x * dimsC.y * np.dtype(np.float32).itemsize - h_C = checkCudaErrors(cudart.cudaMallocHost(mem_size_C)) - - if h_C == 0: + dims_c = cudart.dim3() + dims_c.x = dims_b.x + dims_c.y = dims_a.y + dims_c.z = 1 + mem_size_c = dims_c.x * dims_c.y * np.dtype(np.float32).itemsize + h_c = check_cuda_errors(cudart.cudaMallocHost(mem_size_c)) + + if h_c == 0: print("Failed to allocate host matrix C!", file=sys.stderr) sys.exit(1) - d_A = checkCudaErrors(cudart.cudaMalloc(mem_size_A)) - d_B = checkCudaErrors(cudart.cudaMalloc(mem_size_B)) - d_C = checkCudaErrors(cudart.cudaMalloc(mem_size_C)) + d_a = check_cuda_errors(cudart.cudaMalloc(mem_size_a)) + d_b = check_cuda_errors(cudart.cudaMalloc(mem_size_b)) + d_c = check_cuda_errors(cudart.cudaMalloc(mem_size_c)) # Allocate CUDA events that we'll use for timing - start = checkCudaErrors(cudart.cudaEventCreate()) - stop = checkCudaErrors(cudart.cudaEventCreate()) + start = check_cuda_errors(cudart.cudaEventCreate()) + stop = check_cuda_errors(cudart.cudaEventCreate()) - stream = checkCudaErrors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) + stream = check_cuda_errors(cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)) # Copy host memory to device - checkCudaErrors(cudart.cudaMemcpyAsync(d_A, h_A, mem_size_A, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemcpyAsync(d_B, h_B, 
mem_size_B, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)) - checkCudaErrors(cudart.cudaMemsetAsync(d_C, 0, mem_size_C, stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_a, h_a, mem_size_a, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) + check_cuda_errors( + cudart.cudaMemcpyAsync(d_b, h_b, mem_size_b, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream) + ) + check_cuda_errors(cudart.cudaMemsetAsync(d_c, 0, mem_size_c, stream)) # Setup execution parameters threads = cudart.dim3() - threads.x = threads.y = blockSize + threads.x = threads.y = block_size threads.z = 1 grid = cudart.dim3() - grid.x = dimsB.x / threads.x - grid.y = dimsA.y / threads.y + grid.x = dims_b.x / threads.x + grid.y = dims_a.y / threads.y grid.z = 1 # Here the block size is 16x18, where first 16 rows are consumer thread group # and last 2 rows (1 warp) is producer thread group - threadsSharedStateKernel = cudart.dim3() - threadsSharedStateKernel.x = blockSize - threadsSharedStateKernel.y = blockSize + 2 - threadsSharedStateKernel.z = 1 - gridSharedStateKernel = cudart.dim3() - gridSharedStateKernel.x = dimsB.x / threadsSharedStateKernel.x - gridSharedStateKernel.y = dimsA.y / threadsSharedStateKernel.x - - print(f"Running kernel = {kernel_number} - {kernelNames[kernel_number.value]}") + threads_shared_state_kernel = cudart.dim3() + threads_shared_state_kernel.x = block_size + threads_shared_state_kernel.y = block_size + 2 + threads_shared_state_kernel.z = 1 + grid_shared_state_kernel = cudart.dim3() + grid_shared_state_kernel.x = dims_b.x / threads_shared_state_kernel.x + grid_shared_state_kernel.y = dims_a.y / threads_shared_state_kernel.x + + print(f"Running kernel = {kernel_number} - {kernel_names[kernel_number.value]}") # Create and start timer print("Computing result using CUDA Kernel...") # Performs warmup operation using matrixMul CUDA kernel - kernelArguments = ( - (d_C, d_A, d_B, dimsA.x, dimsB.x), + kernel_arguments = ( + (d_c, d_a, d_b, dims_a.x, 
dims_b.x), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int), ) - if kernel_number == kernels.AsyncCopyMultiStageLargeChunk: - checkCudaErrors( + if kernel_number == Kernels.AsyncCopyMultiStageLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageLargeChunk, grid.x, @@ -800,12 +804,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunk, grid.x, @@ -816,12 +820,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunkAWBarrier: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunkAWBarrier, grid.x, @@ -832,28 +836,28 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStageSharedState: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStageSharedState: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageSharedState, - gridSharedStateKernel.x, - gridSharedStateKernel.y, - gridSharedStateKernel.z, # grid dim - threadsSharedStateKernel.x, - threadsSharedStateKernel.y, - threadsSharedStateKernel.z, # block dim + grid_shared_state_kernel.x, + grid_shared_state_kernel.y, + grid_shared_state_kernel.z, # grid dim + threads_shared_state_kernel.x, + threads_shared_state_kernel.y, + threads_shared_state_kernel.z, # block dim 0, # shared mem stream, # stream - 
kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStage: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStage, grid.x, @@ -864,12 +868,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopySingleStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopySingleStage: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopySingleStage, grid.x, @@ -880,12 +884,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.Naive: - checkCudaErrors( + elif kernel_number == Kernels.Naive: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaive, grid.x, @@ -896,12 +900,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.NaiveLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.NaiveLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaiveLargeChunk, grid.x, @@ -912,21 +916,21 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) # Execute the kernel - nIter = 100 + n_iter = 100 # Record the start event - checkCudaErrors(cudart.cudaEventRecord(start, stream)) + check_cuda_errors(cudart.cudaEventRecord(start, stream)) - if kernel_number == kernels.AsyncCopyMultiStageLargeChunk: - checkCudaErrors( + if kernel_number == 
Kernels.AsyncCopyMultiStageLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageLargeChunk, grid.x, @@ -937,12 +941,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunk, grid.x, @@ -953,12 +957,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyLargeChunkAWBarrier: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyLargeChunkAWBarrier, grid.x, @@ -969,28 +973,28 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStageSharedState: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStageSharedState: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStageSharedState, - gridSharedStateKernel.x, - gridSharedStateKernel.y, - gridSharedStateKernel.z, # grid dim - threadsSharedStateKernel.x, - threadsSharedStateKernel.y, - threadsSharedStateKernel.z, # block dim + grid_shared_state_kernel.x, + grid_shared_state_kernel.y, + grid_shared_state_kernel.z, # grid dim + threads_shared_state_kernel.x, + threads_shared_state_kernel.y, + threads_shared_state_kernel.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopyMultiStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopyMultiStage: + check_cuda_errors( 
cuda.cuLaunchKernel( _MatrixMulAsyncCopyMultiStage, grid.x, @@ -1001,12 +1005,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.AsyncCopySingleStage: - checkCudaErrors( + elif kernel_number == Kernels.AsyncCopySingleStage: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulAsyncCopySingleStage, grid.x, @@ -1017,12 +1021,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.Naive: - checkCudaErrors( + elif kernel_number == Kernels.Naive: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaive, grid.x, @@ -1033,12 +1037,12 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments - elif kernel_number == kernels.NaiveLargeChunk: - checkCudaErrors( + elif kernel_number == Kernels.NaiveLargeChunk: + check_cuda_errors( cuda.cuLaunchKernel( _MatrixMulNaiveLargeChunk, grid.x, @@ -1049,31 +1053,33 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): threads.z, # block dim 0, # shared mem stream, # stream - kernelArguments, + kernel_arguments, 0, ) ) # arguments # Record the stop event - checkCudaErrors(cudart.cudaEventRecord(stop, stream)) + check_cuda_errors(cudart.cudaEventRecord(stop, stream)) # Wait for the stop event to complete - checkCudaErrors(cudart.cudaEventSynchronize(stop)) + check_cuda_errors(cudart.cudaEventSynchronize(stop)) - msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop)) + msec_total = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop)) # Compute and print the performance - msecPerMatrixMul = msecTotal / nIter - flopsPerMatrixMul = 2.0 * dimsA.x * dimsA.y * dimsB.x - gigaFlops = (flopsPerMatrixMul * 1.0e-9) / (msecPerMatrixMul 
/ 1000.0) + msec_per_matrix_mul = msec_total / n_iter + flops_per_matrix_mul = 2.0 * dims_a.x * dims_a.y * dims_b.x + giga_flops = (flops_per_matrix_mul * 1.0e-9) / (msec_per_matrix_mul / 1000.0) print( - f"Performance= {gigaFlops:.2f} GFlop/s, Time= {msecPerMatrixMul:.2f} msec, Size= {flopsPerMatrixMul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block" + f"Performance= {giga_flops:.2f} GFlop/s, Time= {msec_per_matrix_mul:.2f} msec, Size= {flops_per_matrix_mul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block" ) # Copy result from device to host - checkCudaErrors(cudart.cudaMemcpyAsync(h_C, d_C, mem_size_C, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) - checkCudaErrors(cudart.cudaStreamSynchronize(stream)) + check_cuda_errors( + cudart.cudaMemcpyAsync(h_c, d_c, mem_size_c, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream) + ) + check_cuda_errors(cudart.cudaStreamSynchronize(stream)) correct = True @@ -1081,16 +1087,16 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): # |_cpu - _gpu|/<|x|, |y|> < eps eps = 1.0e-6 - h_C_local = (ctypes.c_float * (dimsC.x * dimsC.y)).from_address(h_C) - for i in range(dimsC.x * dimsC.y): - abs_err = math.fabs(h_C_local[i] - (dimsA.x * valB)) - dot_length = dimsA.x - abs_val = math.fabs(h_C_local[i]) + h_c_local = (ctypes.c_float * (dims_c.x * dims_c.y)).from_address(h_c) + for i in range(dims_c.x * dims_c.y): + abs_err = math.fabs(h_c_local[i] - (dims_a.x * val_b)) + dot_length = dims_a.x + abs_val = math.fabs(h_c_local[i]) rel_err = abs_err / abs_val / dot_length if rel_err > eps: print( - f"Error! Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}", + f"Error! 
Matrix[{i:.5f}]={h_c_local[i]:.8f} ref={dims_a.x * val_b:.8f} err term is > {rel_err}", file=sys.stderr, ) correct = False @@ -1099,14 +1105,14 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): print("Result = FAIL", file=sys.stderr) # Clean up memory - checkCudaErrors(cudart.cudaFreeHost(h_A)) - checkCudaErrors(cudart.cudaFreeHost(h_B)) - checkCudaErrors(cudart.cudaFreeHost(h_C)) - checkCudaErrors(cudart.cudaFree(d_A)) - checkCudaErrors(cudart.cudaFree(d_B)) - checkCudaErrors(cudart.cudaFree(d_C)) - checkCudaErrors(cudart.cudaEventDestroy(start)) - checkCudaErrors(cudart.cudaEventDestroy(stop)) + check_cuda_errors(cudart.cudaFreeHost(h_a)) + check_cuda_errors(cudart.cudaFreeHost(h_b)) + check_cuda_errors(cudart.cudaFreeHost(h_c)) + check_cuda_errors(cudart.cudaFree(d_a)) + check_cuda_errors(cudart.cudaFree(d_b)) + check_cuda_errors(cudart.cudaFree(d_c)) + check_cuda_errors(cudart.cudaEventDestroy(start)) + check_cuda_errors(cudart.cudaEventDestroy(stop)) print( "\nNOTE: The CUDA Samples are not meant for performance " "measurements. Results may vary when GPU Boost is enabled." 
@@ -1119,16 +1125,16 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): def main(): import pytest - common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0)) + common.pytest_skipif_compute_capability_too_low(find_cuda_device(), (7, 0)) if platform.machine() == "qnx": pytest.skip("globalToShmemAsyncCopy is not supported on QNX") - version = checkCudaErrors(cuda.cuDriverGetVersion()) + version = check_cuda_errors(cuda.cuDriverGetVersion()) if version < 11010: pytest.skip("CUDA Toolkit 11.1 or greater is required") - if checkCmdLineFlag("help") or checkCmdLineFlag("?"): + if check_cmd_line_flag("help") or check_cmd_line_flag("?"): print("Usage device=n (n >= 0 for deviceID)", file=sys.stderr) print(" wA=WidthA hA=HeightA (Width x Height of Matrix A)", file=sys.stderr) print(" wB=WidthB hB=HeightB (Width x Height of Matrix B)", file=sys.stderr) @@ -1149,54 +1155,54 @@ def main(): # This will pick the best possible CUDA capable device, otherwise # override the device ID based on input provided at the command line - devID = findCudaDevice() + dev_id = find_cuda_device() - matrixBlock = 32 - dimsA = cudart.dim3() - dimsA.x = dimsA.y = 10 * 4 * matrixBlock - dimsA.z = 1 - dimsB = cudart.dim3() - dimsB.x = dimsB.y = 10 * 4 * matrixBlock - dimsB.z = 1 + matrix_block = 32 + dims_a = cudart.dim3() + dims_a.x = dims_a.y = 10 * 4 * matrix_block + dims_a.z = 1 + dims_b = cudart.dim3() + dims_b.x = dims_b.y = 10 * 4 * matrix_block + dims_b.z = 1 # width of Matrix A - if checkCmdLineFlag("wA="): - dimsA.x = int(getCmdLineArgumentInt("wA=")) + if check_cmd_line_flag("wA="): + dims_a.x = int(get_cmd_line_argument_int("wA=")) # height of Matrix A - if checkCmdLineFlag("hA="): - dimsA.y = int(getCmdLineArgumentInt("hA=")) + if check_cmd_line_flag("hA="): + dims_a.y = int(get_cmd_line_argument_int("hA=")) # width of Matrix B - if checkCmdLineFlag("wB="): - dimsB.x = int(getCmdLineArgumentInt("wB=")) + if check_cmd_line_flag("wB="): + dims_b.x = 
int(get_cmd_line_argument_int("wB=")) # height of Matrix B - if checkCmdLineFlag("hB="): - dimsB.y = int(getCmdLineArgumentInt("hB=")) + if check_cmd_line_flag("hB="): + dims_b.y = int(get_cmd_line_argument_int("hB=")) - if dimsA.x != dimsB.y: - print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})", file=sys.stderr) + if dims_a.x != dims_b.y: + print(f"Error: outer matrix dimensions must be equal. ({dims_a.x} != {dims_b.y})", file=sys.stderr) sys.exit(1) - selected_kernel = kernels.AsyncCopyMultiStageLargeChunk + selected_kernel = Kernels.AsyncCopyMultiStageLargeChunk # kernel to run - default (AsyncCopyMultiStageLargeChunk == 0) - if checkCmdLineFlag("kernel="): - kernel_number = int(getCmdLineArgumentInt("kernel=")) + if check_cmd_line_flag("kernel="): + kernel_number = int(get_cmd_line_argument_int("kernel=")) if kernel_number < 8: - selected_kernel = kernels(kernel_number) + selected_kernel = Kernels(kernel_number) else: print("Error: kernel number should be between 0 to 7", file=sys.stderr) sys.exit(1) - major = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + major = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) if major < 7: pytest.skip("globalToShmemAsyncCopy requires SM 7.0 or higher.") - print(f"MatrixA({dimsA.x},{dimsA.y}), MatrixB({dimsB.x},{dimsB.y})") + print(f"MatrixA({dims_a.x},{dims_a.y}), MatrixB({dims_b.x},{dims_b.y})") global _MatrixMulAsyncCopyMultiStageLargeChunk global _MatrixMulAsyncCopyLargeChunk @@ -1206,17 +1212,17 @@ def main(): global _MatrixMulAsyncCopySingleStage global _MatrixMulNaive global _MatrixMulNaiveLargeChunk - with common.KernelHelper(globalToShmemAsyncCopy, devID) as kernelHelper: - _MatrixMulAsyncCopyMultiStageLargeChunk = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageLargeChunk") - _MatrixMulAsyncCopyLargeChunk = 
kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunk") - _MatrixMulAsyncCopyLargeChunkAWBarrier = kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunkAWBarrier") - _MatrixMulAsyncCopyMultiStageSharedState = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageSharedState") - _MatrixMulAsyncCopyMultiStage = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStage") - _MatrixMulAsyncCopySingleStage = kernelHelper.getFunction(b"MatrixMulAsyncCopySingleStage") - _MatrixMulNaive = kernelHelper.getFunction(b"MatrixMulNaive") - _MatrixMulNaiveLargeChunk = kernelHelper.getFunction(b"MatrixMulNaiveLargeChunk") - - matrix_result = MatrixMultiply(dimsA, dimsB, selected_kernel) + kernel_helper = common.KernelHelper(global_to_shmem_async_copy, dev_id) + _MatrixMulAsyncCopyMultiStageLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageLargeChunk") + _MatrixMulAsyncCopyLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunk") + _MatrixMulAsyncCopyLargeChunkAWBarrier = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunkAWBarrier") + _MatrixMulAsyncCopyMultiStageSharedState = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageSharedState") + _MatrixMulAsyncCopyMultiStage = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStage") + _MatrixMulAsyncCopySingleStage = kernel_helper.get_function(b"MatrixMulAsyncCopySingleStage") + _MatrixMulNaive = kernel_helper.get_function(b"MatrixMulNaive") + _MatrixMulNaiveLargeChunk = kernel_helper.get_function(b"MatrixMulNaiveLargeChunk") + + matrix_result = matrix_multiply(dims_a, dims_b, selected_kernel) if matrix_result != 0: sys.exit(1) diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py index 7746bd08e3..b08da3edc0 100644 --- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py @@ -6,7 +6,7 @@ import numpy as np from 
common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart @@ -14,7 +14,7 @@ THREADS_PER_BLOCK = 512 GRAPH_LAUNCH_ITERATIONS = 3 -simpleCudaGraphs = """\ +simple_cuda_graphs = """\ #include #include @@ -121,185 +121,185 @@ def init_input(a, size): a_list[i] = rnd.random() -def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks): +def cuda_graphs_manual(input_vec_h, input_vec_d, output_vec_d, result_d, input_size, num_of_blocks): result_h = ctypes.c_double(0.0) - nodeDependencies = [] + node_dependencies = [] - streamForGraph = checkCudaErrors(cudart.cudaStreamCreate()) + stream_for_graph = check_cuda_errors(cudart.cudaStreamCreate()) - kernelNodeParams = cuda.CUDA_KERNEL_NODE_PARAMS() - memcpyParams = cudart.cudaMemcpy3DParms() - memsetParams = cudart.cudaMemsetParams() + kernel_node_params = cuda.CUDA_KERNEL_NODE_PARAMS() + memcpy_params = cudart.cudaMemcpy3DParms() + memset_params = cudart.cudaMemsetParams() - memcpyParams.srcArray = None - memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.srcPtr = cudart.make_cudaPitchedPtr( - inputVec_h, np.dtype(np.float32).itemsize * inputSize, inputSize, 1 + memcpy_params.srcArray = None + memcpy_params.srcPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.srcPtr = cudart.make_cudaPitchedPtr( + input_vec_h, np.dtype(np.float32).itemsize * input_size, input_size, 1 ) - memcpyParams.dstArray = None - memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.dstPtr = cudart.make_cudaPitchedPtr( - inputVec_d, np.dtype(np.float32).itemsize * inputSize, inputSize, 1 + memcpy_params.dstArray = None + memcpy_params.dstPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.dstPtr = cudart.make_cudaPitchedPtr( + input_vec_d, np.dtype(np.float32).itemsize * input_size, input_size, 1 ) - 
memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * inputSize, 1, 1) - memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice + memcpy_params.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * input_size, 1, 1) + memcpy_params.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - memsetParams.dst = outputVec_d - memsetParams.value = 0 - memsetParams.pitch = 0 - memsetParams.elementSize = np.dtype(np.float32).itemsize # elementSize can be max 4 bytes - memsetParams.width = numOfBlocks * 2 - memsetParams.height = 1 + memset_params.dst = output_vec_d + memset_params.value = 0 + memset_params.pitch = 0 + memset_params.elementSize = np.dtype(np.float32).itemsize # elementSize can be max 4 bytes + memset_params.width = num_of_blocks * 2 + memset_params.height = 1 - graph = checkCudaErrors(cudart.cudaGraphCreate(0)) + graph = check_cuda_errors(cudart.cudaGraphCreate(0)) - memcpyNode = checkCudaErrors(cudart.cudaGraphAddMemcpyNode(graph, None, 0, memcpyParams)) - memsetNode = checkCudaErrors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memsetParams)) + memcpy_node = check_cuda_errors(cudart.cudaGraphAddMemcpyNode(graph, None, 0, memcpy_params)) + memset_node = check_cuda_errors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memset_params)) - nodeDependencies.append(memsetNode) - nodeDependencies.append(memcpyNode) + node_dependencies.append(memset_node) + node_dependencies.append(memcpy_node) - kernelArgs = ( - (inputVec_d, outputVec_d, inputSize, numOfBlocks), + kernel_args = ( + (input_vec_d, output_vec_d, input_size, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint), ) - kernelNodeParams.func = _reduce - kernelNodeParams.gridDimX = numOfBlocks - kernelNodeParams.gridDimY = kernelNodeParams.gridDimZ = 1 - kernelNodeParams.blockDimX = THREADS_PER_BLOCK - kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1 - kernelNodeParams.sharedMemBytes = 0 - kernelNodeParams.kernelParams = 
kernelArgs + kernel_node_params.func = _reduce + kernel_node_params.gridDimX = num_of_blocks + kernel_node_params.gridDimY = kernel_node_params.gridDimZ = 1 + kernel_node_params.blockDimX = THREADS_PER_BLOCK + kernel_node_params.blockDimY = kernel_node_params.blockDimZ = 1 + kernel_node_params.sharedMemBytes = 0 + kernel_node_params.kernelParams = kernel_args # kernelNodeParams.extra = None - kernelNode = checkCudaErrors( - cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams) + kernel_node = check_cuda_errors( + cuda.cuGraphAddKernelNode(graph, node_dependencies, len(node_dependencies), kernel_node_params) ) - nodeDependencies.clear() - nodeDependencies.append(kernelNode) - - memsetParams = cudart.cudaMemsetParams() - memsetParams.dst = result_d - memsetParams.value = 0 - memsetParams.elementSize = np.dtype(np.float32).itemsize - memsetParams.width = 2 - memsetParams.height = 1 - memsetNode = checkCudaErrors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memsetParams)) - - nodeDependencies.append(memsetNode) - - kernelNodeParams = cuda.CUDA_KERNEL_NODE_PARAMS() - kernelNodeParams.func = _reduceFinal - kernelNodeParams.gridDimX = kernelNodeParams.gridDimY = kernelNodeParams.gridDimZ = 1 - kernelNodeParams.blockDimX = THREADS_PER_BLOCK - kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1 - kernelNodeParams.sharedMemBytes = 0 - kernelArgs2 = ( - (outputVec_d, result_d, numOfBlocks), + node_dependencies.clear() + node_dependencies.append(kernel_node) + + memset_params = cudart.cudaMemsetParams() + memset_params.dst = result_d + memset_params.value = 0 + memset_params.elementSize = np.dtype(np.float32).itemsize + memset_params.width = 2 + memset_params.height = 1 + memset_node = check_cuda_errors(cudart.cudaGraphAddMemsetNode(graph, None, 0, memset_params)) + + node_dependencies.append(memset_node) + + kernel_node_params = cuda.CUDA_KERNEL_NODE_PARAMS() + kernel_node_params.func = _reduceFinal + kernel_node_params.gridDimX 
= kernel_node_params.gridDimY = kernel_node_params.gridDimZ = 1 + kernel_node_params.blockDimX = THREADS_PER_BLOCK + kernel_node_params.blockDimY = kernel_node_params.blockDimZ = 1 + kernel_node_params.sharedMemBytes = 0 + kernel_args2 = ( + (output_vec_d, result_d, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint), ) - kernelNodeParams.kernelParams = kernelArgs2 + kernel_node_params.kernelParams = kernel_args2 # kernelNodeParams.extra = None - kernelNode = checkCudaErrors( - cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams) + kernel_node = check_cuda_errors( + cuda.cuGraphAddKernelNode(graph, node_dependencies, len(node_dependencies), kernel_node_params) ) - nodeDependencies.clear() - nodeDependencies.append(kernelNode) - - memcpyParams = cudart.cudaMemcpy3DParms() - - memcpyParams.srcArray = None - memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.srcPtr = cudart.make_cudaPitchedPtr(result_d, np.dtype(np.float64).itemsize, 1, 1) - memcpyParams.dstArray = None - memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1) - memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1) - memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - memcpyNode = checkCudaErrors( - cudart.cudaGraphAddMemcpyNode(graph, nodeDependencies, len(nodeDependencies), memcpyParams) + node_dependencies.clear() + node_dependencies.append(kernel_node) + + memcpy_params = cudart.cudaMemcpy3DParms() + + memcpy_params.srcArray = None + memcpy_params.srcPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.srcPtr = cudart.make_cudaPitchedPtr(result_d, np.dtype(np.float64).itemsize, 1, 1) + memcpy_params.dstArray = None + memcpy_params.dstPos = cudart.make_cudaPos(0, 0, 0) + memcpy_params.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1) + memcpy_params.extent = 
cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1) + memcpy_params.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost + memcpy_node = check_cuda_errors( + cudart.cudaGraphAddMemcpyNode(graph, node_dependencies, len(node_dependencies), memcpy_params) ) - nodeDependencies.clear() - nodeDependencies.append(memcpyNode) + node_dependencies.clear() + node_dependencies.append(memcpy_node) # WIP: Host nodes - nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph)) - print(f"\nNum of nodes in the graph created manually = {numNodes}") + nodes, num_nodes = check_cuda_errors(cudart.cudaGraphGetNodes(graph)) + print(f"\nNum of nodes in the graph created manually = {num_nodes}") - graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0)) + graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(graph, 0)) - clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph)) - clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0)) + cloned_graph = check_cuda_errors(cudart.cudaGraphClone(graph)) + cloned_graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(cloned_graph, 0)) for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(graph_exec, stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) print("Cloned Graph Output..") for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(cloned_graph_exec, stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) - checkCudaErrors(cudart.cudaGraphExecDestroy(graphExec)) - checkCudaErrors(cudart.cudaGraphExecDestroy(clonedGraphExec)) - checkCudaErrors(cudart.cudaGraphDestroy(graph)) - 
checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph)) - checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph)) + check_cuda_errors(cudart.cudaGraphExecDestroy(graph_exec)) + check_cuda_errors(cudart.cudaGraphExecDestroy(cloned_graph_exec)) + check_cuda_errors(cudart.cudaGraphDestroy(graph)) + check_cuda_errors(cudart.cudaGraphDestroy(cloned_graph)) + check_cuda_errors(cudart.cudaStreamDestroy(stream_for_graph)) -def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks): +def cuda_graphs_using_stream_capture(input_vec_h, input_vec_d, output_vec_d, result_d, input_size, num_of_blocks): result_h = ctypes.c_double(0.0) - stream1 = checkCudaErrors(cudart.cudaStreamCreate()) - stream2 = checkCudaErrors(cudart.cudaStreamCreate()) - stream3 = checkCudaErrors(cudart.cudaStreamCreate()) - streamForGraph = checkCudaErrors(cudart.cudaStreamCreate()) + stream1 = check_cuda_errors(cudart.cudaStreamCreate()) + stream2 = check_cuda_errors(cudart.cudaStreamCreate()) + stream3 = check_cuda_errors(cudart.cudaStreamCreate()) + stream_for_graph = check_cuda_errors(cudart.cudaStreamCreate()) - forkStreamEvent = checkCudaErrors(cudart.cudaEventCreate()) - memsetEvent1 = checkCudaErrors(cudart.cudaEventCreate()) - memsetEvent2 = checkCudaErrors(cudart.cudaEventCreate()) + fork_stream_event = check_cuda_errors(cudart.cudaEventCreate()) + memset_event1 = check_cuda_errors(cudart.cudaEventCreate()) + memset_event2 = check_cuda_errors(cudart.cudaEventCreate()) - checkCudaErrors(cudart.cudaStreamBeginCapture(stream1, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)) + check_cuda_errors(cudart.cudaStreamBeginCapture(stream1, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)) - checkCudaErrors(cudart.cudaEventRecord(forkStreamEvent, stream1)) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream2, forkStreamEvent, 0)) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream3, forkStreamEvent, 0)) + 
check_cuda_errors(cudart.cudaEventRecord(fork_stream_event, stream1)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream2, fork_stream_event, 0)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream3, fork_stream_event, 0)) - checkCudaErrors( + check_cuda_errors( cudart.cudaMemcpyAsync( - inputVec_d, - inputVec_h, - np.dtype(np.float32).itemsize * inputSize, + input_vec_d, + input_vec_h, + np.dtype(np.float32).itemsize * input_size, cudart.cudaMemcpyKind.cudaMemcpyDefault, stream1, ) ) - checkCudaErrors(cudart.cudaMemsetAsync(outputVec_d, 0, np.dtype(np.float64).itemsize * numOfBlocks, stream2)) + check_cuda_errors(cudart.cudaMemsetAsync(output_vec_d, 0, np.dtype(np.float64).itemsize * num_of_blocks, stream2)) - checkCudaErrors(cudart.cudaEventRecord(memsetEvent1, stream2)) + check_cuda_errors(cudart.cudaEventRecord(memset_event1, stream2)) - checkCudaErrors(cudart.cudaMemsetAsync(result_d, 0, np.dtype(np.float64).itemsize, stream3)) - checkCudaErrors(cudart.cudaEventRecord(memsetEvent2, stream3)) + check_cuda_errors(cudart.cudaMemsetAsync(result_d, 0, np.dtype(np.float64).itemsize, stream3)) + check_cuda_errors(cudart.cudaEventRecord(memset_event2, stream3)) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent1, 0)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream1, memset_event1, 0)) - kernelArgs = ( - (inputVec_d, outputVec_d, inputSize, numOfBlocks), + kernel_args = ( + (input_vec_d, output_vec_d, input_size, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint), ) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( _reduce, - numOfBlocks, + num_of_blocks, 1, 1, THREADS_PER_BLOCK, @@ -307,20 +307,20 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, 1, 0, stream1, - kernelArgs, + kernel_args, 0, ) ) - checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent2, 0)) + check_cuda_errors(cudart.cudaStreamWaitEvent(stream1, memset_event2, 0)) - kernelArgs2 = ( - 
(outputVec_d, result_d, numOfBlocks), + kernel_args2 = ( + (output_vec_d, result_d, num_of_blocks), (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint), ) - checkCudaErrors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernelArgs2, 0)) + check_cuda_errors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernel_args2, 0)) - checkCudaErrors( + check_cuda_errors( cudart.cudaMemcpyAsync( result_h, result_d, @@ -332,71 +332,67 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, # WIP: Host nodes - graph = checkCudaErrors(cudart.cudaStreamEndCapture(stream1)) + graph = check_cuda_errors(cudart.cudaStreamEndCapture(stream1)) - nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph)) - print(f"\nNum of nodes in the graph created using stream capture API = {numNodes}") + nodes, num_nodes = check_cuda_errors(cudart.cudaGraphGetNodes(graph)) + print(f"\nNum of nodes in the graph created using stream capture API = {num_nodes}") - graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0)) + graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(graph, 0)) - clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph)) - clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0)) + cloned_graph = check_cuda_errors(cudart.cudaGraphClone(graph)) + cloned_graph_exec = check_cuda_errors(cudart.cudaGraphInstantiate(cloned_graph, 0)) for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(graph_exec, stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) print("Cloned Graph Output..") for _i in range(GRAPH_LAUNCH_ITERATIONS): - checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph)) + check_cuda_errors(cudart.cudaGraphLaunch(cloned_graph_exec, 
stream_for_graph)) - checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) + check_cuda_errors(cudart.cudaStreamSynchronize(stream_for_graph)) - checkCudaErrors(cudart.cudaGraphExecDestroy(graphExec)) - checkCudaErrors(cudart.cudaGraphExecDestroy(clonedGraphExec)) - checkCudaErrors(cudart.cudaGraphDestroy(graph)) - checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph)) - checkCudaErrors(cudart.cudaEventDestroy(memsetEvent2)) - checkCudaErrors(cudart.cudaEventDestroy(memsetEvent1)) - checkCudaErrors(cudart.cudaEventDestroy(forkStreamEvent)) - checkCudaErrors(cudart.cudaStreamDestroy(stream3)) - checkCudaErrors(cudart.cudaStreamDestroy(stream1)) - checkCudaErrors(cudart.cudaStreamDestroy(stream2)) - checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph)) + check_cuda_errors(cudart.cudaGraphExecDestroy(graph_exec)) + check_cuda_errors(cudart.cudaGraphExecDestroy(cloned_graph_exec)) + check_cuda_errors(cudart.cudaGraphDestroy(graph)) + check_cuda_errors(cudart.cudaGraphDestroy(cloned_graph)) + check_cuda_errors(cudart.cudaStreamDestroy(stream1)) + check_cuda_errors(cudart.cudaStreamDestroy(stream2)) + check_cuda_errors(cudart.cudaStreamDestroy(stream_for_graph)) def main(): size = 1 << 24 # number of elements to reduce - maxBlocks = 512 + max_blocks = 512 # This will pick the best possible CUDA capable device - devID = findCudaDevice() + dev_id = find_cuda_device() global _reduce global _reduceFinal - with common.KernelHelper(simpleCudaGraphs, devID) as kernelHelper: - _reduce = kernelHelper.getFunction(b"reduce") - _reduceFinal = kernelHelper.getFunction(b"reduceFinal") + kernel_helper = common.KernelHelper(simple_cuda_graphs, dev_id) + _reduce = kernel_helper.get_function(b"reduce") + _reduceFinal = kernel_helper.get_function(b"reduceFinal") - print(f"{size} elements") - print(f"threads per block = {THREADS_PER_BLOCK}") - print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}") + print(f"{size} elements") + print(f"threads per block = 
{THREADS_PER_BLOCK}") + print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}") - inputVec_h = checkCudaErrors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize)) - inputVec_d = checkCudaErrors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize)) - outputVec_d = checkCudaErrors(cudart.cudaMalloc(maxBlocks * np.dtype(np.float64).itemsize)) - result_d = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.float64).itemsize)) + input_vec_h = check_cuda_errors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize)) + input_vec_d = check_cuda_errors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize)) + output_vec_d = check_cuda_errors(cudart.cudaMalloc(max_blocks * np.dtype(np.float64).itemsize)) + result_d = check_cuda_errors(cudart.cudaMalloc(np.dtype(np.float64).itemsize)) - init_input(inputVec_h, size) + init_input(input_vec_h, size) - cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks) - cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks) + cuda_graphs_manual(input_vec_h, input_vec_d, output_vec_d, result_d, size, max_blocks) + cuda_graphs_using_stream_capture(input_vec_h, input_vec_d, output_vec_d, result_d, size, max_blocks) - checkCudaErrors(cudart.cudaFree(inputVec_d)) - checkCudaErrors(cudart.cudaFree(outputVec_d)) - checkCudaErrors(cudart.cudaFree(result_d)) - checkCudaErrors(cudart.cudaFreeHost(inputVec_h)) + check_cuda_errors(cudart.cudaFree(input_vec_d)) + check_cuda_errors(cudart.cudaFree(output_vec_d)) + check_cuda_errors(cudart.cudaFree(result_d)) + check_cuda_errors(cudart.cudaFreeHost(input_vec_h)) if __name__ == "__main__": diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index 257a7afa14..8ef5506257 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ 
b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -9,12 +9,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors, findCudaDevice +from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -conjugateGradientMultiBlockCG = """\ +conjugate_gradient_multi_block_cg = """\ #line __LINE__ #include #include @@ -163,37 +163,37 @@ """ -def genTridiag(I, J, val, N, nz): - I[0] = 0 - J[0] = 0 - J[1] = 0 +def gen_tridiag(i, j, val, n, nz): + i[0] = 0 + j[0] = 0 + j[1] = 0 val[0] = float(random()) + 10.0 val[1] = float(random()) - for i in range(1, N): + for i in range(1, n): if i > 1: - I[i] = I[i - 1] + 3 + i[i] = i[i - 1] + 3 else: - I[1] = 2 + i[1] = 2 start = (i - 1) * 3 + 2 - J[start] = i - 1 - J[start + 1] = i + j[start] = i - 1 + j[start + 1] = i - if i < N - 1: - J[start + 2] = i + 1 + if i < n - 1: + j[start + 2] = i + 1 val[start] = val[start - 1] val[start + 1] = float(random()) + 10.0 - if i < N - 1: + if i < n - 1: val[start + 2] = float(random()) - I[N] = nz + i[n] = nz THREADS_PER_BLOCK = 512 -sSDKname = "conjugateGradientMultiBlockCG" +s_sd_kname = "conjugateGradientMultiBlockCG" def main(): @@ -214,139 +214,137 @@ def main(): pytest.skip("conjugateGradientMultiBlockCG is not supported on QNX") # This will pick the best possible CUDA capable device - devID = findCudaDevice() - deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(devID)) + dev_id = find_cuda_device() + device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) - if not deviceProp.managedMemory: + if not device_prop.managedMemory: pytest.skip("Unified Memory not supported on this device") # This sample requires being run on a device that supports Cooperative Kernel # Launch - if not deviceProp.cooperativeLaunch: - pytest.skip(f"Selected GPU {devID} does not support Cooperative Kernel Launch") + if not 
device_prop.cooperativeLaunch: + pytest.skip(f"Selected GPU {dev_id} does not support Cooperative Kernel Launch") # Statistics about the GPU device print( - f"> GPU device has {deviceProp.multiProcessorCount:%d} Multi-Processors, SM {deviceProp.major:%d}.{deviceProp.minor:%d} compute capabilities\n" + f"> GPU device has {device_prop.multiProcessorCount:%d} Multi-Processors, SM {device_prop.major:%d}.{device_prop.minor:%d} compute capabilities\n" ) # Get kernel - with common.KernelHelper(conjugateGradientMultiBlockCG, devID) as kernelHelper: - _gpuConjugateGradient = kernelHelper.getFunction(b"gpuConjugateGradient") - - # Generate a random tridiagonal symmetric matrix in CSR format - N = 1048576 - nz = (N - 2) * 3 + 4 - - I = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (N + 1), cudart.cudaMemAttachGlobal)) - J = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal)) - val = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal)) - I_local = (ctypes.c_int * (N + 1)).from_address(I) - J_local = (ctypes.c_int * nz).from_address(J) - val_local = (ctypes.c_float * nz).from_address(val) - - genTridiag(I_local, J_local, val_local, N, nz) - - x = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - rhs = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - dot_result = checkCudaErrors( - cudart.cudaMallocManaged(np.dtype(np.float64).itemsize, cudart.cudaMemAttachGlobal) - ) - x_local = (ctypes.c_float * N).from_address(x) - rhs_local = (ctypes.c_float * N).from_address(rhs) - dot_result_local = (ctypes.c_double).from_address(dot_result) - dot_result_local = 0 - - # temp memory for CG - r = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - p = 
checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - Ax = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) - r_local = (ctypes.c_float * N).from_address(r) - - checkCudaErrors(cudart.cudaDeviceSynchronize()) - - start = checkCudaErrors(cudart.cudaEventCreate()) - stop = checkCudaErrors(cudart.cudaEventCreate()) - - for i in range(N): - r_local[i] = rhs_local[i] = 1.0 - x_local[i] = 0.0 - - kernelArgs_value = (I, J, val, x, Ax, p, r, dot_result, nz, N, tol) - kernelArgs_types = ( - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_int, - ctypes.c_float, - ) - kernelArgs = (kernelArgs_value, kernelArgs_types) + kernel_helper = common.KernelHelper(conjugate_gradient_multi_block_cg, dev_id) + _gpu_conjugate_gradient = kernel_helper.get_function(b"gpuConjugateGradient") + + # Generate a random tridiagonal symmetric matrix in CSR format + n = 1048576 + nz = (n - 2) * 3 + 4 + + i = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (n + 1), cudart.cudaMemAttachGlobal)) + j = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal)) + val = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal)) + i_local = (ctypes.c_int * (n + 1)).from_address(i) + j_local = (ctypes.c_int * nz).from_address(j) + val_local = (ctypes.c_float * nz).from_address(val) + + gen_tridiag(i_local, j_local, val_local, n, nz) + + x = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + rhs = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + dot_result = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float64).itemsize, 
cudart.cudaMemAttachGlobal)) + x_local = (ctypes.c_float * n).from_address(x) + rhs_local = (ctypes.c_float * n).from_address(rhs) + dot_result_local = (ctypes.c_double).from_address(dot_result) + dot_result_local = 0 + + # temp memory for CG + r = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + p = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + ax = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal)) + r_local = (ctypes.c_float * n).from_address(r) + + check_cuda_errors(cudart.cudaDeviceSynchronize()) + + start = check_cuda_errors(cudart.cudaEventCreate()) + stop = check_cuda_errors(cudart.cudaEventCreate()) + + for i in range(n): + r_local[i] = rhs_local[i] = 1.0 + x_local[i] = 0.0 + + kernel_args_value = (i, j, val, x, ax, p, r, dot_result, nz, n, tol) + kernel_args_types = ( + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ) + kernel_args = (kernel_args_value, kernel_args_types) - sMemSize = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK / 32) + 1) - numThreads = THREADS_PER_BLOCK - numBlocksPerSm = checkCudaErrors( - cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpuConjugateGradient, numThreads, sMemSize) - ) - numSms = deviceProp.multiProcessorCount - dimGrid = cudart.dim3() - dimGrid.x = numSms * numBlocksPerSm - dimGrid.y = 1 - dimGrid.z = 1 - dimBlock = cudart.dim3() - dimBlock.x = THREADS_PER_BLOCK - dimBlock.y = 1 - dimBlock.z = 1 - - checkCudaErrors(cudart.cudaEventRecord(start, 0)) - checkCudaErrors( - cuda.cuLaunchCooperativeKernel( - _gpuConjugateGradient, - dimGrid.x, - dimGrid.y, - dimGrid.z, - dimBlock.x, - dimBlock.y, - dimBlock.z, - 0, - 0, - kernelArgs, - ) + s_mem_size = np.dtype(np.float64).itemsize * 
((THREADS_PER_BLOCK / 32) + 1) + num_threads = THREADS_PER_BLOCK + num_blocks_per_sm = check_cuda_errors( + cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpu_conjugate_gradient, num_threads, s_mem_size) + ) + num_sms = device_prop.multiProcessorCount + dim_grid = cudart.dim3() + dim_grid.x = num_sms * num_blocks_per_sm + dim_grid.y = 1 + dim_grid.z = 1 + dim_block = cudart.dim3() + dim_block.x = THREADS_PER_BLOCK + dim_block.y = 1 + dim_block.z = 1 + + check_cuda_errors(cudart.cudaEventRecord(start, 0)) + check_cuda_errors( + cuda.cuLaunchCooperativeKernel( + _gpu_conjugate_gradient, + dim_grid.x, + dim_grid.y, + dim_grid.z, + dim_block.x, + dim_block.y, + dim_block.z, + 0, + 0, + kernel_args, ) - checkCudaErrors(cudart.cudaEventRecord(stop, 0)) - checkCudaErrors(cudart.cudaDeviceSynchronize()) - - time = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop)) - print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms") - - err = 0.0 - for i in range(N): - rsum = 0.0 - - for j in range(I_local[i], I_local[i + 1]): - rsum += val_local[j] * x_local[J_local[j]] - - diff = math.fabs(rsum - rhs_local[i]) - - if diff > err: - err = diff - - checkCudaErrors(cudart.cudaFree(I)) - checkCudaErrors(cudart.cudaFree(J)) - checkCudaErrors(cudart.cudaFree(val)) - checkCudaErrors(cudart.cudaFree(x)) - checkCudaErrors(cudart.cudaFree(rhs)) - checkCudaErrors(cudart.cudaFree(r)) - checkCudaErrors(cudart.cudaFree(p)) - checkCudaErrors(cudart.cudaFree(Ax)) - checkCudaErrors(cudart.cudaFree(dot_result)) - checkCudaErrors(cudart.cudaEventDestroy(start)) - checkCudaErrors(cudart.cudaEventDestroy(stop)) + ) + check_cuda_errors(cudart.cudaEventRecord(stop, 0)) + check_cuda_errors(cudart.cudaDeviceSynchronize()) + + time = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop)) + print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms") + + err = 0.0 + for row in range(n): + rsum = 0.0 + + for col in range(i_local[row], i_local[row + 1]): + rsum += val_local[col] * x_local[j_local[col]] + + diff = math.fabs(rsum - rhs_local[row]) + + if diff > err: + err = diff + + check_cuda_errors(cudart.cudaFree(i)) + check_cuda_errors(cudart.cudaFree(j)) + check_cuda_errors(cudart.cudaFree(val)) + check_cuda_errors(cudart.cudaFree(x)) + check_cuda_errors(cudart.cudaFree(rhs)) + check_cuda_errors(cudart.cudaFree(r)) + check_cuda_errors(cudart.cudaFree(p)) + check_cuda_errors(cudart.cudaFree(ax)) + check_cuda_errors(cudart.cudaFree(dot_result)) + check_cuda_errors(cudart.cudaEventDestroy(start)) + check_cuda_errors(cudart.cudaEventDestroy(stop)) print(f"Test Summary: Error amount = {err:f}") if math.sqrt(dot_result_local) >= tol: diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index 8723abe26a..5b5151ef24 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -2,10 +2,8 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from contextlib import suppress - import numpy as np -from common.helper_cuda import checkCudaErrors +from common.helper_cuda import check_cuda_errors from cuda import pathfinder from cuda.bindings import driver as cuda @@ -13,14 +11,14 @@ from cuda.bindings import runtime as cudart -def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor): +def pytest_skipif_compute_capability_too_low(dev_id, required_cc_major_minor): import pytest - cc_major = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + cc_major = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) - cc_minor = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID) + cc_minor = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, dev_id) )
have_cc_major_minor = (cc_major, cc_minor) if have_cc_major_minor < required_cc_major_minor: @@ -28,8 +26,7 @@ def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor): class KernelHelper: - def __init__(self, code, devID): - self.module = None + def __init__(self, code, dev_id): include_dirs = [] for libname in ("cudart", "cccl"): hdr_dir = pathfinder.find_nvidia_header_directory(libname) @@ -39,18 +36,18 @@ def __init__(self, code, devID): pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None') include_dirs.append(hdr_dir) - prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) + prog = check_cuda_errors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) # Initialize CUDA - checkCudaErrors(cudart.cudaFree(0)) + check_cuda_errors(cudart.cudaFree(0)) - major = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + major = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) - minor = checkCudaErrors( - cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID) + minor = check_cuda_errors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, dev_id) ) - _, nvrtc_minor = checkCudaErrors(nvrtc.nvrtcVersion()) + _, nvrtc_minor = check_cuda_errors(nvrtc.nvrtcVersion()) use_cubin = nvrtc_minor >= 1 prefix = "sm" if use_cubin else "compute" arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii") @@ -65,44 +62,27 @@ def __init__(self, code, devID): opts.append(f"--include-path={inc_dir}".encode()) try: - checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts)) - - if use_cubin: - dataSize = checkCudaErrors(nvrtc.nvrtcGetCUBINSize(prog)) - data = b" " * dataSize - checkCudaErrors(nvrtc.nvrtcGetCUBIN(prog, data)) - else: - dataSize = 
checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog)) - data = b" " * dataSize - checkCudaErrors(nvrtc.nvrtcGetPTX(prog, data)) + check_cuda_errors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts)) except RuntimeError as err: - logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog)) - log = b" " * logSize - checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log)) + log_size = check_cuda_errors(nvrtc.nvrtcGetProgramLogSize(prog)) + log = b" " * log_size + check_cuda_errors(nvrtc.nvrtcGetProgramLog(prog, log)) import sys print(log.decode(), file=sys.stderr) print(err, file=sys.stderr) sys.exit(1) - finally: - checkCudaErrors(nvrtc.nvrtcDestroyProgram(prog)) - - self.module = checkCudaErrors(cuda.cuModuleLoadData(np.char.array(data))) - - def getFunction(self, name): - return checkCudaErrors(cuda.cuModuleGetFunction(self.module, name)) - - def close(self): - if self.module is not None: - checkCudaErrors(cuda.cuModuleUnload(self.module)) - self.module = None - def __enter__(self): - return self + if use_cubin: + data_size = check_cuda_errors(nvrtc.nvrtcGetCUBINSize(prog)) + data = b" " * data_size + check_cuda_errors(nvrtc.nvrtcGetCUBIN(prog, data)) + else: + data_size = check_cuda_errors(nvrtc.nvrtcGetPTXSize(prog)) + data = b" " * data_size + check_cuda_errors(nvrtc.nvrtcGetPTX(prog, data)) - def __exit__(self, exc_type, exc, tb): - self.close() + self.module = check_cuda_errors(cuda.cuModuleLoadData(np.char.array(data))) - def __del__(self): - with suppress(Exception): - self.close() + def get_function(self, name): + return check_cuda_errors(cuda.cuModuleGetFunction(self.module, name)) diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/examples/common/helper_cuda.py index d741eb54d9..9fbfe8c82f 100644 --- a/cuda_bindings/examples/common/helper_cuda.py +++ b/cuda_bindings/examples/common/helper_cuda.py @@ -1,14 +1,14 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import nvrtc from cuda.bindings import runtime as cudart -def _cudaGetErrorEnum(error): +def _cuda_get_error_enum(error): if isinstance(error, cuda.CUresult): err, name = cuda.cuGetErrorName(error) return name if err == cuda.CUresult.CUDA_SUCCESS else "" @@ -20,9 +20,9 @@ def _cudaGetErrorEnum(error): raise RuntimeError(f"Unknown error type: {error}") -def checkCudaErrors(result): +def check_cuda_errors(result): if result[0].value: - raise RuntimeError(f"CUDA error code={result[0].value}({_cudaGetErrorEnum(result[0])})") + raise RuntimeError(f"CUDA error code={result[0].value}({_cuda_get_error_enum(result[0])})") if len(result) == 1: return None elif len(result) == 2: @@ -31,18 +31,18 @@ def checkCudaErrors(result): return result[1:] -def findCudaDevice(): - devID = 0 - if checkCmdLineFlag("device="): - devID = getCmdLineArgumentInt("device=") - checkCudaErrors(cudart.cudaSetDevice(devID)) - return devID +def find_cuda_device(): + dev_id = 0 + if check_cmd_line_flag("device="): + dev_id = get_cmd_line_argument_int("device=") + check_cuda_errors(cudart.cudaSetDevice(dev_id)) + return dev_id -def findCudaDeviceDRV(): - devID = 0 - if checkCmdLineFlag("device="): - devID = getCmdLineArgumentInt("device=") - checkCudaErrors(cuda.cuInit(0)) - cuDevice = checkCudaErrors(cuda.cuDeviceGet(devID)) - return cuDevice +def find_cuda_device_drv(): + dev_id = 0 + if check_cmd_line_flag("device="): + dev_id = get_cmd_line_argument_int("device=") + check_cuda_errors(cuda.cuInit(0)) + cu_device = check_cuda_errors(cuda.cuDeviceGet(dev_id)) + return cu_device diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/examples/common/helper_string.py index 9f8e70a6c4..47d9d36569 100644 --- 
a/cuda_bindings/examples/common/helper_string.py +++ b/cuda_bindings/examples/common/helper_string.py @@ -4,12 +4,12 @@ import sys -def checkCmdLineFlag(stringRef): - return any(stringRef == i and k < len(sys.argv) - 1 for i, k in enumerate(sys.argv)) +def check_cmd_line_flag(string_ref): + return any(string_ref == i and k < len(sys.argv) - 1 for i, k in enumerate(sys.argv)) -def getCmdLineArgumentInt(stringRef): +def get_cmd_line_argument_int(string_ref): for i, k in enumerate(sys.argv): - if stringRef == i and k < len(sys.argv) - 1: + if string_ref == i and k < len(sys.argv) - 1: return sys.argv[k + 1] return 0 diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index 148d836adf..21303664ac 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -5,12 +5,12 @@ import numpy as np from common import common -from common.helper_cuda import checkCudaErrors +from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart -isoPropagator = """\ +iso_propagator = """\ extern "C" __global__ void injectSource(float *__restrict__ in, float *__restrict__ src, int it) { @@ -177,7 +177,7 @@ def align_ny(ny, blk, nops): # # this class contains the input params # -class params: +class Params: def __init__(self): self.BDIMX = 32 # tiles x y for fd operators self.BDIMY = 16 @@ -209,53 +209,53 @@ def __init__(self): # # this class contains all the kernels to be used bu propagator # -class cudaKernels: +class CudaKernels: def __init__(self, cntx): - checkCudaErrors(cuda.cuInit(0)) - checkCudaErrors(cuda.cuCtxSetCurrent(cntx)) - dev = checkCudaErrors(cuda.cuCtxGetDevice()) + check_cuda_errors(cuda.cuInit(0)) + check_cuda_errors(cuda.cuCtxSetCurrent(cntx)) + dev = check_cuda_errors(cuda.cuCtxGetDevice()) - self.kernelHelper = common.KernelHelper(isoPropagator, int(dev)) + 
self.kernel_helper = common.KernelHelper(iso_propagator, int(dev)) # kernel to create a source fnction with some max frequency - self.creatSource = self.kernelHelper.getFunction(b"createSource") + self.creatSource = self.kernel_helper.get_function(b"createSource") # create a velocity to try things: just a sphere on the middle 4500 m/s and 2500 m/s all around - self.createVelocity = self.kernelHelper.getFunction(b"createVelocity") + self.create_velocity = self.kernel_helper.get_function(b"createVelocity") # kernel to propagate the wavefield by 1 step in time - self.fdPropag = self.kernelHelper.getFunction(b"fwd_3D_orderX2k") + self.fdPropag = self.kernel_helper.get_function(b"fwd_3D_orderX2k") # kernel to propagate the wavefield by 1 step in time - self.injectSource = self.kernelHelper.getFunction(b"injectSource") + self.inject_source = self.kernel_helper.get_function(b"injectSource") # # this class contains: propagator, source creation, velocity creation # injection of data and domain exchange # -class propagator: +class Propagator: def __init__(self, params, _dev): print("init object for device ", _dev) self.dev = _dev - checkCudaErrors(cuda.cuInit(0)) - self.cuDevice = checkCudaErrors(cuda.cuDeviceGet(_dev)) - self.context = checkCudaErrors(cuda.cuCtxCreate(None, 0, self.cuDevice)) + check_cuda_errors(cuda.cuInit(0)) + self.cu_device = check_cuda_errors(cuda.cuDeviceGet(_dev)) + self.context = check_cuda_errors(cuda.cuCtxCreate(None, 0, self.cu_device)) self.waveOut = 0 self.waveIn = 0 - self.streamCenter = checkCudaErrors(cuda.cuStreamCreate(0)) - self.streamHalo = checkCudaErrors(cuda.cuStreamCreate(0)) - self.params = params + self.streamCenter = check_cuda_errors(cuda.cuStreamCreate(0)) + self.streamHalo = check_cuda_errors(cuda.cuStreamCreate(0)) + self.Params = params def __del__(self): - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors(cuda.cuStreamDestroy(self.streamHalo)) - checkCudaErrors(cuda.cuStreamDestroy(self.streamCenter)) + 
check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuStreamDestroy(self.streamHalo)) + check_cuda_errors(cuda.cuStreamDestroy(self.streamCenter)) if self.waveIn != 0: - checkCudaErrors(cuda.cuMemFree(self.waveIn)) + check_cuda_errors(cuda.cuMemFree(self.waveIn)) if self.waveOut != 0: - checkCudaErrors(cuda.cuMemFree(self.waveOut)) - checkCudaErrors(cuda.cuCtxDestroy(self.context)) + check_cuda_errors(cuda.cuMemFree(self.waveOut)) + check_cuda_errors(cuda.cuCtxDestroy(self.context)) # # swap waveIn with waveOut @@ -275,45 +275,45 @@ def swap(self): # allocate the device memory # def allocate(self): - nel = self.params.nx * self.params.ny * self.params.nz + nel = self.Params.nx * self.Params.ny * self.Params.nz n = np.array(nel, dtype=np.uint32) - bufferSize = n * np.dtype(np.float32).itemsize - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + buffer_size = n * np.dtype(np.float32).itemsize + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) - self.velocity = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.velocity, 0, n)) + self.velocity = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.velocity, 0, n)) - nel += self.params.lead + nel += self.Params.lead n = np.array(nel, dtype=np.uint32) ## we need to align at the beginning of the tile - bufferSize = n * np.dtype(np.float32).itemsize - self.waveIn = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.waveIn, 0, n)) + buffer_size = n * np.dtype(np.float32).itemsize + self.waveIn = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.waveIn, 0, n)) - self.waveOut = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.waveOut, 0, n)) + self.waveOut = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.waveOut, 0, n)) - n = np.array(self.params.nt, dtype=np.uint32) - 
bufferSize = n * np.dtype(np.float32).itemsize - self.source = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) - checkCudaErrors(cuda.cuMemsetD32(self.source, 0, n)) + n = np.array(self.Params.nt, dtype=np.uint32) + buffer_size = n * np.dtype(np.float32).itemsize + self.source = check_cuda_errors(cuda.cuMemAlloc(buffer_size)) + check_cuda_errors(cuda.cuMemsetD32(self.source, 0, n)) # # create source data # - def createSource(self, kernel): + def create_source(self, kernel): print("creating source on device ", self.dev) buf = np.array([int(self.source)], dtype=np.uint64) - nt = np.array(self.params.nt, dtype=np.uint32) - dt = np.array(self.params.dt, dtype=np.float32) - freq = np.array(self.params.freqMax, dtype=np.float32) + nt = np.array(self.Params.nt, dtype=np.uint32) + dt = np.array(self.Params.dt, dtype=np.float32) + freq = np.array(self.Params.freqMax, dtype=np.float32) args = [buf, dt, freq, nt] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors( + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors( cuda.cuLaunchKernel( kernel.creatSource, 1, @@ -328,34 +328,34 @@ def createSource(self, kernel): 0, ) ) # arguments - checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo)) + check_cuda_errors(cuda.cuStreamSynchronize(self.streamHalo)) # # inject source function: ony on the domain 0 # - def injectSource(self, kernel, iter): - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + def inject_source(self, kernel, iter): + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) if self.dev != 0: return wavein = np.array([int(self.waveIn)], dtype=np.uint64) src = np.array([int(self.source)], dtype=np.uint64) - offset_sourceInject = ( - self.params.lead - + (int)(self.params.nz / 2) * self.params.nx * self.params.ny - + (int)(self.params.ny / 2) * self.params.nx - + (int)(self.params.nx / 2) + offset_source_inject = ( + self.Params.lead + + 
(int)(self.Params.nz / 2) * self.Params.nx * self.Params.ny + + (int)(self.Params.ny / 2) * self.Params.nx + + (int)(self.Params.nx / 2) ) - offset_sourceInject *= np.dtype(np.float32).itemsize + offset_source_inject *= np.dtype(np.float32).itemsize np_it = np.array(iter, dtype=np.uint32) - args = [wavein + offset_sourceInject, src, np_it] + args = [wavein + offset_source_inject, src, np_it] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - kernel.injectSource, + kernel.inject_source, 1, 1, 1, # grid dim @@ -372,39 +372,39 @@ def injectSource(self, kernel, iter): # # create velocity # - def createVelocity(self, kernel): + def create_velocity(self, kernel): print("running create velocity on device ", self.dev) offset_velocity = ( - self.params.FD_ORDER * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + self.Params.FD_ORDER * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) offset_velocity *= np.dtype(np.float32).itemsize vel = np.array([int(self.velocity)], dtype=np.uint64) - dx_dt2 = (self.params.dt * self.params.dt) / (self.params.delta * self.params.delta) + dx_dt2 = (self.Params.dt * self.Params.dt) / (self.Params.delta * self.Params.delta) - stride = self.params.nx * self.params.ny + stride = self.Params.nx * self.Params.ny np_dx_dt2 = np.array(dx_dt2, dtype=np.float32) - np_nz = np.array((self.params.nz - 2 * self.params.FD_ORDER), dtype=np.uint32) - np_nx = np.array(self.params.nx, dtype=np.uint32) + np_nz = np.array((self.Params.nz - 2 * self.Params.FD_ORDER), dtype=np.uint32) + np_nx = np.array(self.Params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + 
check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) # do halo up - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( - kernel.createVelocity, - self.params.blkx, - self.params.blky, + kernel.create_velocity, + self.Params.blkx, + self.Params.blky, 1, # grid dim - 2 * self.params.BDIMX, - self.params.BDIMY, + 2 * self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamHalo, # shared mem and stream @@ -412,22 +412,22 @@ def createVelocity(self, kernel): 0, ) ) # arguments - checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo)) + check_cuda_errors(cuda.cuStreamSynchronize(self.streamHalo)) # # execute the center part of propagation # - def executeCenter(self, kernel): + def execute_center(self, kernel): if verbose_prints: print("running center on device ", self.dev) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) offset_velocity = ( - 2 * self.params.FD_ORDER * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + 2 * self.Params.FD_ORDER * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) - offset_wave = self.params.lead + offset_velocity + offset_wave = self.Params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize @@ -436,9 +436,9 @@ def executeCenter(self, kernel): waveout = np.array([int(self.waveOut)], dtype=np.uint64) vel = np.array([int(self.velocity)], dtype=np.uint64) - stride = self.params.nx * self.params.ny - np_nz = np.array(self.params.nz - 4 * self.params.FD_ORDER, dtype=np.uint32) - np_nx = np.array(self.params.nx, dtype=np.uint32) + stride = self.Params.nx * self.Params.ny + np_nz = np.array(self.Params.nz - 4 * self.Params.FD_ORDER, dtype=np.uint32) + np_nx = np.array(self.Params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) args = [ @@ -452,14 +452,14 @@ def executeCenter(self, 
kernel): argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do center propagation from 2 * fd_order to nz - 2 * fd_order - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( kernel.fdPropag, - self.params.blkx, - self.params.blky, + self.Params.blkx, + self.Params.blky, 1, # grid dim - self.params.BDIMX, - self.params.BDIMY, + self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamCenter, # shared mem and stream @@ -471,18 +471,18 @@ def executeCenter(self, kernel): # # execute the halo part of propagation # - def executeHalo(self, kernel): + def execute_halo(self, kernel): if verbose_prints: print("running halos on device ", self.dev) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) offset_velocity = ( - self.params.FD_ORDER * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + self.Params.FD_ORDER * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) - offset_wave = self.params.lead + offset_velocity + offset_wave = self.Params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize @@ -491,9 +491,9 @@ def executeHalo(self, kernel): waveout = np.array([int(self.waveOut)], dtype=np.uint64) vel = np.array([int(self.velocity)], dtype=np.uint64) - stride = self.params.nx * self.params.ny - np_nz = np.array(self.params.FD_ORDER, dtype=np.uint32) - np_nx = np.array(self.params.nx, dtype=np.uint32) + stride = self.Params.nx * self.Params.ny + np_nz = np.array(self.Params.FD_ORDER, dtype=np.uint32) + np_nx = np.array(self.Params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) args = [ @@ -507,14 +507,14 @@ def executeHalo(self, kernel): argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do halo up - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( kernel.fdPropag, - 
self.params.blkx, - self.params.blky, + self.Params.blkx, + self.Params.blky, 1, # grid dim - self.params.BDIMX, - self.params.BDIMY, + self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamHalo, # shared mem and stream @@ -525,11 +525,11 @@ def executeHalo(self, kernel): # do halo down offset_velocity = ( - (self.params.nz - 2 * self.params.FD_ORDER) * self.params.nx * self.params.ny - + self.params.FD_ORDER * self.params.nx - + self.params.FD_ORDER + (self.Params.nz - 2 * self.Params.FD_ORDER) * self.Params.nx * self.Params.ny + + self.Params.FD_ORDER * self.Params.nx + + self.Params.FD_ORDER ) - offset_wave = self.params.lead + offset_velocity + offset_wave = self.Params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize @@ -543,14 +543,14 @@ def executeHalo(self, kernel): np_stride, ] argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors( + check_cuda_errors( cuda.cuLaunchKernel( kernel.fdPropag, - self.params.blkx, - self.params.blky, + self.Params.blkx, + self.Params.blky, 1, # grid dim - self.params.BDIMX, - self.params.BDIMY, + self.Params.BDIMX, + self.Params.BDIMY, 1, # block dim 0, self.streamHalo, # shared mem and stream @@ -562,79 +562,79 @@ def executeHalo(self, kernel): # # exchange the halos # - def exchangeHalo(self, propag): + def exchange_halo(self, propag): if verbose_prints: print("exchange halos on device ", self.dev, "with dev ", propag.dev) - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) # # the following variables don't change # - nstride = self.params.nx * self.params.ny + nstride = self.Params.nx * self.Params.ny - devS = self.context - devD = propag.context + dev_s = self.context + dev_d = propag.context - n_exch = self.params.FD_ORDER * nstride + n_exch = self.Params.FD_ORDER * nstride n_exch *= np.dtype(np.float32).itemsize if self.dev < propag.dev: # exchange up 
- offsetS = self.params.lead + (self.params.nz - 2 * self.params.FD_ORDER) * nstride - offsetD = propag.params.lead + offset_s = self.Params.lead + (self.Params.nz - 2 * self.Params.FD_ORDER) * nstride + offset_d = propag.Params.lead - offsetS *= np.dtype(np.float32).itemsize - offsetD *= np.dtype(np.float32).itemsize + offset_s *= np.dtype(np.float32).itemsize + offset_d *= np.dtype(np.float32).itemsize - waveD = cuda.CUdeviceptr(int(propag.waveOut) + offsetD) - waveS = cuda.CUdeviceptr(int(self.waveOut) + offsetS) + wave_d = cuda.CUdeviceptr(int(propag.waveOut) + offset_d) + wave_s = cuda.CUdeviceptr(int(self.waveOut) + offset_s) - checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo)) + check_cuda_errors(cuda.cuMemcpyPeerAsync(wave_d, dev_d, wave_s, dev_s, n_exch, self.streamHalo)) else: # exchange down - offsetS = self.params.lead + self.params.FD_ORDER * nstride - offsetD = propag.params.lead + (propag.params.nz - propag.params.FD_ORDER) * nstride + offset_s = self.Params.lead + self.Params.FD_ORDER * nstride + offset_d = propag.Params.lead + (propag.Params.nz - propag.Params.FD_ORDER) * nstride - offsetS *= np.dtype(np.float32).itemsize - offsetD *= np.dtype(np.float32).itemsize + offset_s *= np.dtype(np.float32).itemsize + offset_d *= np.dtype(np.float32).itemsize - waveD = cuda.CUdeviceptr(int(propag.waveOut) + offsetD) - waveS = cuda.CUdeviceptr(int(self.waveOut) + offsetS) + wave_d = cuda.CUdeviceptr(int(propag.waveOut) + offset_d) + wave_s = cuda.CUdeviceptr(int(self.waveOut) + offset_s) - checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo)) + check_cuda_errors(cuda.cuMemcpyPeerAsync(wave_d, dev_d, wave_s, dev_s, n_exch, self.streamHalo)) # # sync stream # - def syncStream(self, stream): - checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors(cuda.cuStreamSynchronize(stream)) + def sync_stream(self, stream): + check_cuda_errors(cuda.cuCtxSetCurrent(self.context)) + 
check_cuda_errors(cuda.cuStreamSynchronize(stream)) def main(): - checkCudaErrors(cuda.cuInit(0)) + check_cuda_errors(cuda.cuInit(0)) # Number of GPUs print("Checking for multiple GPUs...") - gpu_n = checkCudaErrors(cuda.cuDeviceGetCount()) + gpu_n = check_cuda_errors(cuda.cuDeviceGetCount()) print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: print("Two or more GPUs with Peer-to-Peer access capability are required") return - prop = [checkCudaErrors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] + prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access print("\nChecking GPU(s) for support of peer to peer memory access...") - p2pCapableGPUs = [-1, -1] + p2p_capable_gp_us = [-1, -1] for i in range(gpu_n): - p2pCapableGPUs[0] = i + p2p_capable_gp_us[0] = i for j in range(gpu_n): if i == j: continue - i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j)) - j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i)) + i_access_j = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(i, j)) + j_access_i = check_cuda_errors(cudart.cudaDeviceCanAccessPeer(j, i)) print( "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No" @@ -646,23 +646,23 @@ def main(): ) ) if i_access_j and j_access_i: - p2pCapableGPUs[1] = j + p2p_capable_gp_us[1] = j break - if p2pCapableGPUs[1] != -1: + if p2p_capable_gp_us[1] != -1: break - if p2pCapableGPUs[0] == -1 or p2pCapableGPUs[1] == -1: + if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1: print("Two or more GPUs with Peer-to-Peer access capability are required.") print("Peer to Peer access is not available amongst GPUs in the system, waiving test.") return # Use first pair of p2p capable GPUs detected - gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]] + gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]] # # init device # - pars = params() + pars = Params() # # 
create propagators @@ -674,16 +674,16 @@ def main(): # create kernels and propagators that are going to be used on device # for i in gpuid: - p = propagator(pars, i) - k = cudaKernels(p.context) + p = Propagator(pars, i) + k = CudaKernels(p.context) propags.append(p) kerns.append(k) # allocate resources in device for propag, kern in zip(propags, kerns): propag.allocate() - propag.createSource(kern) - propag.createVelocity(kern) + propag.create_source(kern) + propag.create_velocity(kern) # # loop over time iterations @@ -691,26 +691,26 @@ def main(): start = time.time() for it in range(pars.nt): for propag in propags: - propag.syncStream(propag.streamHalo) + propag.sync_stream(propag.streamHalo) for propag, kern in zip(propags, kerns): - propag.injectSource(kern, it) + propag.inject_source(kern, it) for propag, kern in zip(propags, kerns): - propag.executeHalo(kern) + propag.execute_halo(kern) for propag in propags: - propag.syncStream(propag.streamHalo) + propag.sync_stream(propag.streamHalo) - propags[1].exchangeHalo(propags[0]) + propags[1].exchange_halo(propags[0]) - propags[0].exchangeHalo(propags[1]) + propags[0].exchange_halo(propags[1]) for propag, kern in zip(propags, kerns): - propag.executeCenter(kern) + propag.execute_center(kern) for propag in propags: - propag.syncStream(propag.streamCenter) + propag.sync_stream(propag.streamCenter) for propag in propags: propag.swap() @@ -727,19 +727,19 @@ def main(): # nz = 2 * (int)(pars.nz - 2 * pars.FD_ORDER) print(" nz= ", nz, " nx= ", pars.nx) - hOut = np.zeros((nz, pars.nx), dtype="float32") + h_out = np.zeros((nz, pars.nx), dtype="float32") istart = 0 for propag in propags: - checkCudaErrors(cuda.cuCtxSetCurrent(propag.context)) + check_cuda_errors(cuda.cuCtxSetCurrent(propag.context)) offset = pars.lead + pars.FD_ORDER * pars.nx * pars.ny + (int)(pars.ny / 2) * pars.nx for j in range(pars.nz - 2 * pars.FD_ORDER): ptr = cuda.CUdeviceptr(int(propag.waveOut) + offset * 4) - checkCudaErrors( + check_cuda_errors( 
cuda.cuMemcpyDtoH( - hOut[istart].ctypes.data, + h_out[istart].ctypes.data, ptr, pars.nx * np.dtype(np.float32).itemsize, ) @@ -756,7 +756,7 @@ def main(): if display_graph: nrows = nz ncols = pars.nx - dbz = hOut + dbz = h_out dbz = np.reshape(dbz, (nrows, ncols)) ## diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py index be78deafc1..80e7e73376 100644 --- a/cuda_bindings/examples/extra/jit_program_test.py +++ b/cuda_bindings/examples/extra/jit_program_test.py @@ -9,7 +9,7 @@ from cuda.bindings import nvrtc -def ASSERT_DRV(err): +def assert_drv(err): if isinstance(err, cuda.CUresult): if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError(f"Cuda Error: {err}") @@ -35,31 +35,31 @@ def ASSERT_DRV(err): def main(): # Init (err,) = cuda.cuInit(0) - ASSERT_DRV(err) + assert_drv(err) # Device - err, cuDevice = cuda.cuDeviceGet(0) - ASSERT_DRV(err) + err, cu_device = cuda.cuDeviceGet(0) + assert_drv(err) # Ctx - err, context = cuda.cuCtxCreate(None, 0, cuDevice) - ASSERT_DRV(err) + err, context = cuda.cuCtxCreate(None, 0, cu_device) + assert_drv(err) # Create program err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, None, None) - ASSERT_DRV(err) + assert_drv(err) # Get target architecture err, major = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device ) - ASSERT_DRV(err) + assert_drv(err) err, minor = cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device ) - ASSERT_DRV(err) + assert_drv(err) err, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion() - ASSERT_DRV(err) + assert_drv(err) use_cubin = nvrtc_minor >= 1 prefix = "sm" if use_cubin else "compute" arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", 
"ascii") @@ -67,82 +67,80 @@ def main(): # Compile program opts = [b"--fmad=false", arch_arg] (err,) = nvrtc.nvrtcCompileProgram(prog, len(opts), opts) - ASSERT_DRV(err) + assert_drv(err) # Get log from compilation - err, logSize = nvrtc.nvrtcGetProgramLogSize(prog) - ASSERT_DRV(err) - log = b" " * logSize + err, log_size = nvrtc.nvrtcGetProgramLogSize(prog) + assert_drv(err) + log = b" " * log_size (err,) = nvrtc.nvrtcGetProgramLog(prog, log) - ASSERT_DRV(err) + assert_drv(err) print(log.decode()) # Get data from compilation if use_cubin: - err, dataSize = nvrtc.nvrtcGetCUBINSize(prog) - ASSERT_DRV(err) - data = b" " * dataSize + err, data_size = nvrtc.nvrtcGetCUBINSize(prog) + assert_drv(err) + data = b" " * data_size (err,) = nvrtc.nvrtcGetCUBIN(prog, data) - ASSERT_DRV(err) + assert_drv(err) else: - err, dataSize = nvrtc.nvrtcGetPTXSize(prog) - ASSERT_DRV(err) - data = b" " * dataSize + err, data_size = nvrtc.nvrtcGetPTXSize(prog) + assert_drv(err) + data = b" " * data_size (err,) = nvrtc.nvrtcGetPTX(prog, data) - ASSERT_DRV(err) - (err,) = nvrtc.nvrtcDestroyProgram(prog) - ASSERT_DRV(err) + assert_drv(err) # Load data as module data and retrieve function data = np.char.array(data) err, module = cuda.cuModuleLoadData(data) - ASSERT_DRV(err) + assert_drv(err) err, kernel = cuda.cuModuleGetFunction(module, b"saxpy") - ASSERT_DRV(err) + assert_drv(err) # Test the kernel - NUM_THREADS = 128 - NUM_BLOCKS = 32 + num_threads = 128 + num_blocks = 32 a = np.float32(2.0) - n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32) - bufferSize = n * a.itemsize + n = np.array(num_threads * num_blocks, dtype=np.uint32) + buffer_size = n * a.itemsize - err, dX = cuda.cuMemAlloc(bufferSize) - ASSERT_DRV(err) - err, dY = cuda.cuMemAlloc(bufferSize) - ASSERT_DRV(err) - err, dOut = cuda.cuMemAlloc(bufferSize) - ASSERT_DRV(err) + err, d_x = cuda.cuMemAlloc(buffer_size) + assert_drv(err) + err, d_y = cuda.cuMemAlloc(buffer_size) + assert_drv(err) + err, d_out = 
cuda.cuMemAlloc(buffer_size) + assert_drv(err) - hX = np.random.rand(n).astype(dtype=np.float32) - hY = np.random.rand(n).astype(dtype=np.float32) - hOut = np.zeros(n).astype(dtype=np.float32) + h_x = np.random.rand(n).astype(dtype=np.float32) + h_y = np.random.rand(n).astype(dtype=np.float32) + h_out = np.zeros(n).astype(dtype=np.float32) err, stream = cuda.cuStreamCreate(0) - ASSERT_DRV(err) + assert_drv(err) - (err,) = cuda.cuMemcpyHtoDAsync(dX, hX, bufferSize, stream) - ASSERT_DRV(err) - (err,) = cuda.cuMemcpyHtoDAsync(dY, hY, bufferSize, stream) - ASSERT_DRV(err) + (err,) = cuda.cuMemcpyHtoDAsync(d_x, h_x, buffer_size, stream) + assert_drv(err) + (err,) = cuda.cuMemcpyHtoDAsync(d_y, h_y, buffer_size, stream) + assert_drv(err) (err,) = cuda.cuStreamSynchronize(stream) - ASSERT_DRV(err) + assert_drv(err) # Assert values are different before running kernel - hZ = a * hX + hY - if np.allclose(hOut, hZ): + h_z = a * h_x + h_y + if np.allclose(h_out, h_z): raise ValueError("Error inside tolerence for host-device vectors") - arg_values = (a, dX, dY, dOut, n) + arg_values = (a, d_x, d_y, d_out, n) arg_types = (ctypes.c_float, None, None, None, ctypes.c_size_t) (err,) = cuda.cuLaunchKernel( kernel, - NUM_BLOCKS, + num_blocks, 1, 1, # grid dim - NUM_THREADS, + num_threads, 1, 1, # block dim 0, @@ -150,32 +148,32 @@ def main(): (arg_values, arg_types), 0, ) # arguments - ASSERT_DRV(err) + assert_drv(err) - (err,) = cuda.cuMemcpyDtoHAsync(hOut, dOut, bufferSize, stream) - ASSERT_DRV(err) + (err,) = cuda.cuMemcpyDtoHAsync(h_out, d_out, buffer_size, stream) + assert_drv(err) (err,) = cuda.cuStreamSynchronize(stream) - ASSERT_DRV(err) + assert_drv(err) # Assert values are same after running kernel - hZ = a * hX + hY - if not np.allclose(hOut, hZ): + h_z = a * h_x + h_y + if not np.allclose(h_out, h_z): raise ValueError("Error outside tolerence for host-device vectors") (err,) = cuda.cuStreamDestroy(stream) - ASSERT_DRV(err) + assert_drv(err) - (err,) = cuda.cuMemFree(dX) - 
ASSERT_DRV(err) - (err,) = cuda.cuMemFree(dY) - ASSERT_DRV(err) - (err,) = cuda.cuMemFree(dOut) - ASSERT_DRV(err) + (err,) = cuda.cuMemFree(d_x) + assert_drv(err) + (err,) = cuda.cuMemFree(d_y) + assert_drv(err) + (err,) = cuda.cuMemFree(d_out) + assert_drv(err) (err,) = cuda.cuModuleUnload(module) - ASSERT_DRV(err) + assert_drv(err) (err,) = cuda.cuCtxDestroy(context) - ASSERT_DRV(err) + assert_drv(err) if __name__ == "__main__": diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index e7339122ac..12962bf491 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -881,6 +881,13 @@ struct KernelBox { }; } // namespace +static const KernelBox* get_box(const KernelHandle& h) { + const CUkernel* p = h.get(); + return reinterpret_cast( + reinterpret_cast(p) - offsetof(KernelBox, resource) + ); +} + static HandleRegistry kernel_registry; KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name) { @@ -906,11 +913,7 @@ KernelHandle create_kernel_handle_ref(CUkernel kernel) { LibraryHandle get_kernel_library(const KernelHandle& h) noexcept { if (!h) return {}; - const CUkernel* p = h.get(); - auto* box = reinterpret_cast( - reinterpret_cast(p) - offsetof(KernelBox, resource) - ); - return box->h_library; + return get_box(h)->h_library; } // ============================================================================ @@ -942,25 +945,25 @@ GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent) } namespace { -struct NodeBox { +struct GraphNodeBox { CUgraphNode resource; GraphHandle h_graph; }; } // namespace -static const NodeBox* get_box(const NodeHandle& h) { +static const GraphNodeBox* get_box(const GraphNodeHandle& h) { const CUgraphNode* p = h.get(); - return reinterpret_cast( - reinterpret_cast(p) - offsetof(NodeBox, resource) + return reinterpret_cast( + reinterpret_cast(p) - offsetof(GraphNodeBox, 
resource) ); } -NodeHandle create_node_handle(CUgraphNode node, const GraphHandle& h_graph) { - auto box = std::make_shared(NodeBox{node, h_graph}); - return NodeHandle(box, &box->resource); +GraphNodeHandle create_graph_node_handle(CUgraphNode node, const GraphHandle& h_graph) { + auto box = std::make_shared(GraphNodeBox{node, h_graph}); + return GraphNodeHandle(box, &box->resource); } -GraphHandle node_get_graph(const NodeHandle& h) noexcept { +GraphHandle graph_node_get_graph(const GraphNodeHandle& h) noexcept { return h ? get_box(h)->h_graph : GraphHandle{}; } diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index c306345b17..090e5fa8cb 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -147,7 +147,7 @@ using MemoryPoolHandle = std::shared_ptr; using LibraryHandle = std::shared_ptr; using KernelHandle = std::shared_ptr; using GraphHandle = std::shared_ptr; -using NodeHandle = std::shared_ptr; +using GraphNodeHandle = std::shared_ptr; using GraphicsResourceHandle = std::shared_ptr; using NvrtcProgramHandle = std::shared_ptr; using NvvmProgramHandle = std::shared_ptr; @@ -398,10 +398,10 @@ GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent); // Create a node handle. Nodes are owned by their parent graph (not // independently destroyable). The GraphHandle dependency ensures the // graph outlives any node reference. -NodeHandle create_node_handle(CUgraphNode node, const GraphHandle& h_graph); +GraphNodeHandle create_graph_node_handle(CUgraphNode node, const GraphHandle& h_graph); // Extract the owning graph handle from a node handle. 
-GraphHandle node_get_graph(const NodeHandle& h) noexcept; +GraphHandle graph_node_get_graph(const GraphNodeHandle& h) noexcept; // ============================================================================ // Graphics resource handle functions @@ -503,7 +503,7 @@ inline CUgraph as_cu(const GraphHandle& h) noexcept { return h ? *h : nullptr; } -inline CUgraphNode as_cu(const NodeHandle& h) noexcept { +inline CUgraphNode as_cu(const GraphNodeHandle& h) noexcept { return h ? *h : nullptr; } @@ -561,7 +561,7 @@ inline std::intptr_t as_intptr(const GraphHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } -inline std::intptr_t as_intptr(const NodeHandle& h) noexcept { +inline std::intptr_t as_intptr(const GraphNodeHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } @@ -632,6 +632,13 @@ inline PyObject* as_py(const GraphHandle& h) noexcept { return detail::make_py("cuda.bindings.driver", "CUgraph", as_intptr(h)); } +inline PyObject* as_py(const GraphNodeHandle& h) noexcept { + if (!as_intptr(h)) { + Py_RETURN_NONE; + } + return detail::make_py("cuda.bindings.driver", "CUgraphNode", as_intptr(h)); +} + inline PyObject* as_py(const NvrtcProgramHandle& h) noexcept { return detail::make_py("cuda.bindings.nvrtc", "nvrtcProgram", as_intptr(h)); } diff --git a/cuda_core/cuda/core/_graph/_graphdef.pxd b/cuda_core/cuda/core/_graph/_graphdef.pxd index bff91172f4..83612cd6bb 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pxd +++ b/cuda_core/cuda/core/_graph/_graphdef.pxd @@ -5,7 +5,7 @@ from libc.stddef cimport size_t from cuda.bindings cimport cydriver -from cuda.core._resource_handles cimport EventHandle, GraphHandle, KernelHandle, NodeHandle +from cuda.core._resource_handles cimport EventHandle, GraphHandle, GraphNodeHandle, KernelHandle cdef class Condition @@ -45,7 +45,7 @@ cdef class GraphDef: cdef class Node: cdef: - NodeHandle _h_node + GraphNodeHandle _h_node tuple _pred_cache tuple _succ_cache object __weakref__ @@ -56,7 +56,7 @@ cdef class Node: cdef 
class EmptyNode(Node): @staticmethod - cdef EmptyNode _create_impl(NodeHandle h_node) + cdef EmptyNode _create_impl(GraphNodeHandle h_node) cdef class KernelNode(Node): @@ -67,12 +67,12 @@ cdef class KernelNode(Node): KernelHandle _h_kernel @staticmethod - cdef KernelNode _create_with_params(NodeHandle h_node, + cdef KernelNode _create_with_params(GraphNodeHandle h_node, tuple grid, tuple block, unsigned int shmem_size, KernelHandle h_kernel) @staticmethod - cdef KernelNode _create_from_driver(NodeHandle h_node) + cdef KernelNode _create_from_driver(GraphNodeHandle h_node) cdef class AllocNode(Node): @@ -84,12 +84,12 @@ cdef class AllocNode(Node): tuple _peer_access @staticmethod - cdef AllocNode _create_with_params(NodeHandle h_node, + cdef AllocNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dptr, size_t bytesize, int device_id, str memory_type, tuple peer_access) @staticmethod - cdef AllocNode _create_from_driver(NodeHandle h_node) + cdef AllocNode _create_from_driver(GraphNodeHandle h_node) cdef class FreeNode(Node): @@ -97,11 +97,11 @@ cdef class FreeNode(Node): cydriver.CUdeviceptr _dptr @staticmethod - cdef FreeNode _create_with_params(NodeHandle h_node, + cdef FreeNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dptr) @staticmethod - cdef FreeNode _create_from_driver(NodeHandle h_node) + cdef FreeNode _create_from_driver(GraphNodeHandle h_node) cdef class MemsetNode(Node): @@ -114,13 +114,13 @@ cdef class MemsetNode(Node): size_t _pitch @staticmethod - cdef MemsetNode _create_with_params(NodeHandle h_node, + cdef MemsetNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dptr, unsigned int value, unsigned int element_size, size_t width, size_t height, size_t pitch) @staticmethod - cdef MemsetNode _create_from_driver(NodeHandle h_node) + cdef MemsetNode _create_from_driver(GraphNodeHandle h_node) cdef class MemcpyNode(Node): @@ -132,13 +132,13 @@ cdef class MemcpyNode(Node): cydriver.CUmemorytype 
_src_type @staticmethod - cdef MemcpyNode _create_with_params(NodeHandle h_node, + cdef MemcpyNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dst, cydriver.CUdeviceptr src, size_t size, cydriver.CUmemorytype dst_type, cydriver.CUmemorytype src_type) @staticmethod - cdef MemcpyNode _create_from_driver(NodeHandle h_node) + cdef MemcpyNode _create_from_driver(GraphNodeHandle h_node) cdef class ChildGraphNode(Node): @@ -146,11 +146,11 @@ cdef class ChildGraphNode(Node): GraphHandle _h_child_graph @staticmethod - cdef ChildGraphNode _create_with_params(NodeHandle h_node, + cdef ChildGraphNode _create_with_params(GraphNodeHandle h_node, GraphHandle h_child_graph) @staticmethod - cdef ChildGraphNode _create_from_driver(NodeHandle h_node) + cdef ChildGraphNode _create_from_driver(GraphNodeHandle h_node) cdef class EventRecordNode(Node): @@ -158,11 +158,11 @@ cdef class EventRecordNode(Node): EventHandle _h_event @staticmethod - cdef EventRecordNode _create_with_params(NodeHandle h_node, + cdef EventRecordNode _create_with_params(GraphNodeHandle h_node, EventHandle h_event) @staticmethod - cdef EventRecordNode _create_from_driver(NodeHandle h_node) + cdef EventRecordNode _create_from_driver(GraphNodeHandle h_node) cdef class EventWaitNode(Node): @@ -170,11 +170,11 @@ cdef class EventWaitNode(Node): EventHandle _h_event @staticmethod - cdef EventWaitNode _create_with_params(NodeHandle h_node, + cdef EventWaitNode _create_with_params(GraphNodeHandle h_node, EventHandle h_event) @staticmethod - cdef EventWaitNode _create_from_driver(NodeHandle h_node) + cdef EventWaitNode _create_from_driver(GraphNodeHandle h_node) cdef class HostCallbackNode(Node): @@ -184,12 +184,12 @@ cdef class HostCallbackNode(Node): void* _user_data @staticmethod - cdef HostCallbackNode _create_with_params(NodeHandle h_node, + cdef HostCallbackNode _create_with_params(GraphNodeHandle h_node, object callable_obj, cydriver.CUhostFn fn, void* user_data) @staticmethod - cdef 
HostCallbackNode _create_from_driver(NodeHandle h_node) + cdef HostCallbackNode _create_from_driver(GraphNodeHandle h_node) cdef class ConditionalNode(Node): @@ -199,7 +199,7 @@ cdef class ConditionalNode(Node): tuple _branches # tuple of GraphDef (non-owning wrappers) @staticmethod - cdef ConditionalNode _create_from_driver(NodeHandle h_node) + cdef ConditionalNode _create_from_driver(GraphNodeHandle h_node) cdef class IfNode(ConditionalNode): diff --git a/cuda_core/cuda/core/_graph/_graphdef.pyx b/cuda_core/cuda/core/_graph/_graphdef.pyx index 053e39e886..4c06363293 100644 --- a/cuda_core/cuda/core/_graph/_graphdef.pyx +++ b/cuda_core/cuda/core/_graph/_graphdef.pyx @@ -49,7 +49,7 @@ from cuda.core._resource_handles cimport ( EventHandle, GraphHandle, KernelHandle, - NodeHandle, + GraphNodeHandle, as_cu, as_intptr, as_py, @@ -57,8 +57,8 @@ from cuda.core._resource_handles cimport ( create_graph_handle, create_graph_handle_ref, create_kernel_handle_ref, - create_node_handle, - node_get_graph, + create_graph_node_handle, + graph_node_get_graph, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value @@ -196,7 +196,7 @@ cdef ConditionalNode _make_conditional_node( params.conditional.size = size cdef cydriver.CUcontext ctx = NULL - cdef GraphHandle h_graph = node_get_graph(pred._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(pred._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(pred._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -226,7 +226,7 @@ cdef ConditionalNode _make_conditional_node( cdef tuple branches = tuple(branch_list) cdef ConditionalNode n = node_cls.__new__(node_cls) - n._h_node = create_node_handle(new_node, h_graph) + n._h_node = create_graph_node_handle(new_node, h_graph) n._condition = condition n._cond_type = cond_type n._branches = branches @@ -308,7 +308,7 @@ cdef class GraphDef: def _entry(self) -> Node: """Return the internal entry-point Node (no dependencies).""" cdef Node n = 
Node.__new__(Node) - n._h_node = create_node_handle(NULL, self._h_graph) + n._h_node = create_graph_node_handle(NULL, self._h_graph) return n def alloc(self, size_t size, options: GraphAllocOptions | None = None) -> AllocNode: @@ -564,10 +564,10 @@ cdef class Node: """Factory: dispatch to the right subclass based on node type.""" if node == NULL: n = Node.__new__(Node) - (n)._h_node = create_node_handle(node, h_graph) + (n)._h_node = create_graph_node_handle(node, h_graph) return n - cdef NodeHandle h_node = create_node_handle(node, h_graph) + cdef GraphNodeHandle h_node = create_graph_node_handle(node, h_graph) cdef cydriver.CUgraphNodeType node_type with nogil: HANDLE_RETURN(cydriver.cuGraphNodeGetType(node, &node_type)) @@ -634,7 +634,7 @@ cdef class Node: @property def graph(self) -> GraphDef: """Return the GraphDef this node belongs to.""" - return GraphDef._from_handle(node_get_graph(self._h_node)) + return GraphDef._from_handle(graph_node_get_graph(self._h_node)) @property def handle(self) -> int | None: @@ -642,10 +642,7 @@ cdef class Node: Returns None for the entry node. 
""" - cdef cydriver.CUgraphNode node = as_cu(self._h_node) - if node == NULL: - return None - return node + return as_py(self._h_node) @property def pred(self) -> tuple: @@ -681,7 +678,7 @@ cdef class Node: with nogil: HANDLE_RETURN(cydriver.cuGraphNodeGetDependencies(node, deps.data(), NULL, &num_deps)) - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) self._pred_cache = tuple(Node._create(h_graph, deps[i]) for i in range(num_deps)) return self._pred_cache @@ -719,7 +716,7 @@ cdef class Node: with nogil: HANDLE_RETURN(cydriver.cuGraphNodeGetDependentNodes(node, deps.data(), NULL, &num_deps)) - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) self._succ_cache = tuple(Node._create(h_graph, deps[i]) for i in range(num_deps)) return self._succ_cache @@ -746,7 +743,7 @@ cdef class Node: cdef cydriver.CUDA_KERNEL_NODE_PARAMS node_params cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -777,7 +774,7 @@ cdef class Node: self._succ_cache = None return KernelNode._create_with_params( - create_node_handle(new_node, h_graph), + create_graph_node_handle(new_node, h_graph), conf.grid, conf.block, conf.shmem_size, ker._h_kernel) @@ -798,7 +795,7 @@ cdef class Node: """ cdef vector[cydriver.CUgraphNode] deps cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef Node other cdef cydriver.CUgraphNode* deps_ptr = NULL cdef size_t num_deps = 0 @@ -821,7 +818,7 @@ cdef class Node: self._succ_cache = None for other in nodes: (other)._succ_cache = None - return 
EmptyNode._create_impl(create_node_handle(new_node, h_graph)) + return EmptyNode._create_impl(create_graph_node_handle(new_node, h_graph)) def alloc(self, size_t size, options: GraphAllocOptions | None = None) -> AllocNode: """Add a memory allocation node depending on this node. @@ -851,7 +848,7 @@ cdef class Node: cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS alloc_params cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -910,7 +907,7 @@ cdef class Node: self._succ_cache = None return AllocNode._create_with_params( - create_node_handle(new_node, h_graph), alloc_params.dptr, size, + create_graph_node_handle(new_node, h_graph), alloc_params.dptr, size, device_id, memory_type, tuple(peer_ids)) def free(self, dptr: int) -> FreeNode: @@ -927,7 +924,7 @@ cdef class Node: A new FreeNode representing the free operation. """ cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -942,7 +939,7 @@ cdef class Node: &new_node, as_cu(h_graph), deps, num_deps, c_dptr)) self._succ_cache = None - return FreeNode._create_with_params(create_node_handle(new_node, h_graph), c_dptr) + return FreeNode._create_with_params(create_graph_node_handle(new_node, h_graph), c_dptr) def memset(self, dst: int, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: """Add a memset node depending on this node. 
@@ -972,7 +969,7 @@ cdef class Node: cdef cydriver.CUDA_MEMSET_NODE_PARAMS memset_params cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -1001,7 +998,7 @@ cdef class Node: self._succ_cache = None return MemsetNode._create_with_params( - create_node_handle(new_node, h_graph), c_dst, + create_graph_node_handle(new_node, h_graph), c_dst, val, elem_size, width, height, pitch) def memcpy(self, dst: int, src: int, size_t size) -> MemcpyNode: @@ -1066,7 +1063,7 @@ cdef class Node: params.Depth = 1 cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -1083,7 +1080,7 @@ cdef class Node: self._succ_cache = None return MemcpyNode._create_with_params( - create_node_handle(new_node, h_graph), c_dst, c_src, size, + create_graph_node_handle(new_node, h_graph), c_dst, c_src, size, c_dst_type, c_src_type) def embed(self, child: GraphDef) -> ChildGraphNode: @@ -1105,7 +1102,7 @@ cdef class Node: """ cdef GraphDef child_def = child cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -1127,7 +1124,7 @@ cdef class Node: self._succ_cache = None return ChildGraphNode._create_with_params( - create_node_handle(new_node, h_graph), h_embedded) + create_graph_node_handle(new_node, h_graph), h_embedded) def record_event(self, event: Event) -> EventRecordNode: """Add an event record node 
depending on this node. @@ -1144,7 +1141,7 @@ cdef class Node: """ cdef Event ev = event cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -1162,7 +1159,7 @@ cdef class Node: self._succ_cache = None return EventRecordNode._create_with_params( - create_node_handle(new_node, h_graph), ev._h_event) + create_graph_node_handle(new_node, h_graph), ev._h_event) def wait_event(self, event: Event) -> EventWaitNode: """Add an event wait node depending on this node. @@ -1179,7 +1176,7 @@ cdef class Node: """ cdef Event ev = event cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -1197,7 +1194,7 @@ cdef class Node: self._succ_cache = None return EventWaitNode._create_with_params( - create_node_handle(new_node, h_graph), ev._h_event) + create_graph_node_handle(new_node, h_graph), ev._h_event) def callback(self, fn, *, user_data=None) -> HostCallbackNode: """Add a host callback node depending on this node. 
@@ -1236,7 +1233,7 @@ cdef class Node: cdef cydriver.CUDA_HOST_NODE_PARAMS node_params cdef cydriver.CUgraphNode new_node = NULL - cdef GraphHandle h_graph = node_get_graph(self._h_node) + cdef GraphHandle h_graph = graph_node_get_graph(self._h_node) cdef cydriver.CUgraphNode pred_node = as_cu(self._h_node) cdef cydriver.CUgraphNode* deps = NULL cdef size_t num_deps = 0 @@ -1287,7 +1284,7 @@ cdef class Node: self._succ_cache = None return HostCallbackNode._create_with_params( - create_node_handle(new_node, h_graph), callable_obj, + create_graph_node_handle(new_node, h_graph), callable_obj, node_params.fn, node_params.userData) def if_cond(self, condition: Condition) -> IfNode: @@ -1383,7 +1380,7 @@ cdef class EmptyNode(Node): """A synchronization / join node with no operation.""" @staticmethod - cdef EmptyNode _create_impl(NodeHandle h_node): + cdef EmptyNode _create_impl(GraphNodeHandle h_node): cdef EmptyNode n = EmptyNode.__new__(EmptyNode) n._h_node = h_node return n @@ -1411,7 +1408,7 @@ cdef class KernelNode(Node): """ @staticmethod - cdef KernelNode _create_with_params(NodeHandle h_node, + cdef KernelNode _create_with_params(GraphNodeHandle h_node, tuple grid, tuple block, unsigned int shmem_size, KernelHandle h_kernel): """Create from known params (called by launch() builder).""" @@ -1424,7 +1421,7 @@ cdef class KernelNode(Node): return n @staticmethod - cdef KernelNode _create_from_driver(NodeHandle h_node): + cdef KernelNode _create_from_driver(GraphNodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_KERNEL_NODE_PARAMS params @@ -1492,7 +1489,7 @@ cdef class AllocNode(Node): """ @staticmethod - cdef AllocNode _create_with_params(NodeHandle h_node, + cdef AllocNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dptr, size_t bytesize, int device_id, str memory_type, tuple peer_access): """Create from known params (called by 
alloc() builder).""" @@ -1506,7 +1503,7 @@ cdef class AllocNode(Node): return n @staticmethod - cdef AllocNode _create_from_driver(NodeHandle h_node): + cdef AllocNode _create_from_driver(GraphNodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS params @@ -1581,7 +1578,7 @@ cdef class FreeNode(Node): """ @staticmethod - cdef FreeNode _create_with_params(NodeHandle h_node, + cdef FreeNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dptr): """Create from known params (called by free() builder).""" cdef FreeNode n = FreeNode.__new__(FreeNode) @@ -1590,7 +1587,7 @@ cdef class FreeNode(Node): return n @staticmethod - cdef FreeNode _create_from_driver(NodeHandle h_node): + cdef FreeNode _create_from_driver(GraphNodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUdeviceptr dptr @@ -1627,7 +1624,7 @@ cdef class MemsetNode(Node): """ @staticmethod - cdef MemsetNode _create_with_params(NodeHandle h_node, + cdef MemsetNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dptr, unsigned int value, unsigned int element_size, size_t width, size_t height, size_t pitch): @@ -1643,7 +1640,7 @@ cdef class MemsetNode(Node): return n @staticmethod - cdef MemsetNode _create_from_driver(NodeHandle h_node): + cdef MemsetNode _create_from_driver(GraphNodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_MEMSET_NODE_PARAMS params @@ -1702,7 +1699,7 @@ cdef class MemcpyNode(Node): """ @staticmethod - cdef MemcpyNode _create_with_params(NodeHandle h_node, + cdef MemcpyNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dst, cydriver.CUdeviceptr src, size_t size, cydriver.CUmemorytype dst_type, 
cydriver.CUmemorytype src_type): @@ -1717,7 +1714,7 @@ cdef class MemcpyNode(Node): return n @staticmethod - cdef MemcpyNode _create_from_driver(NodeHandle h_node): + cdef MemcpyNode _create_from_driver(GraphNodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_MEMCPY3D params @@ -1771,7 +1768,7 @@ cdef class ChildGraphNode(Node): """ @staticmethod - cdef ChildGraphNode _create_with_params(NodeHandle h_node, + cdef ChildGraphNode _create_with_params(GraphNodeHandle h_node, GraphHandle h_child_graph): """Create from known params (called by embed() builder).""" cdef ChildGraphNode n = ChildGraphNode.__new__(ChildGraphNode) @@ -1780,13 +1777,13 @@ cdef class ChildGraphNode(Node): return n @staticmethod - cdef ChildGraphNode _create_from_driver(NodeHandle h_node): + cdef ChildGraphNode _create_from_driver(GraphNodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUgraph child_graph = NULL with nogil: HANDLE_RETURN(cydriver.cuGraphChildGraphNodeGetGraph(node, &child_graph)) - cdef GraphHandle h_graph = node_get_graph(h_node) + cdef GraphHandle h_graph = graph_node_get_graph(h_node) cdef GraphHandle h_child = create_graph_handle_ref(child_graph, h_graph) return ChildGraphNode._create_with_params(h_node, h_child) @@ -1814,7 +1811,7 @@ cdef class EventRecordNode(Node): """ @staticmethod - cdef EventRecordNode _create_with_params(NodeHandle h_node, + cdef EventRecordNode _create_with_params(GraphNodeHandle h_node, EventHandle h_event): """Create from known params (called by record_event() builder).""" cdef EventRecordNode n = EventRecordNode.__new__(EventRecordNode) @@ -1823,7 +1820,7 @@ cdef class EventRecordNode(Node): return n @staticmethod - cdef EventRecordNode _create_from_driver(NodeHandle h_node): + cdef EventRecordNode _create_from_driver(GraphNodeHandle 
h_node): """Create by fetching params from the driver (called by _create factory).""" cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUevent event @@ -1851,7 +1848,7 @@ cdef class EventWaitNode(Node): """ @staticmethod - cdef EventWaitNode _create_with_params(NodeHandle h_node, + cdef EventWaitNode _create_with_params(GraphNodeHandle h_node, EventHandle h_event): """Create from known params (called by wait_event() builder).""" cdef EventWaitNode n = EventWaitNode.__new__(EventWaitNode) @@ -1860,7 +1857,7 @@ cdef class EventWaitNode(Node): return n @staticmethod - cdef EventWaitNode _create_from_driver(NodeHandle h_node): + cdef EventWaitNode _create_from_driver(GraphNodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUevent event @@ -1888,7 +1885,7 @@ cdef class HostCallbackNode(Node): """ @staticmethod - cdef HostCallbackNode _create_with_params(NodeHandle h_node, + cdef HostCallbackNode _create_with_params(GraphNodeHandle h_node, object callable_obj, cydriver.CUhostFn fn, void* user_data): """Create from known params (called by callback() builder).""" @@ -1900,7 +1897,7 @@ cdef class HostCallbackNode(Node): return n @staticmethod - cdef HostCallbackNode _create_from_driver(NodeHandle h_node): + cdef HostCallbackNode _create_from_driver(GraphNodeHandle h_node): """Create by fetching params from the driver (called by _create factory).""" cdef cydriver.CUgraphNode node = as_cu(h_node) cdef cydriver.CUDA_HOST_NODE_PARAMS params @@ -1946,7 +1943,7 @@ cdef class ConditionalNode(Node): """ @staticmethod - cdef ConditionalNode _create_from_driver(NodeHandle h_node): + cdef ConditionalNode _create_from_driver(GraphNodeHandle h_node): cdef ConditionalNode n if not _check_node_get_params(): n = ConditionalNode.__new__(ConditionalNode) @@ -1967,7 +1964,7 @@ cdef class ConditionalNode(Node): condition._c_handle = ( int(cond_params.handle)) - cdef 
GraphHandle h_graph = node_get_graph(h_node) + cdef GraphHandle h_graph = graph_node_get_graph(h_node) cdef list branch_list = [] cdef unsigned int i cdef GraphHandle h_branch diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index f34b24c096..4e8f810619 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -19,7 +19,6 @@ from cuda.core._resource_handles cimport ( KernelHandle, create_library_handle_from_file, create_library_handle_from_data, - create_library_handle_ref, create_kernel_handle, create_kernel_handle_ref, get_kernel_library, diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 7eca9d1221..9b4baf11da 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -27,7 +27,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUlibrary] LibraryHandle ctypedef shared_ptr[const cydriver.CUkernel] KernelHandle ctypedef shared_ptr[const cydriver.CUgraph] GraphHandle - ctypedef shared_ptr[const cydriver.CUgraphNode] NodeHandle + ctypedef shared_ptr[const cydriver.CUgraphNode] GraphNodeHandle ctypedef shared_ptr[const cydriver.CUgraphicsResource] GraphicsResourceHandle ctypedef shared_ptr[const cynvrtc.nvrtcProgram] NvrtcProgramHandle @@ -51,7 +51,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUlibrary as_cu(LibraryHandle h) noexcept nogil cydriver.CUkernel as_cu(KernelHandle h) noexcept nogil cydriver.CUgraph as_cu(GraphHandle h) noexcept nogil - cydriver.CUgraphNode as_cu(NodeHandle h) noexcept nogil + cydriver.CUgraphNode as_cu(GraphNodeHandle h) noexcept nogil cydriver.CUgraphicsResource as_cu(GraphicsResourceHandle h) noexcept nogil cynvrtc.nvrtcProgram as_cu(NvrtcProgramHandle h) noexcept nogil cynvvm.nvvmProgram as_cu(NvvmProgramHandle h) noexcept nogil @@ -67,7 +67,7 @@ cdef extern from 
"_cpp/resource_handles.hpp" namespace "cuda_core": intptr_t as_intptr(LibraryHandle h) noexcept nogil intptr_t as_intptr(KernelHandle h) noexcept nogil intptr_t as_intptr(GraphHandle h) noexcept nogil - intptr_t as_intptr(NodeHandle h) noexcept nogil + intptr_t as_intptr(GraphNodeHandle h) noexcept nogil intptr_t as_intptr(GraphicsResourceHandle h) noexcept nogil intptr_t as_intptr(NvrtcProgramHandle h) noexcept nogil intptr_t as_intptr(NvvmProgramHandle h) noexcept nogil @@ -83,6 +83,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": object as_py(LibraryHandle h) object as_py(KernelHandle h) object as_py(GraphHandle h) + object as_py(GraphNodeHandle h) object as_py(GraphicsResourceHandle h) object as_py(NvrtcProgramHandle h) object as_py(NvvmProgramHandle h) @@ -176,8 +177,8 @@ cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil # Graph node handles -cdef NodeHandle create_node_handle(cydriver.CUgraphNode node, const GraphHandle& h_graph) except+ nogil -cdef GraphHandle node_get_graph(const NodeHandle& h) noexcept nogil +cdef GraphNodeHandle create_graph_node_handle(cydriver.CUgraphNode node, const GraphHandle& h_graph) except+ nogil +cdef GraphHandle graph_node_get_graph(const GraphNodeHandle& h) noexcept nogil # Graphics resource handles cdef GraphicsResourceHandle create_graphics_resource_handle( diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index be8955ce92..d4d60d6192 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -154,10 +154,10 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil # Graph node handles - NodeHandle create_node_handle "cuda_core::create_node_handle" ( + GraphNodeHandle create_graph_node_handle 
"cuda_core::create_graph_node_handle" ( cydriver.CUgraphNode node, const GraphHandle& h_graph) except+ nogil - GraphHandle node_get_graph "cuda_core::node_get_graph" ( - const NodeHandle& h) noexcept nogil + GraphHandle graph_node_get_graph "cuda_core::graph_node_get_graph" ( + const GraphNodeHandle& h) noexcept nogil # Graphics resource handles GraphicsResourceHandle create_graphics_resource_handle "cuda_core::create_graphics_resource_handle" ( diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py index 02d1b59ec1..c6233dd5d9 100644 --- a/cuda_core/examples/cuda_graphs.py +++ b/cuda_core/examples/cuda_graphs.py @@ -84,9 +84,9 @@ def main(): result3 = cp.empty_like(a) # Prepare launch configuration - block_size = 256 - grid_size = (size + block_size - 1) // block_size - config = LaunchConfig(grid=grid_size, block=block_size) + block = 256 + grid = (size + block - 1) // block + config = LaunchConfig(grid=grid, block=block) # Sync before graph capture dev.sync() diff --git a/cuda_core/examples/gl_interop_plasma.py b/cuda_core/examples/gl_interop_plasma.py index 7b8b43cd8d..46fa59ee3f 100644 --- a/cuda_core/examples/gl_interop_plasma.py +++ b/cuda_core/examples/gl_interop_plasma.py @@ -94,8 +94,8 @@ def setup_cuda(kernel_source): dev.set_current() stream = dev.create_stream() - opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") - prog = Program(kernel_source, code_type="c++", options=opts) + program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") + prog = Program(kernel_source, code_type="c++", options=program_options) mod = prog.compile("cubin") kernel = mod.get_kernel("plasma") diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index 3919953eab..4e3bfcceb5 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -48,7 +48,7 @@ def __cuda_stream__(self): return (0, stream_id) # Return format required by CUDA Python -s = 
dev.create_stream(PyTorchStreamWrapper(pt_stream)) +stream = dev.create_stream(PyTorchStreamWrapper(pt_stream)) try: # prepare program @@ -61,7 +61,7 @@ def __cuda_stream__(self): ) # Run in single precision - ker = mod.get_kernel("saxpy_kernel") + kernel = mod.get_kernel("saxpy_kernel") dtype = torch.float32 # prepare input/output @@ -76,16 +76,16 @@ def __cuda_stream__(self): block = 32 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) # launch kernel on our stream - launch(s, config, ker, *ker_args) + launch(stream, config, kernel, *kernel_args) # check result assert torch.allclose(out, a.item() * x + y) # let's repeat again with double precision - ker = mod.get_kernel("saxpy_kernel") + kernel = mod.get_kernel("saxpy_kernel") dtype = torch.float64 # prepare input @@ -102,12 +102,12 @@ def __cuda_stream__(self): block = 64 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) # launch kernel on PyTorch's stream - launch(s, config, ker, *ker_args) + launch(stream, config, kernel, *kernel_args) # check result assert torch.allclose(out, a * x + y) finally: - s.close() + stream.close() diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index d7eb401ac3..548af802be 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -35,7 +35,7 @@ dev = Device() dev.set_current() -s = dev.create_stream() +stream = dev.create_stream() buf = None try: @@ -53,7 +53,7 @@ ) # run in single precision - ker = mod.get_kernel("saxpy") + kernel = mod.get_kernel("saxpy") dtype = cp.float32 # prepare input/output @@ -63,24 +63,24 @@ x = rng.random(size, dtype=dtype) y = 
rng.random(size, dtype=dtype) out = cp.empty_like(x) - dev.sync() # cupy runs on a different stream from s, so sync before accessing + dev.sync() # cupy runs on a different stream from stream, so sync before accessing # prepare launch block = 32 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) + kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) - # launch kernel on stream s - launch(s, config, ker, *ker_args) - s.sync() + # launch kernel on stream + launch(stream, config, kernel, *kernel_args) + stream.sync() # check result assert cp.allclose(out, a * x + y) # let's repeat again, this time allocates our own out buffer instead of cupy's # run in double precision - ker = mod.get_kernel("saxpy") + kernel = mod.get_kernel("saxpy") dtype = cp.float64 # prepare input @@ -93,18 +93,18 @@ # prepare output buf = dev.allocate( size * 8, # = dtype.itemsize - stream=s, + stream=stream, ) # prepare launch block = 64 grid = int((size + block - 1) // block) config = LaunchConfig(grid=grid, block=block) - ker_args = (a, x.data.ptr, y.data.ptr, buf, size) + kernel_args = (a, x.data.ptr, y.data.ptr, buf, size) - # launch kernel on stream s - launch(s, config, ker, *ker_args) - s.sync() + # launch kernel on stream + launch(stream, config, kernel, *kernel_args) + stream.sync() # check result # we wrap output buffer as a cupy array for simplicity @@ -115,5 +115,5 @@ finally: # cupy cleans up automatically the rest if buf is not None: - buf.close(s) - s.close() + buf.close(stream) + stream.close() diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 0fbb4466bb..882ce8bbb3 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -13,7 +13,7 @@ import cupy as cp -from cuda.core import Device, LaunchConfig, Program, launch, system +from cuda.core import Device, 
LaunchConfig, Program, ProgramOptions, launch, system if system.get_num_devices() < 2: print("this example requires at least 2 GPUs", file=sys.stderr) @@ -56,9 +56,9 @@ def __cuda_stream__(self): } } """ - prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"}) + prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")) mod_add = prog_add.compile("cubin") - ker_add = mod_add.get_kernel("vector_add") + add_kernel = mod_add.get_kernel("vector_add") # Set GPU 1 dev1 = Device(1) @@ -78,9 +78,9 @@ def __cuda_stream__(self): } } """ - prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"}) + prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}")) mod_sub = prog_sub.compile("cubin") - ker_sub = mod_sub.get_kernel("vector_sub") + sub_kernel = mod_sub.get_kernel("vector_sub") # Create launch configs for each kernel that will be executed on the respective # CUDA streams. @@ -103,7 +103,7 @@ def __cuda_stream__(self): stream0.wait(cp_stream0) # Launch the add kernel on GPU 0 / stream 0 - launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) + launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) # Allocate memory on GPU 1 # Note: This runs on CuPy's current stream for GPU 1. 
@@ -118,7 +118,7 @@ def __cuda_stream__(self): stream1.wait(cp_stream1) # Launch the subtract kernel on GPU 1 / stream 1 - launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) + launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) # Synchronize both GPUs are validate the results dev0.set_current() diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index d53c4278b2..9d4e4aacff 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -57,7 +57,7 @@ # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both # of which are supported by StridedMemoryView). @args_viewable_as_strided_memory((0,)) -def my_func(arr, work_stream, gpu_ker): +def my_func(arr, work_stream, kernel): # Create a memory view over arr (assumed to be a 1D array of int32). The stream # ordering is taken care of, so that arr can be safely accessed on our work # stream (ordered after a data stream on which arr is potentially prepared). @@ -73,7 +73,7 @@ def my_func(arr, work_stream, gpu_ker): block = 256 grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block) - launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size)) + launch(work_stream, config, kernel, view.ptr, np.uint64(size)) # Here we're being conservative and synchronize over our work stream, # assuming we do not know the data stream; if we know then we could # just order the data stream after the work stream here, e.g. @@ -101,24 +101,24 @@ def run(): # To know the GPU's compute capability, we need to identify which GPU to use. 
dev = Device(0) dev.set_current() - gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11")) - mod = gpu_prog.compile(target_type="cubin") - gpu_ker = mod.get_kernel(func_name) + prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11")) + mod = prog.compile(target_type="cubin") + kernel = mod.get_kernel(func_name) - s = dev.create_stream() + stream = dev.create_stream() try: # Create input array on GPU arr_gpu = cp.ones(1024, dtype=cp.int32) print(f"before: {arr_gpu[:10]=}") # Run the workload - my_func(arr_gpu, s, gpu_ker) + my_func(arr_gpu, stream, kernel) # Check the result print(f"after: {arr_gpu[:10]=}") assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32)) finally: - s.close() + stream.close() if __name__ == "__main__": diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index 5e36270eab..a5f50d4189 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -94,7 +94,7 @@ options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path), ) mod = prog.compile(target_type="cubin") -ker = mod.get_kernel("check_cluster_info") +kernel = mod.get_kernel("check_cluster_info") # prepare launch config grid = 4 @@ -126,7 +126,7 @@ block_dims[:] = 0 # launch kernel on the default stream - launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer) + launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer) dev.sync() # verify results diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 4c645fc7dd..e648a3846f 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -30,7 +30,7 @@ dev = Device() dev.set_current() -s = dev.create_stream() +stream = dev.create_stream() try: # prepare program @@ -39,7 +39,7 @@ mod = prog.compile("cubin", 
name_expressions=("vector_add",)) # run in single precision - ker = mod.get_kernel("vector_add") + kernel = mod.get_kernel("vector_add") dtype = cp.float32 # prepare input/output @@ -49,7 +49,7 @@ b = rng.random(size, dtype=dtype) c = cp.empty_like(a) - # cupy runs on a different stream from s, so sync before accessing + # cupy runs on a different stream from stream, so sync before accessing dev.sync() # prepare launch @@ -57,11 +57,11 @@ grid = (size + block - 1) // block config = LaunchConfig(grid=grid, block=block) - # launch kernel on stream s - launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) - s.sync() + # launch kernel on stream + launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) + stream.sync() # check result assert cp.allclose(c, a + b) finally: - s.close() + stream.close() diff --git a/cuda_core/tests/graph/test_explicit_integration.py b/cuda_core/tests/graph/test_explicit_integration.py index 53f4877e22..2595f4097b 100644 --- a/cuda_core/tests/graph/test_explicit_integration.py +++ b/cuda_core/tests/graph/test_explicit_integration.py @@ -245,41 +245,43 @@ def capture_result(): heat_cfg = LaunchConfig(grid=grid, block=block) tick_cfg = LaunchConfig(grid=1, block=1) + # fmt: off # Phase 1 — Allocate device memory a_curr = g.alloc(_HEAT_N * SIZEOF_FLOAT) a_next = g.alloc(_HEAT_N * SIZEOF_FLOAT) - a_ctr = g.alloc(SIZEOF_INT) + a_ctr = g.alloc(SIZEOF_INT) # Phase 2 — Initialise buffers m_curr = a_curr.memset(a_curr.dptr, 0, _HEAT_N * SIZEOF_FLOAT) m_next = a_next.memset(a_next.dptr, 0, _HEAT_N * SIZEOF_FLOAT) - m_ctr = a_ctr.memset(a_ctr.dptr, np.int32(_HEAT_ITERS), 1) + m_ctr = a_ctr.memset(a_ctr.dptr, np.int32(_HEAT_ITERS), 1) # Phase 3 — Boundary conditions (child graph) - p = ( - g.join(m_curr, m_next, m_ctr) - .embed( - GraphDef() - .memset(a_curr.dptr, np.float32(_HEAT_T_LEFT), 1) - .memset( - a_curr.dptr + (_HEAT_N - 1) * SIZEOF_FLOAT, - np.float32(_HEAT_T_RIGHT), - 1, - ) - .graph - ) - 
.record_event(event_start) - ) + bc = GraphDef() \ + .memset(a_curr.dptr, np.float32(_HEAT_T_LEFT), 1) \ + .memset(a_curr.dptr + (_HEAT_N - 1) * SIZEOF_FLOAT, + np.float32(_HEAT_T_RIGHT), 1) \ + .graph + p = g.join(m_curr, m_next, m_ctr) \ + .embed(bc) \ + .record_event(event_start) + # Phase 4 — Iterate loop = p.while_loop(condition) - loop.body.launch(heat_cfg, k_heat, a_next.dptr, a_curr.dptr, np.int32(_HEAT_N), _HEAT_ALPHA).memcpy( - a_curr.dptr, a_next.dptr, _HEAT_N * SIZEOF_FLOAT - ).launch(tick_cfg, k_countdown, condition.handle, a_ctr.dptr) + loop.body.launch(heat_cfg, k_heat, a_next.dptr, a_curr.dptr, + np.int32(_HEAT_N), _HEAT_ALPHA) \ + .memcpy(a_curr.dptr, a_next.dptr, _HEAT_N * SIZEOF_FLOAT) \ + .launch(tick_cfg, k_countdown, condition.handle, a_ctr.dptr) # Phase 5 — After loop: timing end, readback, verify, free memory - loop.wait_event(event_start).record_event(event_end).memcpy(host_ptr, a_curr.dptr, _HEAT_N * SIZEOF_FLOAT).callback( - capture_result - ).free(a_curr.dptr).free(a_next.dptr).free(a_ctr.dptr) + loop.wait_event(event_start) \ + .record_event(event_end) \ + .memcpy(host_ptr, a_curr.dptr, _HEAT_N * SIZEOF_FLOAT) \ + .callback(capture_result) \ + .free(a_curr.dptr) \ + .free(a_next.dptr) \ + .free(a_ctr.dptr) + # fmt: on # Phase 6 — Instantiate, launch, verify graph = g.instantiate() @@ -344,45 +346,39 @@ def _run_bisection_graph(dev, k_eval, k_hi, k_lo, k_cd, k_check, k_newton, host_ def capture_result(): results["root"] = ctypes.c_float.from_address(host_ptr).value + # fmt: off # Allocate and initialise: a = 0.0, b = 2.0, counter = ITERS - a = g.alloc(SIZEOF_FLOAT) - b = g.alloc(SIZEOF_FLOAT) + a = g.alloc(SIZEOF_FLOAT) + b = g.alloc(SIZEOF_FLOAT) ctr = g.alloc(SIZEOF_INT) - p = g.join( - a.memset(a.dptr, np.float32(0.0), 1), - b.memset(b.dptr, np.float32(2.0), 1), - ctr.memset(ctr.dptr, np.int32(_BISECT_ITERS), 1), - ) + p = g.join(a.memset(a.dptr, np.float32(0.0), 1), + b.memset(b.dptr, np.float32(2.0), 1), + ctr.memset(ctr.dptr, 
np.int32(_BISECT_ITERS), 1)) # While loop: bisection iterations while_cond = g.create_condition(default_value=1) - ie_cond = g.create_condition(default_value=0) + ie_cond = g.create_condition(default_value=0) loop = p.while_loop(while_cond) - ie = loop.body.launch( - cfg, - k_eval, - a.dptr, - b.dptr, - ie_cond.handle, - ).if_else(ie_cond) + ie = loop.body.launch(cfg, k_eval, a.dptr, b.dptr, ie_cond.handle) \ + .if_else(ie_cond) ie.then.launch(cfg, k_hi, a.dptr, b.dptr) ie.else_.launch(cfg, k_lo, a.dptr, b.dptr) ie.launch(cfg, k_cd, while_cond.handle, ctr.dptr) # Post-loop: Newton refinement (IfNode), readback, free if_cond = g.create_condition(default_value=0) - if_node = loop.launch( - cfg, - k_check, - a.dptr, - b.dptr, - if_cond.handle, - ).if_cond(if_cond) + if_node = loop.launch(cfg, k_check, a.dptr, b.dptr, if_cond.handle) \ + .if_cond(if_cond) if_node.then.launch(cfg, k_newton, a.dptr, b.dptr) - (if_node.memcpy(host_ptr, a.dptr, SIZEOF_FLOAT).callback(capture_result).free(a.dptr).free(b.dptr).free(ctr.dptr)) + if_node.memcpy(host_ptr, a.dptr, SIZEOF_FLOAT) \ + .callback(capture_result) \ + .free(a.dptr) \ + .free(b.dptr) \ + .free(ctr.dptr) + # fmt: on # Instantiate, launch, verify graph = g.instantiate() @@ -448,16 +444,20 @@ def _run_switch_graph(dev, mode, k_negate, k_double, k_square, host_ptr): g = GraphDef() cfg = LaunchConfig(grid=1, block=1) + # fmt: off x = g.alloc(SIZEOF_INT) sw_cond = g.create_condition(default_value=mode) - sw = x.memset(x.dptr, np.int32(_SWITCH_VALUE), 1).switch(sw_cond, 4) + sw = x.memset(x.dptr, np.int32(_SWITCH_VALUE), 1) \ + .switch(sw_cond, 4) sw.branches[0].launch(cfg, k_negate, x.dptr) sw.branches[1].launch(cfg, k_double, x.dptr) sw.branches[2].launch(cfg, k_square, x.dptr) # branch 3: identity (no kernel — value unchanged) - sw.memcpy(host_ptr, x.dptr, SIZEOF_INT).free(x.dptr) + sw.memcpy(host_ptr, x.dptr, SIZEOF_INT) \ + .free(x.dptr) + # fmt: on graph = g.instantiate() stream = dev.create_stream() diff --git 
a/cuda_core/tests/graph/test_explicit_lifetime.py b/cuda_core/tests/graph/test_explicit_lifetime.py index d99590e355..f355fa821d 100644 --- a/cuda_core/tests/graph/test_explicit_lifetime.py +++ b/cuda_core/tests/graph/test_explicit_lifetime.py @@ -22,7 +22,6 @@ KernelNode, ) - # ============================================================================= # Conditional body graph lifetime # ============================================================================= @@ -153,7 +152,7 @@ def test_nested_child_graph_lifetime(init_cuda): middle_ref = outer_node.child_graph middle_nodes = middle_ref.nodes() - child_node = [n for n in middle_nodes if isinstance(n, ChildGraphNode)][0] + child_node = next(n for n in middle_nodes if isinstance(n, ChildGraphNode)) grandchild = child_node.child_graph del outer, outer_node, middle, inner, middle_ref, middle_nodes, child_node @@ -279,17 +278,14 @@ def test_event_survives_graph_clone_and_execution(init_cuda): rec = g.record_event(event) rec.wait_event(event) - cloned_cu_graph = handle_return( - driver.cuGraphClone(driver.CUgraph(g.handle))) + cloned_cu_graph = handle_return(driver.cuGraphClone(driver.CUgraph(g.handle))) del event, g, rec gc.collect() graph_exec = handle_return(driver.cuGraphInstantiate(cloned_cu_graph, 0)) stream = dev.create_stream() - handle_return( - driver.cuGraphLaunch( - graph_exec, driver.CUstream(int(stream.handle)))) + handle_return(driver.cuGraphLaunch(graph_exec, driver.CUstream(int(stream.handle)))) stream.sync() @@ -348,17 +344,14 @@ def test_kernel_survives_graph_clone_and_execution(init_cuda): g = GraphDef() g.launch(config, kernel) - cloned_cu_graph = handle_return( - driver.cuGraphClone(driver.CUgraph(g.handle))) + cloned_cu_graph = handle_return(driver.cuGraphClone(driver.CUgraph(g.handle))) del kernel, mod, g gc.collect() graph_exec = handle_return(driver.cuGraphInstantiate(cloned_cu_graph, 0)) stream = dev.create_stream() - handle_return( - driver.cuGraphLaunch( - graph_exec, 
driver.CUstream(int(stream.handle)))) + handle_return(driver.cuGraphLaunch(graph_exec, driver.CUstream(int(stream.handle)))) stream.sync() diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index cc88f6b19a..2bc7e25d21 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -538,11 +538,9 @@ def test_kernel_from_handle_foreign_kernel(init_cuda): cubin = mod.code sym_map = mod.symbol_mapping - cu_lib = handle_return( - driver.cuLibraryLoadData(cubin, [], [], 0, [], [], 0)) + cu_lib = handle_return(driver.cuLibraryLoadData(cubin, [], [], 0, [], [], 0)) mangled = sym_map["saxpy"] - cu_kernel = handle_return( - driver.cuLibraryGetKernel(cu_lib, mangled)) + cu_kernel = handle_return(driver.cuLibraryGetKernel(cu_lib, mangled)) handle = int(cu_kernel) k = Kernel.from_handle(handle) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py index e189bb127a..89fa07445d 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/descriptor_catalog.py @@ -266,6 +266,29 @@ class DescriptorSpec: linux_sonames=("libcufile.so.0",), site_packages_linux=("nvidia/cu13/lib", "nvidia/cufile/lib"), ), + DescriptorSpec( + name="cupti", + packaged_with="ctk", + linux_sonames=("libcupti.so.12", "libcupti.so.13"), + windows_dlls=( + "cupti64_2025.4.1.dll", + "cupti64_2025.3.1.dll", + "cupti64_2025.2.1.dll", + "cupti64_2025.1.1.dll", + "cupti64_2024.3.2.dll", + "cupti64_2024.2.1.dll", + "cupti64_2024.1.1.dll", + "cupti64_2023.3.1.dll", + "cupti64_2023.2.2.dll", + "cupti64_2023.1.1.dll", + "cupti64_2022.4.1.dll", + ), + site_packages_linux=("nvidia/cu13/lib", "nvidia/cuda_cupti/lib"), + site_packages_windows=("nvidia/cu13/bin/x86_64", "nvidia/cuda_cupti/bin"), + anchor_rel_dirs_linux=("extras/CUPTI/lib64", "lib"), + anchor_rel_dirs_windows=("extras/CUPTI/lib64", "bin"), + 
ctk_root_canary_anchor_libnames=("cudart",), + ), # ----------------------------------------------------------------------- # Third-party / separately packaged libraries # ----------------------------------------------------------------------- diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_platform.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_platform.py index 817ac0b65f..95e0f4dd1e 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_platform.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/search_platform.py @@ -141,10 +141,20 @@ def find_in_lib_dir( error_messages: list[str], attachments: list[str], ) -> str | None: + # Most libraries have both unversioned and versioned files/symlinks (exact match first) so_name = os.path.join(lib_dir, lib_searched_for) if os.path.isfile(so_name): return so_name - error_messages.append(f"No such file: {so_name}") + # Some libraries only exist as versioned files (e.g., libcupti.so.13 in conda), + # so the glob fallback is needed + file_wild = lib_searched_for + "*" + # Only one match is expected, but to ensure deterministic behavior in unexpected + # situations, and to be internally consistent, we sort in reverse order with the + # intent to return the newest version first. 
+ for so_name in sorted(glob.glob(os.path.join(lib_dir, file_wild)), reverse=True): + if os.path.isfile(so_name): + return so_name + error_messages.append(f"No such file: {file_wild}") attachments.append(f' listdir("{lib_dir}"):') if not os.path.isdir(lib_dir): attachments.append(" DIRECTORY DOES NOT EXIST") diff --git a/cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py b/cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py index 1727cca607..13f47fc2b5 100644 --- a/cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py +++ b/cuda_pathfinder/cuda/pathfinder/_headers/find_nvidia_headers.py @@ -8,6 +8,10 @@ import os from dataclasses import dataclass +from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import ( + _resolve_system_loaded_abs_path_in_subprocess, +) +from cuda.pathfinder._dynamic_libs.search_steps import derive_ctk_root from cuda.pathfinder._headers import supported_nvidia_headers from cuda.pathfinder._utils.env_vars import get_cuda_home_or_path from cuda.pathfinder._utils.find_sub_dirs import find_sub_dirs_all_sitepackages @@ -91,6 +95,23 @@ def _find_based_on_conda_layout(libname: str, h_basename: str, ctk_layout: bool) return None +def _find_ctk_header_directory_via_canary(libname: str, h_basename: str) -> str | None: + """Try CTK header lookup via CTK-root canary probing. + + Uses the same canary as dynamic-library CTK-root discovery: system-load + ``cudart`` in a spawned child process, derive CTK root from the resolved + absolute library path, then search the expected CTK include layout under + that root. 
+ """ + canary_abs_path = _resolve_system_loaded_abs_path_in_subprocess("cudart") + if canary_abs_path is None: + return None + ctk_root = derive_ctk_root(canary_abs_path) + if ctk_root is None: + return None + return _locate_based_on_ctk_layout(libname, h_basename, ctk_root) + + def _find_ctk_header_directory(libname: str) -> LocatedHeaderDir | None: h_basename = supported_nvidia_headers.SUPPORTED_HEADERS_CTK[libname] candidate_dirs = supported_nvidia_headers.SUPPORTED_SITE_PACKAGE_HEADER_DIRS_CTK[libname] @@ -106,6 +127,9 @@ def _find_ctk_header_directory(libname: str) -> LocatedHeaderDir | None: if cuda_home and (result := _locate_based_on_ctk_layout(libname, h_basename, cuda_home)): return LocatedHeaderDir(abs_path=result, found_via="CUDA_HOME") + if result := _find_ctk_header_directory_via_canary(libname, h_basename): + return LocatedHeaderDir(abs_path=result, found_via="system-ctk-root") + return None @@ -139,6 +163,12 @@ def locate_nvidia_header_directory(libname: str) -> LocatedHeaderDir | None: 3. **CUDA Toolkit environment variables** - Use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order). + + 4. **CTK root canary probe** + + - Probe a system-loaded ``cudart`` in a spawned child process, + derive the CTK root from the resolved library path, then search + CTK include layout under that root. """ if libname in supported_nvidia_headers.SUPPORTED_HEADERS_CTK: @@ -195,6 +225,12 @@ def find_nvidia_header_directory(libname: str) -> str | None: 3. **CUDA Toolkit environment variables** - Use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order). + + 4. **CTK root canary probe** + + - Probe a system-loaded ``cudart`` in a spawned child process, + derive the CTK root from the resolved library path, then search + CTK include layout under that root. 
""" found = locate_nvidia_header_directory(libname) return found.abs_path if found else None diff --git a/cuda_pathfinder/docs/nv-versions.json b/cuda_pathfinder/docs/nv-versions.json index a8498094b5..eb0e60239e 100644 --- a/cuda_pathfinder/docs/nv-versions.json +++ b/cuda_pathfinder/docs/nv-versions.json @@ -3,6 +3,10 @@ "version": "latest", "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/latest/" }, + { + "version": "1.4.1", + "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/1.4.1/" + }, { "version": "1.4.0", "url": "https://nvidia.github.io/cuda-python/cuda-pathfinder/1.4.0/" diff --git a/cuda_pathfinder/docs/source/release/1.4.1-notes.rst b/cuda_pathfinder/docs/source/release/1.4.1-notes.rst new file mode 100644 index 0000000000..836a62f03d --- /dev/null +++ b/cuda_pathfinder/docs/source/release/1.4.1-notes.rst @@ -0,0 +1,49 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +.. py:currentmodule:: cuda.pathfinder + +``cuda-pathfinder`` 1.4.1 Release notes +======================================= + +Released on Mar 6, 2026 + +Highlights +---------- + +* Add CTK canary fallback for header discovery. When CUDA headers cannot be + found via site-packages, Conda, or ``CUDA_HOME``/``CUDA_PATH``, the system + now attempts to discover the CTK root via canary probing (using a + system-loaded ``cudart`` library path) and searches the CTK include layout + from that root. This improves header discovery in standard and non-standard + CTK installations, including containerized environments. + (`PR #1731 `_) + +* Add support for loading CUPTI (CUDA Profiling Tools Interface) dynamic libs + via ``load_nvidia_dynamic_lib("cupti")`` on Linux and Windows. CUPTI libraries + are discovered in site-packages (CUDA 12 and 13), Conda environments, and + CTK installations (via CUDA_HOME/CUDA_PATH or canary probe). 
+ Supports ``libcupti.so.12``, ``libcupti.so.13`` on Linux and versioned + ``cupti64_*.dll`` files on Windows. + (`PR #1693 `_) + +* Add support for finding static libraries (e.g., ``libcudadevrt.a`` on Linux, + ``cudadevrt.lib`` on Windows) via new ``find_static_lib()`` and + ``locate_static_lib()`` APIs. These follow the same search order as bitcode + libraries: site-packages, Conda, then CUDA_HOME/CUDA_PATH. + (`PR #1690 `_) + +* Fix site-packages search order for virtual environments created with + ``--system-site-packages``. The search now correctly prioritizes the venv's + site-packages before user-site-packages, conforming to PEP 405. + (`PR #1717 `_) + +Internal refactoring +-------------------- + +* Refactor library discovery and loading to use a descriptor-driven architecture. + All library metadata (SONAMEs, DLLs, site-packages paths, dependencies, loader + flags) is now consolidated into a single ``LibDescriptor`` registry, improving + maintainability and extensibility. This is an internal refactoring with no + behavioral changes. + (`PR #1685 `_) diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml index 21299d3366..fdd01b763b 100644 --- a/cuda_pathfinder/pyproject.toml +++ b/cuda_pathfinder/pyproject.toml @@ -19,7 +19,7 @@ test = [ ] # Internal organization of test dependencies. 
cu12 = [ - "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl]==12.*", + "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl,cupti]==12.*", "cuda-toolkit[cufile]==12.*; sys_platform != 'win32'", "cutensor-cu12", "nvidia-cublasmp-cu12; sys_platform != 'win32'", @@ -31,7 +31,7 @@ cu12 = [ "nvidia-nvshmem-cu12; sys_platform != 'win32'", ] cu13 = [ - "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl,nvvm]==13.*", + "cuda-toolkit[nvcc,cublas,nvrtc,cudart,cufft,curand,cusolver,cusparse,npp,nvfatbin,nvjitlink,nvjpeg,cccl,cupti,nvvm]==13.*", "cuda-toolkit[cufile]==13.*; sys_platform != 'win32'", "cutensor-cu13", "nvidia-cublasmp-cu13; sys_platform != 'win32'", diff --git a/cuda_pathfinder/tests/test_find_nvidia_headers.py b/cuda_pathfinder/tests/test_find_nvidia_headers.py index f14681546d..2732de216b 100644 --- a/cuda_pathfinder/tests/test_find_nvidia_headers.py +++ b/cuda_pathfinder/tests/test_find_nvidia_headers.py @@ -16,10 +16,15 @@ import importlib.metadata import os import re +from pathlib import Path import pytest +import cuda.pathfinder._headers.find_nvidia_headers as find_nvidia_headers_module from cuda.pathfinder import LocatedHeaderDir, find_nvidia_header_directory, locate_nvidia_header_directory +from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import ( + _resolve_system_loaded_abs_path_in_subprocess, +) from cuda.pathfinder._headers.supported_nvidia_headers import ( SUPPORTED_HEADERS_CTK, SUPPORTED_HEADERS_CTK_ALL, @@ -28,6 +33,7 @@ SUPPORTED_INSTALL_DIRS_NON_CTK, SUPPORTED_SITE_PACKAGE_HEADER_DIRS_CTK, ) +from cuda.pathfinder._utils.platform_aware import IS_WINDOWS STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_FIND_NVIDIA_HEADERS_STRICTNESS", "see_what_works") assert STRICTNESS in ("see_what_works", "all_must_work") @@ -46,7 +52,13 @@ def test_unknown_libname(): def 
_located_hdr_dir_asserts(located_hdr_dir): assert isinstance(located_hdr_dir, LocatedHeaderDir) - assert located_hdr_dir.found_via in ("site-packages", "conda", "CUDA_HOME", "supported_install_dir") + assert located_hdr_dir.found_via in ( + "site-packages", + "conda", + "CUDA_HOME", + "system-ctk-root", + "supported_install_dir", + ) def test_non_ctk_importlib_metadata_distributions_names(): @@ -62,6 +74,36 @@ def have_distribution_for(libname: str) -> bool: ) +@pytest.fixture +def clear_locate_nvidia_header_cache(): + locate_nvidia_header_directory.cache_clear() + _resolve_system_loaded_abs_path_in_subprocess.cache_clear() + yield + locate_nvidia_header_directory.cache_clear() + _resolve_system_loaded_abs_path_in_subprocess.cache_clear() + + +def _create_ctk_header(ctk_root: Path, libname: str) -> str: + """Create a fake CTK header file and return its directory.""" + header_basename = SUPPORTED_HEADERS_CTK[libname] + if libname == "nvvm": + header_dir = ctk_root / "nvvm" / "include" + elif libname == "cccl": + header_dir = ctk_root / "include" / "cccl" + else: + header_dir = ctk_root / "include" + header_path = header_dir / header_basename + header_path.parent.mkdir(parents=True, exist_ok=True) + header_path.touch() + return str(header_dir) + + +def _fake_cudart_canary_abs_path(ctk_root: Path) -> str: + if IS_WINDOWS: + return str(ctk_root / "bin" / "x64" / "cudart64_13.dll") + return str(ctk_root / "lib64" / "libcudart.so.13") + + @pytest.mark.parametrize("libname", SUPPORTED_HEADERS_NON_CTK.keys()) def test_locate_non_ctk_headers(info_summary_append, libname): hdr_dir = find_nvidia_header_directory(libname) @@ -110,3 +152,85 @@ def test_locate_ctk_headers(info_summary_append, libname): assert os.path.isfile(os.path.join(hdr_dir, h_filename)) if STRICTNESS == "all_must_work": assert hdr_dir is not None + + +@pytest.mark.usefixtures("clear_locate_nvidia_header_cache") +def test_locate_ctk_headers_uses_canary_fallback_when_cuda_home_unset(tmp_path, monkeypatch, 
mocker): + ctk_root = tmp_path / "cuda-system" + expected_hdr_dir = _create_ctk_header(ctk_root, "cudart") + + monkeypatch.delenv("CONDA_PREFIX", raising=False) + monkeypatch.delenv("CUDA_HOME", raising=False) + monkeypatch.delenv("CUDA_PATH", raising=False) + mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[]) + probe = mocker.patch.object( + find_nvidia_headers_module, + "_resolve_system_loaded_abs_path_in_subprocess", + return_value=_fake_cudart_canary_abs_path(ctk_root), + ) + + located_hdr_dir = locate_nvidia_header_directory("cudart") + + assert located_hdr_dir is not None + assert located_hdr_dir.abs_path == expected_hdr_dir + assert located_hdr_dir.found_via == "system-ctk-root" + probe.assert_called_once_with("cudart") + + +@pytest.mark.usefixtures("clear_locate_nvidia_header_cache") +def test_locate_ctk_headers_cuda_home_takes_priority_over_canary(tmp_path, monkeypatch, mocker): + cuda_home = tmp_path / "cuda-home" + expected_hdr_dir = _create_ctk_header(cuda_home, "cudart") + canary_root = tmp_path / "cuda-system" + _create_ctk_header(canary_root, "cudart") + + monkeypatch.delenv("CONDA_PREFIX", raising=False) + monkeypatch.setenv("CUDA_HOME", str(cuda_home)) + monkeypatch.delenv("CUDA_PATH", raising=False) + mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[]) + probe = mocker.patch.object( + find_nvidia_headers_module, + "_resolve_system_loaded_abs_path_in_subprocess", + return_value=_fake_cudart_canary_abs_path(canary_root), + ) + + located_hdr_dir = locate_nvidia_header_directory("cudart") + + assert located_hdr_dir is not None + assert located_hdr_dir.abs_path == expected_hdr_dir + assert located_hdr_dir.found_via == "CUDA_HOME" + probe.assert_not_called() + + +@pytest.mark.usefixtures("clear_locate_nvidia_header_cache") +def test_locate_ctk_headers_canary_miss_paths_are_non_fatal(monkeypatch, mocker): + monkeypatch.delenv("CONDA_PREFIX", raising=False) + 
monkeypatch.delenv("CUDA_HOME", raising=False) + monkeypatch.delenv("CUDA_PATH", raising=False) + mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[]) + mocker.patch.object( + find_nvidia_headers_module, + "_resolve_system_loaded_abs_path_in_subprocess", + return_value=None, + ) + + assert locate_nvidia_header_directory("cudart") is None + assert find_nvidia_header_directory("cudart") is None + + +@pytest.mark.usefixtures("clear_locate_nvidia_header_cache") +def test_locate_ctk_headers_canary_probe_errors_are_not_masked(monkeypatch, mocker): + monkeypatch.delenv("CONDA_PREFIX", raising=False) + monkeypatch.delenv("CUDA_HOME", raising=False) + monkeypatch.delenv("CUDA_PATH", raising=False) + mocker.patch.object(find_nvidia_headers_module, "find_sub_dirs_all_sitepackages", return_value=[]) + mocker.patch.object( + find_nvidia_headers_module, + "_resolve_system_loaded_abs_path_in_subprocess", + side_effect=RuntimeError("canary probe failed"), + ) + + with pytest.raises(RuntimeError, match="canary probe failed"): + locate_nvidia_header_directory("cudart") + with pytest.raises(RuntimeError, match="canary probe failed"): + find_nvidia_header_directory("cudart") diff --git a/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib_using_mocker.py b/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib_using_mocker.py new file mode 100644 index 0000000000..3510d1933e --- /dev/null +++ b/cuda_pathfinder/tests/test_load_nvidia_dynamic_lib_using_mocker.py @@ -0,0 +1,173 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from cuda.pathfinder._dynamic_libs import load_nvidia_dynamic_lib as load_mod +from cuda.pathfinder._dynamic_libs import search_steps as steps_mod +from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL +from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import ( + _load_lib_no_cache, + _resolve_system_loaded_abs_path_in_subprocess, +) +from cuda.pathfinder._dynamic_libs.search_steps import EARLY_FIND_STEPS +from cuda.pathfinder._utils.platform_aware import IS_WINDOWS + +_MODULE = "cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib" +_STEPS_MODULE = "cuda.pathfinder._dynamic_libs.search_steps" + + +@pytest.fixture(autouse=True) +def _clear_canary_subprocess_probe_cache(): + _resolve_system_loaded_abs_path_in_subprocess.cache_clear() + yield + _resolve_system_loaded_abs_path_in_subprocess.cache_clear() + + +def _make_loaded_dl(path, found_via): + return LoadedDL(path, False, 0xDEAD, found_via) + + +def _create_cupti_in_ctk(ctk_root): + """Create a fake cupti lib in extras/CUPTI/lib64.""" + if IS_WINDOWS: + cupti_dir = ctk_root / "extras" / "CUPTI" / "lib64" + cupti_dir.mkdir(parents=True, exist_ok=True) + cupti_lib = cupti_dir / "cupti64_2025.4.1.dll" + else: + cupti_dir = ctk_root / "extras" / "CUPTI" / "lib64" + cupti_dir.mkdir(parents=True, exist_ok=True) + cupti_lib = cupti_dir / "libcupti.so.13" + # Create symlink like real CTK installations + cupti_symlink = cupti_dir / "libcupti.so" + cupti_symlink.symlink_to("libcupti.so.13") + cupti_lib.write_bytes(b"fake") + return cupti_lib + + +# --------------------------------------------------------------------------- +# Conda tests +# Note: Site-packages and CTK are covered by real CI tests. +# Mock tests focus on Conda (not covered by real CI) and error paths. 
+# --------------------------------------------------------------------------- + + +def test_cupti_found_in_conda(tmp_path, mocker, monkeypatch): + """Test finding cupti in conda environment.""" + if IS_WINDOWS: + pytest.skip("Windows support for cupti not yet implemented") + + # Create conda structure + conda_prefix = tmp_path / "conda_env" + conda_lib_dir = conda_prefix / "lib" + conda_lib_dir.mkdir(parents=True) + cupti_lib = conda_lib_dir / "libcupti.so.13" + cupti_lib.write_bytes(b"fake") + + # Mock conda discovery + monkeypatch.setenv("CONDA_PREFIX", str(conda_prefix)) + + # Disable site-packages search + def _run_find_steps_without_site_packages(ctx, steps): + if steps is EARLY_FIND_STEPS: + # Skip site-packages, only run conda + from cuda.pathfinder._dynamic_libs.search_steps import find_in_conda + + result = find_in_conda(ctx) + return result + return steps_mod.run_find_steps(ctx, steps) + + mocker.patch(f"{_MODULE}.run_find_steps", side_effect=_run_find_steps_without_site_packages) + mocker.patch.object(load_mod.LOADER, "check_if_already_loaded_from_elsewhere", return_value=None) + mocker.patch(f"{_MODULE}.load_dependencies") + mocker.patch.object(load_mod.LOADER, "load_with_system_search", return_value=None) + mocker.patch(f"{_STEPS_MODULE}.get_cuda_home_or_path", return_value=None) + mocker.patch(f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", return_value=None) + mocker.patch.object( + load_mod.LOADER, + "load_with_abs_path", + side_effect=lambda _desc, path, via: _make_loaded_dl(path, via), + ) + + result = _load_lib_no_cache("cupti") + assert result.found_via == "conda" + assert result.abs_path == str(cupti_lib) + + +# --------------------------------------------------------------------------- +# Error path tests +# --------------------------------------------------------------------------- + + +def test_cupti_not_found_raises_error(mocker): + """Test that DynamicLibNotFoundError is raised when cupti is not found.""" + if IS_WINDOWS: + 
pytest.skip("Windows support for cupti not yet implemented") + + # Mock all search paths to return None + def _run_find_steps_disabled(ctx, steps): + return None + + mocker.patch(f"{_MODULE}.run_find_steps", side_effect=_run_find_steps_disabled) + mocker.patch.object(load_mod.LOADER, "check_if_already_loaded_from_elsewhere", return_value=None) + mocker.patch(f"{_MODULE}.load_dependencies") + mocker.patch.object(load_mod.LOADER, "load_with_system_search", return_value=None) + mocker.patch(f"{_STEPS_MODULE}.get_cuda_home_or_path", return_value=None) + mocker.patch( + f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", + return_value=None, + ) + + with pytest.raises(DynamicLibNotFoundError): + _load_lib_no_cache("cupti") + + +# --------------------------------------------------------------------------- +# Search order tests (Conda-specific, since Conda is not covered by real CI) +# --------------------------------------------------------------------------- + + +def test_cupti_search_order_conda_before_cuda_home(tmp_path, mocker, monkeypatch): + """Test that conda is searched before CUDA_HOME (CTK). + + This test is important because Conda is not covered by real CI tests, + so we need to verify the search order between Conda and CTK. 
+ """ + if IS_WINDOWS: + pytest.skip("Windows support for cupti not yet implemented") + + # Create both conda and CUDA_HOME structures + conda_prefix = tmp_path / "conda_env" + conda_lib_dir = conda_prefix / "lib" + conda_lib_dir.mkdir(parents=True) + conda_cupti_lib = conda_lib_dir / "libcupti.so.13" + conda_cupti_lib.write_bytes(b"fake") + + ctk_root = tmp_path / "cuda-13.1" + _create_cupti_in_ctk(ctk_root) + + # Mock discovery - disable site-packages, enable conda + def _run_find_steps_without_site_packages(ctx, steps): + if steps is EARLY_FIND_STEPS: + # Skip site-packages, only run conda + from cuda.pathfinder._dynamic_libs.search_steps import find_in_conda + + result = find_in_conda(ctx) + return result + return steps_mod.run_find_steps(ctx, steps) + + mocker.patch(f"{_MODULE}.run_find_steps", side_effect=_run_find_steps_without_site_packages) + monkeypatch.setenv("CONDA_PREFIX", str(conda_prefix)) + mocker.patch.object(load_mod.LOADER, "check_if_already_loaded_from_elsewhere", return_value=None) + mocker.patch(f"{_MODULE}.load_dependencies") + mocker.patch.object(load_mod.LOADER, "load_with_system_search", return_value=None) + mocker.patch(f"{_STEPS_MODULE}.get_cuda_home_or_path", return_value=str(ctk_root)) + mocker.patch.object( + load_mod.LOADER, + "load_with_abs_path", + side_effect=lambda _desc, path, via: _make_loaded_dl(path, via), + ) + + result = _load_lib_no_cache("cupti") + assert result.found_via == "conda" + assert result.abs_path == str(conda_cupti_lib) diff --git a/ruff.toml b/ruff.toml index 7f3853529e..76f548848c 100644 --- a/ruff.toml +++ b/ruff.toml @@ -123,13 +123,15 @@ inline-quotes = "double" ] # CUDA bindings mirror C API naming conventions (CamelCase types, camelCase functions) -"cuda_bindings/**" = [ +# Keep examples opted-in to enforce naming conventions in example-local identifiers. 
+"cuda_bindings/{benchmarks,cuda,docs,tests}/**" = [ "N801", # invalid-class-name "N802", # invalid-function-name "N803", # invalid-argument-name "N806", # non-lowercase-variable-in-function "N816", # mixed-case-variable-in-global-scope ] +"cuda_bindings/{build_hooks.py,setup.py}" = ["N801", "N802", "N803", "N806", "N816"] # scripts and build tooling — print is the expected output method "toolshed/**" = ["T201"]