Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
478101c
Convert _graph.py to _graph/ package for explicit graph construction …
Andy-Jost Mar 3, 2026
ee55795
Added GraphHandle to RAII module.
Andy-Jost Mar 4, 2026
3b3a715
Add GraphDef and Node classes for explicit graph construction
Andy-Jost Mar 4, 2026
7559d2f
Add Node class hierarchy with type-specific properties and parameteri…
Andy-Jost Mar 5, 2026
ae5d706
Add MemsetNode with shared _parse_fill_value utility
Andy-Jost Mar 5, 2026
ba7d2f9
Add EventRecordNode and EventWaitNode with Event.from_handle support
Andy-Jost Mar 5, 2026
77a5dac
Replace GraphDef.root with forwarding methods and GraphDef.join
Andy-Jost Mar 5, 2026
1a790ce
Improve __repr__ for graph nodes, add Node.handle, use as_py for Grap…
Andy-Jost Mar 5, 2026
3e25c64
Add MemcpyNode with auto-detected memory types
Andy-Jost Mar 5, 2026
228de38
Add ChildGraphNode with embed() builder and non-owning graph handle
Andy-Jost Mar 5, 2026
506850b
Add HostCallbackNode with Python callable and ctypes CFUNCTYPE support
Andy-Jost Mar 5, 2026
4ee0ed7
Fix dangling child graph references by capturing parent handle
Andy-Jost Mar 5, 2026
82f0ec7
Add conditional graph nodes (IfNode, IfElseNode, WhileNode, SwitchNode)
Andy-Jost Mar 5, 2026
5cf85c4
Reconstruct conditional node subtypes on CUDA 13.2+ drivers
Andy-Jost Mar 5, 2026
d993f9c
Apply developer guide styling to _graphdef.pyx
Andy-Jost Mar 5, 2026
da73536
Fix conditional node body graphs and add integration tests
Andy-Jost Mar 6, 2026
d228830
Merge remote-tracking branch 'origin/main' into explicit-graph-constr…
Andy-Jost Mar 6, 2026
b463ccb
Add lifetime and error/edge-case tests for explicit graph construction
Andy-Jost Mar 6, 2026
f1cceb2
Add RAII NodeHandle, event/kernel lifetime via user objects, consolid…
Andy-Jost Mar 6, 2026
133719b
Move Event metadata fields to EventBox, access via get_box() pointer …
Andy-Jost Mar 6, 2026
e7ebe53
Add HandleRegistry template and event reverse-lookup
Andy-Jost Mar 6, 2026
f0bbf66
Add HandleRegistry template, event reverse-lookup, refactor IPC cache
Andy-Jost Mar 6, 2026
b830e6e
Add kernel reverse-lookup registry, fix create_kernel_handle_ref sema…
Andy-Jost Mar 6, 2026
b55782a
Rename NodeHandle to GraphNodeHandle for consistency with driver term…
Andy-Jost Mar 6, 2026
73ba7fe
Merge remote-tracking branch 'origin/main' into explicit-graph-constr…
Andy-Jost Mar 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 188 additions & 38 deletions cuda_core/cuda/core/_cpp/resource_handles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ decltype(&cuLibraryLoadData) p_cuLibraryLoadData = nullptr;
decltype(&cuLibraryUnload) p_cuLibraryUnload = nullptr;
decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr;

// Graph
decltype(&cuGraphDestroy) p_cuGraphDestroy = nullptr;

// Linker
decltype(&cuLinkDestroy) p_cuLinkDestroy = nullptr;

Expand Down Expand Up @@ -160,6 +163,49 @@ class GILAcquireGuard {

} // namespace

// ============================================================================
// Handle reverse-lookup registry
//
// Maps raw CUDA handles (CUevent, CUkernel, etc.) back to their owning
// shared_ptr so that _ref constructors can recover full metadata.
// Uses weak_ptr to avoid preventing destruction.
// ============================================================================

// Reverse-lookup table from a raw driver handle (CUevent, CUkernel, ...) to
// the owning shared_ptr handle, so *_ref constructors can recover full
// metadata.  Entries are weak_ptr so the registry never extends lifetimes.
template<typename Key, typename Handle, typename Hash = std::hash<Key>>
class HandleRegistry {
public:
    using Pointee = typename Handle::element_type;

    // Record (or replace) the mapping for `key`.
    void register_handle(const Key& key, const Handle& h) {
        const std::lock_guard<std::mutex> guard(lock_);
        entries_[key] = h;
    }

    // Remove `key`, but only if its entry has already expired.  Leaving a
    // live entry in place avoids the race where another thread re-registered
    // the same key before the old owner's deleter ran.  Called from deleters,
    // so it must never throw.
    void unregister_handle(const Key& key) noexcept {
        try {
            const std::lock_guard<std::mutex> guard(lock_);
            auto pos = entries_.find(key);
            if (pos != entries_.end() && pos->second.expired()) {
                entries_.erase(pos);
            }
        } catch (...) {
            // Best-effort cleanup; swallow everything in destructor context.
        }
    }

    // Resolve `key` to a live handle, or an empty handle when the key is
    // unknown or its owner has been destroyed.  Expired entries are pruned.
    Handle lookup(const Key& key) {
        const std::lock_guard<std::mutex> guard(lock_);
        auto pos = entries_.find(key);
        if (pos == entries_.end()) {
            return Handle{};
        }
        Handle strong = pos->second.lock();
        if (!strong) {
            entries_.erase(pos);  // prune the stale entry
        }
        return strong;
    }

private:
    std::mutex lock_;
    std::unordered_map<Key, std::weak_ptr<Pointee>, Hash> entries_;
};

// ============================================================================
// Thread-local error handling
// ============================================================================
Expand Down Expand Up @@ -318,47 +364,98 @@ StreamHandle get_per_thread_stream() {
namespace {
// Backing storage for an EventHandle.  The flag/metadata fields are fixed at
// event creation time and cannot be queried back through the driver API, so
// they must travel with the handle; get_box() recovers this box from the
// aliased &resource pointer held by the EventHandle.
struct EventBox {
    CUevent resource;
    bool timing_disabled;    // event created without timing support
    bool busy_waited;        // event uses busy-wait (blocking-sync disabled)
    bool ipc_enabled;        // event created with IPC support
    int device_id;           // creating device, or -1 when unknown
    ContextHandle h_context; // creating context; may be empty
};
} // namespace

EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) {
// Recover the owning EventBox from an EventHandle.  EventHandle is an
// aliasing shared_ptr whose stored pointer is &EventBox::resource, so
// container-of arithmetic with offsetof maps it back to the full box.
// NOTE(review): relies on EventBox being standard-layout for offsetof to be
// well-defined — confirm if members ever gain non-trivial types.
static const EventBox* get_box(const EventHandle& h) {
    const CUevent* p = h.get();
    return reinterpret_cast<const EventBox*>(
        reinterpret_cast<const char*>(p) - offsetof(EventBox, resource)
    );
}

// Metadata accessors.  Each returns the creation-time value recorded in the
// EventBox, or a conservative default when the handle is empty.

// Empty handles report true: no timing is available without an event.
bool get_event_timing_disabled(const EventHandle& h) noexcept {
    return h ? get_box(h)->timing_disabled : true;
}

bool get_event_busy_waited(const EventHandle& h) noexcept {
    return h ? get_box(h)->busy_waited : false;
}

bool get_event_ipc_enabled(const EventHandle& h) noexcept {
    return h ? get_box(h)->ipc_enabled : false;
}

// Returns -1 when the handle is empty or the device is unknown.
int get_event_device_id(const EventHandle& h) noexcept {
    return h ? get_box(h)->device_id : -1;
}

// Returns an empty ContextHandle when the handle is empty or no context was
// recorded at creation time.
ContextHandle get_event_context(const EventHandle& h) noexcept {
    return h ? get_box(h)->h_context : ContextHandle{};
}

static HandleRegistry<CUevent, EventHandle> event_registry;

// Create an owning event handle.  The metadata flags are stored in the
// EventBox because the driver cannot report them back (see the get_event_*
// accessors).  The handle is also entered into event_registry so a raw
// CUevent later returned by the driver (e.g. from a graph node) can be
// mapped back to this handle by create_event_handle_ref.
// Returns an empty handle on driver failure (error left in thread-local err).
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags,
                                bool timing_disabled, bool busy_waited,
                                bool ipc_enabled, int device_id) {
    GILReleaseGuard gil;
    CUevent event;
    if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) {
        return {};
    }

    auto box = std::shared_ptr<const EventBox>(
        new EventBox{event, timing_disabled, busy_waited, ipc_enabled, device_id, h_ctx},
        // h_ctx is captured (as well as stored in the box) so the context is
        // guaranteed alive while cuEventDestroy runs.
        [h_ctx](const EventBox* b) {
            event_registry.unregister_handle(b->resource);
            GILReleaseGuard gil;
            p_cuEventDestroy(b->resource);
            delete b;
        }
    );
    EventHandle h(box, &box->resource);
    event_registry.register_handle(event, h);
    return h;
}

// Convenience wrapper: create an event with no recorded context and default
// metadata (timing disabled, not busy-waited, no IPC, unknown device).
EventHandle create_event_handle_noctx(unsigned int flags) {
    return create_event_handle(ContextHandle{}, flags, true, false, false, -1);
}

// Wrap a driver-returned CUevent.  When the event was created through this
// module, the registry lookup recovers the original owning handle with its
// full metadata; otherwise a non-owning box with default metadata is built,
// and releasing it does not destroy the event.
EventHandle create_event_handle_ref(CUevent event) {
    EventHandle known = event_registry.lookup(event);
    if (known) {
        return known;
    }
    // Unknown event: default metadata, no cuEventDestroy on release.
    auto box = std::make_shared<const EventBox>(
        EventBox{event, true, false, false, -1, {}});
    return EventHandle(box, &box->resource);
}

EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) {
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle,
bool busy_waited) {
GILReleaseGuard gil;
CUevent event;
if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) {
return {};
}

auto box = std::shared_ptr<const EventBox>(
new EventBox{event},
new EventBox{event, true, busy_waited, true, -1, {}},
[](const EventBox* b) {
event_registry.unregister_handle(b->resource);
GILReleaseGuard gil;
p_cuEventDestroy(b->resource);
delete b;
}
);
return EventHandle(box, &box->resource);
EventHandle h(box, &box->resource);
event_registry.register_handle(event, h);
return h;
}

// ============================================================================
Expand Down Expand Up @@ -665,61 +762,43 @@ struct ExportDataKeyHash {

}

// Cache of imported IPC device pointers, keyed by the driver export data, so
// repeated imports of the same allocation share one handle/mapping.
static HandleRegistry<ExportDataKey, DevicePtrHandle, ExportDataKeyHash> ipc_ptr_cache;
// Serializes the lookup-then-import sequence in deviceptr_import_ipc so two
// threads cannot both miss the cache and import the same pointer twice.
static std::mutex ipc_import_mutex;

DevicePtrHandle deviceptr_import_ipc(const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) {
auto data = const_cast<CUmemPoolPtrExportData*>(
reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));

if (use_ipc_ptr_cache()) {
// Check cache before calling cuMemPoolImportPointer
ExportDataKey key;
std::memcpy(&key.data, data, sizeof(key.data));

std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
std::lock_guard<std::mutex> lock(ipc_import_mutex);

auto it = ipc_ptr_cache.find(key);
if (it != ipc_ptr_cache.end()) {
if (auto box = it->second.lock()) {
// Cache hit - return existing handle
return DevicePtrHandle(box, &box->resource);
}
ipc_ptr_cache.erase(it); // Expired entry
if (auto h = ipc_ptr_cache.lookup(key)) {
return h;
}

// Cache miss - import the pointer
GILReleaseGuard gil;
CUdeviceptr ptr;
if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
return {};
}

// Create new handle with cache-clearing deleter
auto box = std::shared_ptr<DevicePtrBox>(
new DevicePtrBox{ptr, h_stream},
[h_pool, key](DevicePtrBox* b) {
ipc_ptr_cache.unregister_handle(key);
GILReleaseGuard gil;
try {
std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
// Only erase if expired - avoids race where another thread
// replaced the entry with a new import before we acquired the lock.
auto it = ipc_ptr_cache.find(key);
if (it != ipc_ptr_cache.end() && it->second.expired()) {
ipc_ptr_cache.erase(it);
}
} catch (...) {
// Cache cleanup is best-effort - swallow exceptions in destructor context
}
p_cuMemFreeAsync(b->resource, as_cu(b->h_stream));
delete b;
}
);
ipc_ptr_cache[key] = box;
return DevicePtrHandle(box, &box->resource);
DevicePtrHandle h(box, &box->resource);
ipc_ptr_cache.register_handle(key, h);
return h;

} else {
// No caching - simple handle creation
GILReleaseGuard gil;
CUdeviceptr ptr;
if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
Expand Down Expand Up @@ -798,25 +877,96 @@ LibraryHandle create_library_handle_ref(CUlibrary library) {
namespace {
// Backing storage for a KernelHandle.  Holds the library handle so the
// owning CUlibrary outlives every kernel obtained from it; may be empty for
// kernels wrapped by create_kernel_handle_ref without a known library.
struct KernelBox {
    CUkernel resource;
    LibraryHandle h_library;  // keeps the owning library alive
};
} // namespace

// Recover the owning KernelBox from a KernelHandle (aliasing shared_ptr whose
// stored pointer is &KernelBox::resource) via container-of arithmetic.
// NOTE(review): relies on KernelBox being standard-layout for offsetof.
static const KernelBox* get_box(const KernelHandle& h) {
    const CUkernel* p = h.get();
    return reinterpret_cast<const KernelBox*>(
        reinterpret_cast<const char*>(p) - offsetof(KernelBox, resource)
    );
}

static HandleRegistry<CUkernel, KernelHandle> kernel_registry;

// Look up a kernel by name in a loaded library and wrap it in a handle that
// keeps the library alive.  The handle is entered into kernel_registry so a
// raw CUkernel returned by the driver can be mapped back to it by
// create_kernel_handle_ref.
// Returns an empty handle on driver failure (error left in thread-local err).
KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name) {
    GILReleaseGuard gil;
    CUkernel kernel;
    if (CUDA_SUCCESS != (err = p_cuLibraryGetKernel(&kernel, *h_library, name))) {
        return {};
    }

    // Custom deleter mirrors the event path: remove our (now expired)
    // registry entry on destruction so stale weak_ptr entries do not
    // accumulate in kernel_registry.
    auto box = std::shared_ptr<const KernelBox>(
        new KernelBox{kernel, h_library},
        [](const KernelBox* b) {
            kernel_registry.unregister_handle(b->resource);
            delete b;
        }
    );
    KernelHandle h(box, &box->resource);
    kernel_registry.register_handle(kernel, h);
    return h;
}

// Wrap a driver-returned CUkernel.  When the kernel was obtained through
// create_kernel_handle, the registry lookup recovers the original handle,
// which keeps the owning library alive.  Otherwise the wrapper carries no
// library reference.
// NOTE(review): for unregistered kernels the caller must guarantee the
// library outlives the returned handle — confirm call sites.
KernelHandle create_kernel_handle_ref(CUkernel kernel) {
    if (auto h = kernel_registry.lookup(kernel)) {
        return h;
    }
    auto box = std::make_shared<const KernelBox>(KernelBox{kernel, {}});
    return KernelHandle(box, &box->resource);
}

// Return the library recorded in the kernel's box, or an empty handle when
// the kernel handle itself is empty (or carries no library).
LibraryHandle get_kernel_library(const KernelHandle& h) noexcept {
    return h ? get_box(h)->h_library : LibraryHandle{};
}

// ============================================================================
// Graph Handles
// ============================================================================

namespace {
// Backing storage for a GraphHandle.  For child/branch graphs the parent
// handle is held so the parent cannot be destroyed first; for owning
// top-level graphs h_parent is empty.
struct GraphBox {
    CUgraph resource;
    GraphHandle h_parent; // Keeps parent alive for child/branch graphs
};
} // namespace

// Take ownership of a CUgraph: the returned handle destroys the graph via
// cuGraphDestroy when the last reference drops.  No parent is recorded.
GraphHandle create_graph_handle(CUgraph graph) {
    auto destroy = [](const GraphBox* box) {
        GILReleaseGuard gil;  // never call into the driver holding the GIL
        p_cuGraphDestroy(box->resource);
        delete box;
    };
    auto box = std::shared_ptr<const GraphBox>(new GraphBox{graph, {}}, destroy);
    return GraphHandle(box, &box->resource);
}

// Wrap a CUgraph without taking ownership (no cuGraphDestroy on release).
// The parent handle is retained so the parent graph stays alive at least as
// long as this reference.
GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent) {
    std::shared_ptr<const GraphBox> box =
        std::make_shared<const GraphBox>(GraphBox{graph, h_parent});
    return GraphHandle(box, &box->resource);
}

namespace {
// Backing storage for a GraphNodeHandle.  Holds the graph handle so the
// graph containing this node stays alive while the node handle exists.
struct GraphNodeBox {
    CUgraphNode resource;
    GraphHandle h_graph;
};
} // namespace

// Recover the owning GraphNodeBox from a GraphNodeHandle (aliasing shared_ptr
// whose stored pointer is &GraphNodeBox::resource) via container-of
// arithmetic.  NOTE(review): relies on GraphNodeBox being standard-layout.
static const GraphNodeBox* get_box(const GraphNodeHandle& h) {
    const CUgraphNode* p = h.get();
    return reinterpret_cast<const GraphNodeBox*>(
        reinterpret_cast<const char*>(p) - offsetof(GraphNodeBox, resource)
    );
}

// Wrap a CUgraphNode, retaining its graph so the graph outlives the node
// handle.  Node handles are non-owning: the driver destroys nodes with the
// graph, so no custom deleter is needed.
GraphNodeHandle create_graph_node_handle(CUgraphNode node, const GraphHandle& h_graph) {
    std::shared_ptr<const GraphNodeBox> box =
        std::make_shared<const GraphNodeBox>(GraphNodeBox{node, h_graph});
    return GraphNodeHandle(box, &box->resource);
}

// Return the graph recorded in the node's box, or an empty handle when the
// node handle itself is empty.
GraphHandle graph_node_get_graph(const GraphNodeHandle& h) noexcept {
    if (!h) {
        return GraphHandle{};
    }
    return get_box(h)->h_graph;
}

// ============================================================================
// Graphics Resource Handles
// ============================================================================
Expand Down
Loading