From 42b7e3b73e5aaa02cb0ce29210eade1a0493a958 Mon Sep 17 00:00:00 2001 From: Nikshay Shrivastava Date: Wed, 25 Feb 2026 00:00:30 -0800 Subject: [PATCH 1/4] Add cuDLA bindings Generated from cudla.h using cybind. Files added: - cycudla.pxd/pyx: Cython layer exposing C header types and functions - cudla.pxd/pyx: lowpp Python layer with POD classes, enums, and wrappers - _internal/cudla.pxd, cudla_linux.pyx, cudla_windows.pyx: dynamic library loading - docs/source/module/cudla.rst: API documentation - tests/cudla/: pytest unit tests for enums, POD types, error handling, API surface, and hardware-gated function tests (verified on L4T/Orin) Build/CI updates: - pyproject.toml: added cudla to cuda-toolkit optional dependencies - .github/actions/fetch_ctk/action.yml: added libcudla to CTK components - docs/source/api.rst: added cudla to toctree --- .github/actions/fetch_ctk/action.yml | 2 +- .../cuda/bindings/_internal/cudla.pxd | 22 + .../cuda/bindings/_internal/cudla_linux.pyx | 382 ++++ .../cuda/bindings/_internal/cudla_windows.pyx | 345 ++++ cuda_bindings/cuda/bindings/cudla.pxd | 49 + cuda_bindings/cuda/bindings/cudla.pyx | 1826 +++++++++++++++++ cuda_bindings/cuda/bindings/cycudla.pxd | 155 ++ cuda_bindings/cuda/bindings/cycudla.pyx | 61 + cuda_bindings/docs/source/api.rst | 1 + cuda_bindings/docs/source/module/cudla.rst | 67 + cuda_bindings/pyproject.toml | 2 +- cuda_bindings/tests/cudla/conftest.py | 13 + .../tests/cudla/test_cudla_bindings.py | 305 +++ 13 files changed, 3228 insertions(+), 2 deletions(-) create mode 100644 cuda_bindings/cuda/bindings/_internal/cudla.pxd create mode 100644 cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx create mode 100644 cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx create mode 100644 cuda_bindings/cuda/bindings/cudla.pxd create mode 100644 cuda_bindings/cuda/bindings/cudla.pyx create mode 100644 cuda_bindings/cuda/bindings/cycudla.pxd create mode 100644 cuda_bindings/cuda/bindings/cycudla.pyx create mode 100644 
cuda_bindings/docs/source/module/cudla.rst create mode 100644 cuda_bindings/tests/cudla/conftest.py create mode 100644 cuda_bindings/tests/cudla/test_cudla_bindings.py diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index a150628e0f9..d0f088e47bd 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -14,7 +14,7 @@ inputs: cuda-components: description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'" required: false - default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin" + default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin,libcudla" cuda-path: description: "where the CTK components will be installed to, relative to $PWD" required: false diff --git a/cuda_bindings/cuda/bindings/_internal/cudla.pxd b/cuda_bindings/cuda/bindings/_internal/cudla.pxd new file mode 100644 index 00000000000..57fef6a7323 --- /dev/null +++ b/cuda_bindings/cuda/bindings/_internal/cudla.pxd @@ -0,0 +1,22 @@ +# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. 
+ +from ..cycudla cimport * + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cudlaStatus _cudlaGetVersion(uint64_t* const version) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaDeviceGetCount(uint64_t* const pNumDevices) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaCreateDevice(const uint64_t device, cudlaDevHandle* const devHandle, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaMemRegister(const cudlaDevHandle devHandle, const uint64_t* const ptr, const size_t size, uint64_t** const devPtr, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaModuleLoadFromMemory(const cudlaDevHandle devHandle, const uint8_t* const pModule, const size_t moduleSize, cudlaModule* const hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaModuleGetAttributes(const cudlaModule hModule, const cudlaModuleAttributeType attrType, cudlaModuleAttribute* const attribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaModuleUnload(const cudlaModule hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaSubmitTask(const cudlaDevHandle devHandle, const cudlaTask* const ptrToTasks, const uint32_t numTasks, void* const stream, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaDeviceGetAttribute(const cudlaDevHandle devHandle, const cudlaDevAttributeType attrib, cudlaDevAttribute* const pAttribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaMemUnregister(const cudlaDevHandle devHandle, const uint64_t* const devPtr) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaGetLastError(const 
cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaDestroyDevice(const cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus _cudlaSetTaskTimeoutInMs(const cudlaDevHandle devHandle, const uint32_t timeout) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil diff --git a/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx b/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx new file mode 100644 index 00000000000..6a0f4f2984b --- /dev/null +++ b/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx @@ -0,0 +1,382 @@ +# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. + +from libc.stdint cimport intptr_t, uintptr_t + +import threading +from .utils import FunctionNotFoundError, NotSupportedError + +from cuda.pathfinder import load_nvidia_dynamic_lib + + +############################################################################### +# Extern +############################################################################### + +# You must 'from .utils import NotSupportedError' before using this template + +cdef extern from "" nogil: + void* dlopen(const char*, int) + char* dlerror() + void* dlsym(void*, const char*) + int dlclose(void*) + + enum: + RTLD_LAZY + RTLD_NOW + RTLD_GLOBAL + RTLD_LOCAL + + const void* RTLD_DEFAULT 'RTLD_DEFAULT' + +cdef int get_cuda_version(): + cdef void* handle = NULL + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + err_msg = dlerror() + raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})') + cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion") + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion 
returned error code {err}') + + return driver_ver + + + +############################################################################### +# Wrapper init +############################################################################### + +cdef object __symbol_lock = threading.Lock() +cdef bint __py_cudla_init = False + +cdef void* __cudlaGetVersion = NULL +cdef void* __cudlaDeviceGetCount = NULL +cdef void* __cudlaCreateDevice = NULL +cdef void* __cudlaMemRegister = NULL +cdef void* __cudlaModuleLoadFromMemory = NULL +cdef void* __cudlaModuleGetAttributes = NULL +cdef void* __cudlaModuleUnload = NULL +cdef void* __cudlaSubmitTask = NULL +cdef void* __cudlaDeviceGetAttribute = NULL +cdef void* __cudlaMemUnregister = NULL +cdef void* __cudlaGetLastError = NULL +cdef void* __cudlaDestroyDevice = NULL +cdef void* __cudlaSetTaskTimeoutInMs = NULL + + +cdef void* load_library() except* with gil: + cdef uintptr_t handle = load_nvidia_dynamic_lib("cudla")._handle_uint + return handle + + +cdef int _init_cudla() except -1 nogil: + global __py_cudla_init + cdef void* handle = NULL + + with gil, __symbol_lock: + # Recheck the flag after obtaining the locks + if __py_cudla_init: + return 0 + + # Load function + global __cudlaGetVersion + __cudlaGetVersion = dlsym(RTLD_DEFAULT, 'cudlaGetVersion') + if __cudlaGetVersion == NULL: + if handle == NULL: + handle = load_library() + __cudlaGetVersion = dlsym(handle, 'cudlaGetVersion') + + global __cudlaDeviceGetCount + __cudlaDeviceGetCount = dlsym(RTLD_DEFAULT, 'cudlaDeviceGetCount') + if __cudlaDeviceGetCount == NULL: + if handle == NULL: + handle = load_library() + __cudlaDeviceGetCount = dlsym(handle, 'cudlaDeviceGetCount') + + global __cudlaCreateDevice + __cudlaCreateDevice = dlsym(RTLD_DEFAULT, 'cudlaCreateDevice') + if __cudlaCreateDevice == NULL: + if handle == NULL: + handle = load_library() + __cudlaCreateDevice = dlsym(handle, 'cudlaCreateDevice') + + global __cudlaMemRegister + __cudlaMemRegister = dlsym(RTLD_DEFAULT, 
'cudlaMemRegister') + if __cudlaMemRegister == NULL: + if handle == NULL: + handle = load_library() + __cudlaMemRegister = dlsym(handle, 'cudlaMemRegister') + + global __cudlaModuleLoadFromMemory + __cudlaModuleLoadFromMemory = dlsym(RTLD_DEFAULT, 'cudlaModuleLoadFromMemory') + if __cudlaModuleLoadFromMemory == NULL: + if handle == NULL: + handle = load_library() + __cudlaModuleLoadFromMemory = dlsym(handle, 'cudlaModuleLoadFromMemory') + + global __cudlaModuleGetAttributes + __cudlaModuleGetAttributes = dlsym(RTLD_DEFAULT, 'cudlaModuleGetAttributes') + if __cudlaModuleGetAttributes == NULL: + if handle == NULL: + handle = load_library() + __cudlaModuleGetAttributes = dlsym(handle, 'cudlaModuleGetAttributes') + + global __cudlaModuleUnload + __cudlaModuleUnload = dlsym(RTLD_DEFAULT, 'cudlaModuleUnload') + if __cudlaModuleUnload == NULL: + if handle == NULL: + handle = load_library() + __cudlaModuleUnload = dlsym(handle, 'cudlaModuleUnload') + + global __cudlaSubmitTask + __cudlaSubmitTask = dlsym(RTLD_DEFAULT, 'cudlaSubmitTask') + if __cudlaSubmitTask == NULL: + if handle == NULL: + handle = load_library() + __cudlaSubmitTask = dlsym(handle, 'cudlaSubmitTask') + + global __cudlaDeviceGetAttribute + __cudlaDeviceGetAttribute = dlsym(RTLD_DEFAULT, 'cudlaDeviceGetAttribute') + if __cudlaDeviceGetAttribute == NULL: + if handle == NULL: + handle = load_library() + __cudlaDeviceGetAttribute = dlsym(handle, 'cudlaDeviceGetAttribute') + + global __cudlaMemUnregister + __cudlaMemUnregister = dlsym(RTLD_DEFAULT, 'cudlaMemUnregister') + if __cudlaMemUnregister == NULL: + if handle == NULL: + handle = load_library() + __cudlaMemUnregister = dlsym(handle, 'cudlaMemUnregister') + + global __cudlaGetLastError + __cudlaGetLastError = dlsym(RTLD_DEFAULT, 'cudlaGetLastError') + if __cudlaGetLastError == NULL: + if handle == NULL: + handle = load_library() + __cudlaGetLastError = dlsym(handle, 'cudlaGetLastError') + + global __cudlaDestroyDevice + __cudlaDestroyDevice = 
dlsym(RTLD_DEFAULT, 'cudlaDestroyDevice') + if __cudlaDestroyDevice == NULL: + if handle == NULL: + handle = load_library() + __cudlaDestroyDevice = dlsym(handle, 'cudlaDestroyDevice') + + global __cudlaSetTaskTimeoutInMs + __cudlaSetTaskTimeoutInMs = dlsym(RTLD_DEFAULT, 'cudlaSetTaskTimeoutInMs') + if __cudlaSetTaskTimeoutInMs == NULL: + if handle == NULL: + handle = load_library() + __cudlaSetTaskTimeoutInMs = dlsym(handle, 'cudlaSetTaskTimeoutInMs') + + __py_cudla_init = True + return 0 + + +cdef inline int _check_or_init_cudla() except -1 nogil: + if __py_cudla_init: + return 0 + + return _init_cudla() + + +cdef dict func_ptrs = None + + +cpdef dict _inspect_function_pointers(): + global func_ptrs + if func_ptrs is not None: + return func_ptrs + + _check_or_init_cudla() + cdef dict data = {} + + global __cudlaGetVersion + data["__cudlaGetVersion"] = __cudlaGetVersion + + global __cudlaDeviceGetCount + data["__cudlaDeviceGetCount"] = __cudlaDeviceGetCount + + global __cudlaCreateDevice + data["__cudlaCreateDevice"] = __cudlaCreateDevice + + global __cudlaMemRegister + data["__cudlaMemRegister"] = __cudlaMemRegister + + global __cudlaModuleLoadFromMemory + data["__cudlaModuleLoadFromMemory"] = __cudlaModuleLoadFromMemory + + global __cudlaModuleGetAttributes + data["__cudlaModuleGetAttributes"] = __cudlaModuleGetAttributes + + global __cudlaModuleUnload + data["__cudlaModuleUnload"] = __cudlaModuleUnload + + global __cudlaSubmitTask + data["__cudlaSubmitTask"] = __cudlaSubmitTask + + global __cudlaDeviceGetAttribute + data["__cudlaDeviceGetAttribute"] = __cudlaDeviceGetAttribute + + global __cudlaMemUnregister + data["__cudlaMemUnregister"] = __cudlaMemUnregister + + global __cudlaGetLastError + data["__cudlaGetLastError"] = __cudlaGetLastError + + global __cudlaDestroyDevice + data["__cudlaDestroyDevice"] = __cudlaDestroyDevice + + global __cudlaSetTaskTimeoutInMs + data["__cudlaSetTaskTimeoutInMs"] = __cudlaSetTaskTimeoutInMs + + func_ptrs = data + return data 
+ + +cpdef _inspect_function_pointer(str name): + global func_ptrs + if func_ptrs is None: + func_ptrs = _inspect_function_pointers() + return func_ptrs[name] + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cudlaStatus _cudlaGetVersion(uint64_t* const version) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaGetVersion + _check_or_init_cudla() + if __cudlaGetVersion == NULL: + with gil: + raise FunctionNotFoundError("function cudlaGetVersion is not found") + return (__cudlaGetVersion)( + version) + + +cdef cudlaStatus _cudlaDeviceGetCount(uint64_t* const pNumDevices) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaDeviceGetCount + _check_or_init_cudla() + if __cudlaDeviceGetCount == NULL: + with gil: + raise FunctionNotFoundError("function cudlaDeviceGetCount is not found") + return (__cudlaDeviceGetCount)( + pNumDevices) + + +cdef cudlaStatus _cudlaCreateDevice(const uint64_t device, cudlaDevHandle* const devHandle, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaCreateDevice + _check_or_init_cudla() + if __cudlaCreateDevice == NULL: + with gil: + raise FunctionNotFoundError("function cudlaCreateDevice is not found") + return (__cudlaCreateDevice)( + device, devHandle, flags) + + +cdef cudlaStatus _cudlaMemRegister(const cudlaDevHandle devHandle, const uint64_t* const ptr, const size_t size, uint64_t** const devPtr, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaMemRegister + _check_or_init_cudla() + if __cudlaMemRegister == NULL: + with gil: + raise FunctionNotFoundError("function cudlaMemRegister is not found") + return (__cudlaMemRegister)( + devHandle, ptr, size, devPtr, flags) + + +cdef cudlaStatus _cudlaModuleLoadFromMemory(const cudlaDevHandle devHandle, const uint8_t* const pModule, const size_t 
moduleSize, cudlaModule* const hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaModuleLoadFromMemory + _check_or_init_cudla() + if __cudlaModuleLoadFromMemory == NULL: + with gil: + raise FunctionNotFoundError("function cudlaModuleLoadFromMemory is not found") + return (__cudlaModuleLoadFromMemory)( + devHandle, pModule, moduleSize, hModule, flags) + + +cdef cudlaStatus _cudlaModuleGetAttributes(const cudlaModule hModule, const cudlaModuleAttributeType attrType, cudlaModuleAttribute* const attribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaModuleGetAttributes + _check_or_init_cudla() + if __cudlaModuleGetAttributes == NULL: + with gil: + raise FunctionNotFoundError("function cudlaModuleGetAttributes is not found") + return (__cudlaModuleGetAttributes)( + hModule, attrType, attribute) + + +cdef cudlaStatus _cudlaModuleUnload(const cudlaModule hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaModuleUnload + _check_or_init_cudla() + if __cudlaModuleUnload == NULL: + with gil: + raise FunctionNotFoundError("function cudlaModuleUnload is not found") + return (__cudlaModuleUnload)( + hModule, flags) + + +cdef cudlaStatus _cudlaSubmitTask(const cudlaDevHandle devHandle, const cudlaTask* const ptrToTasks, const uint32_t numTasks, void* const stream, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaSubmitTask + _check_or_init_cudla() + if __cudlaSubmitTask == NULL: + with gil: + raise FunctionNotFoundError("function cudlaSubmitTask is not found") + return (__cudlaSubmitTask)( + devHandle, ptrToTasks, numTasks, stream, flags) + + +cdef cudlaStatus _cudlaDeviceGetAttribute(const cudlaDevHandle devHandle, const cudlaDevAttributeType attrib, cudlaDevAttribute* const pAttribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaDeviceGetAttribute + _check_or_init_cudla() + if __cudlaDeviceGetAttribute == 
NULL: + with gil: + raise FunctionNotFoundError("function cudlaDeviceGetAttribute is not found") + return (__cudlaDeviceGetAttribute)( + devHandle, attrib, pAttribute) + + +cdef cudlaStatus _cudlaMemUnregister(const cudlaDevHandle devHandle, const uint64_t* const devPtr) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaMemUnregister + _check_or_init_cudla() + if __cudlaMemUnregister == NULL: + with gil: + raise FunctionNotFoundError("function cudlaMemUnregister is not found") + return (__cudlaMemUnregister)( + devHandle, devPtr) + + +cdef cudlaStatus _cudlaGetLastError(const cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaGetLastError + _check_or_init_cudla() + if __cudlaGetLastError == NULL: + with gil: + raise FunctionNotFoundError("function cudlaGetLastError is not found") + return (__cudlaGetLastError)( + devHandle) + + +cdef cudlaStatus _cudlaDestroyDevice(const cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaDestroyDevice + _check_or_init_cudla() + if __cudlaDestroyDevice == NULL: + with gil: + raise FunctionNotFoundError("function cudlaDestroyDevice is not found") + return (__cudlaDestroyDevice)( + devHandle) + + +cdef cudlaStatus _cudlaSetTaskTimeoutInMs(const cudlaDevHandle devHandle, const uint32_t timeout) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaSetTaskTimeoutInMs + _check_or_init_cudla() + if __cudlaSetTaskTimeoutInMs == NULL: + with gil: + raise FunctionNotFoundError("function cudlaSetTaskTimeoutInMs is not found") + return (__cudlaSetTaskTimeoutInMs)( + devHandle, timeout) diff --git a/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx b/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx new file mode 100644 index 00000000000..2cddfd6d19e --- /dev/null +++ b/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx @@ -0,0 +1,345 @@ +# This code was automatically generated with version 1.5.0, generator version 
0.3.1.dev1465+gc5c5c8652. Do not modify it directly. + +from libc.stdint cimport intptr_t + +import threading +from .utils import FunctionNotFoundError, NotSupportedError + +from cuda.pathfinder import load_nvidia_dynamic_lib + +from libc.stddef cimport wchar_t +from libc.stdint cimport uintptr_t +from cpython cimport PyUnicode_AsWideCharString, PyMem_Free + +# You must 'from .utils import NotSupportedError' before using this template + +cdef extern from "windows.h" nogil: + ctypedef void* HMODULE + ctypedef void* HANDLE + ctypedef void* FARPROC + ctypedef unsigned long DWORD + ctypedef const wchar_t *LPCWSTR + ctypedef const char *LPCSTR + + cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 + cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 + cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 + + HMODULE _LoadLibraryExW "LoadLibraryExW"( + LPCWSTR lpLibFileName, + HANDLE hFile, + DWORD dwFlags + ) + + FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName) + +cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags): + cdef uintptr_t result + cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL) + with nogil: + result = _LoadLibraryExW( + wpath, + hFile, + dwFlags + ) + PyMem_Free(wpath) + return result + +cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil: + return _GetProcAddress(hModule, lpProcName) + +cdef int get_cuda_version(): + cdef int err, driver_ver = 0 + + # Load driver to check version + handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32) + if handle == 0: + raise NotSupportedError('CUDA driver is not found') + cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion') + if cuDriverGetVersion == NULL: + raise RuntimeError('Did not find cuDriverGetVersion symbol in nvcuda.dll') + err = (cuDriverGetVersion)(&driver_ver) + if err != 0: + raise RuntimeError(f'cuDriverGetVersion returned error code {err}') + + return driver_ver + + + 
+############################################################################### +# Wrapper init +############################################################################### + +cdef object __symbol_lock = threading.Lock() +cdef bint __py_cudla_init = False + +cdef void* __cudlaGetVersion = NULL +cdef void* __cudlaDeviceGetCount = NULL +cdef void* __cudlaCreateDevice = NULL +cdef void* __cudlaMemRegister = NULL +cdef void* __cudlaModuleLoadFromMemory = NULL +cdef void* __cudlaModuleGetAttributes = NULL +cdef void* __cudlaModuleUnload = NULL +cdef void* __cudlaSubmitTask = NULL +cdef void* __cudlaDeviceGetAttribute = NULL +cdef void* __cudlaMemUnregister = NULL +cdef void* __cudlaGetLastError = NULL +cdef void* __cudlaDestroyDevice = NULL +cdef void* __cudlaSetTaskTimeoutInMs = NULL + + +cdef int _init_cudla() except -1 nogil: + global __py_cudla_init + + with gil, __symbol_lock: + # Recheck the flag after obtaining the locks + if __py_cudla_init: + return 0 + + # Load library + handle = load_nvidia_dynamic_lib("cudla")._handle_uint + + # Load function + global __cudlaGetVersion + __cudlaGetVersion = GetProcAddress(handle, 'cudlaGetVersion') + + global __cudlaDeviceGetCount + __cudlaDeviceGetCount = GetProcAddress(handle, 'cudlaDeviceGetCount') + + global __cudlaCreateDevice + __cudlaCreateDevice = GetProcAddress(handle, 'cudlaCreateDevice') + + global __cudlaMemRegister + __cudlaMemRegister = GetProcAddress(handle, 'cudlaMemRegister') + + global __cudlaModuleLoadFromMemory + __cudlaModuleLoadFromMemory = GetProcAddress(handle, 'cudlaModuleLoadFromMemory') + + global __cudlaModuleGetAttributes + __cudlaModuleGetAttributes = GetProcAddress(handle, 'cudlaModuleGetAttributes') + + global __cudlaModuleUnload + __cudlaModuleUnload = GetProcAddress(handle, 'cudlaModuleUnload') + + global __cudlaSubmitTask + __cudlaSubmitTask = GetProcAddress(handle, 'cudlaSubmitTask') + + global __cudlaDeviceGetAttribute + __cudlaDeviceGetAttribute = GetProcAddress(handle, 
'cudlaDeviceGetAttribute') + + global __cudlaMemUnregister + __cudlaMemUnregister = GetProcAddress(handle, 'cudlaMemUnregister') + + global __cudlaGetLastError + __cudlaGetLastError = GetProcAddress(handle, 'cudlaGetLastError') + + global __cudlaDestroyDevice + __cudlaDestroyDevice = GetProcAddress(handle, 'cudlaDestroyDevice') + + global __cudlaSetTaskTimeoutInMs + __cudlaSetTaskTimeoutInMs = GetProcAddress(handle, 'cudlaSetTaskTimeoutInMs') + + __py_cudla_init = True + return 0 + + +cdef inline int _check_or_init_cudla() except -1 nogil: + if __py_cudla_init: + return 0 + + return _init_cudla() + + +cdef dict func_ptrs = None + + +cpdef dict _inspect_function_pointers(): + global func_ptrs + if func_ptrs is not None: + return func_ptrs + + _check_or_init_cudla() + cdef dict data = {} + + global __cudlaGetVersion + data["__cudlaGetVersion"] = __cudlaGetVersion + + global __cudlaDeviceGetCount + data["__cudlaDeviceGetCount"] = __cudlaDeviceGetCount + + global __cudlaCreateDevice + data["__cudlaCreateDevice"] = __cudlaCreateDevice + + global __cudlaMemRegister + data["__cudlaMemRegister"] = __cudlaMemRegister + + global __cudlaModuleLoadFromMemory + data["__cudlaModuleLoadFromMemory"] = __cudlaModuleLoadFromMemory + + global __cudlaModuleGetAttributes + data["__cudlaModuleGetAttributes"] = __cudlaModuleGetAttributes + + global __cudlaModuleUnload + data["__cudlaModuleUnload"] = __cudlaModuleUnload + + global __cudlaSubmitTask + data["__cudlaSubmitTask"] = __cudlaSubmitTask + + global __cudlaDeviceGetAttribute + data["__cudlaDeviceGetAttribute"] = __cudlaDeviceGetAttribute + + global __cudlaMemUnregister + data["__cudlaMemUnregister"] = __cudlaMemUnregister + + global __cudlaGetLastError + data["__cudlaGetLastError"] = __cudlaGetLastError + + global __cudlaDestroyDevice + data["__cudlaDestroyDevice"] = __cudlaDestroyDevice + + global __cudlaSetTaskTimeoutInMs + data["__cudlaSetTaskTimeoutInMs"] = __cudlaSetTaskTimeoutInMs + + func_ptrs = data + return data + + +cpdef 
_inspect_function_pointer(str name): + global func_ptrs + if func_ptrs is None: + func_ptrs = _inspect_function_pointers() + return func_ptrs[name] + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cudlaStatus _cudlaGetVersion(uint64_t* const version) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaGetVersion + _check_or_init_cudla() + if __cudlaGetVersion == NULL: + with gil: + raise FunctionNotFoundError("function cudlaGetVersion is not found") + return (__cudlaGetVersion)( + version) + + +cdef cudlaStatus _cudlaDeviceGetCount(uint64_t* const pNumDevices) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaDeviceGetCount + _check_or_init_cudla() + if __cudlaDeviceGetCount == NULL: + with gil: + raise FunctionNotFoundError("function cudlaDeviceGetCount is not found") + return (__cudlaDeviceGetCount)( + pNumDevices) + + +cdef cudlaStatus _cudlaCreateDevice(const uint64_t device, cudlaDevHandle* const devHandle, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaCreateDevice + _check_or_init_cudla() + if __cudlaCreateDevice == NULL: + with gil: + raise FunctionNotFoundError("function cudlaCreateDevice is not found") + return (__cudlaCreateDevice)( + device, devHandle, flags) + + +cdef cudlaStatus _cudlaMemRegister(const cudlaDevHandle devHandle, const uint64_t* const ptr, const size_t size, uint64_t** const devPtr, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaMemRegister + _check_or_init_cudla() + if __cudlaMemRegister == NULL: + with gil: + raise FunctionNotFoundError("function cudlaMemRegister is not found") + return (__cudlaMemRegister)( + devHandle, ptr, size, devPtr, flags) + + +cdef cudlaStatus _cudlaModuleLoadFromMemory(const cudlaDevHandle devHandle, const uint8_t* const pModule, const size_t moduleSize, 
cudlaModule* const hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaModuleLoadFromMemory + _check_or_init_cudla() + if __cudlaModuleLoadFromMemory == NULL: + with gil: + raise FunctionNotFoundError("function cudlaModuleLoadFromMemory is not found") + return (__cudlaModuleLoadFromMemory)( + devHandle, pModule, moduleSize, hModule, flags) + + +cdef cudlaStatus _cudlaModuleGetAttributes(const cudlaModule hModule, const cudlaModuleAttributeType attrType, cudlaModuleAttribute* const attribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaModuleGetAttributes + _check_or_init_cudla() + if __cudlaModuleGetAttributes == NULL: + with gil: + raise FunctionNotFoundError("function cudlaModuleGetAttributes is not found") + return (__cudlaModuleGetAttributes)( + hModule, attrType, attribute) + + +cdef cudlaStatus _cudlaModuleUnload(const cudlaModule hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaModuleUnload + _check_or_init_cudla() + if __cudlaModuleUnload == NULL: + with gil: + raise FunctionNotFoundError("function cudlaModuleUnload is not found") + return (__cudlaModuleUnload)( + hModule, flags) + + +cdef cudlaStatus _cudlaSubmitTask(const cudlaDevHandle devHandle, const cudlaTask* const ptrToTasks, const uint32_t numTasks, void* const stream, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaSubmitTask + _check_or_init_cudla() + if __cudlaSubmitTask == NULL: + with gil: + raise FunctionNotFoundError("function cudlaSubmitTask is not found") + return (__cudlaSubmitTask)( + devHandle, ptrToTasks, numTasks, stream, flags) + + +cdef cudlaStatus _cudlaDeviceGetAttribute(const cudlaDevHandle devHandle, const cudlaDevAttributeType attrib, cudlaDevAttribute* const pAttribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaDeviceGetAttribute + _check_or_init_cudla() + if __cudlaDeviceGetAttribute == NULL: + with 
gil: + raise FunctionNotFoundError("function cudlaDeviceGetAttribute is not found") + return (__cudlaDeviceGetAttribute)( + devHandle, attrib, pAttribute) + + +cdef cudlaStatus _cudlaMemUnregister(const cudlaDevHandle devHandle, const uint64_t* const devPtr) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaMemUnregister + _check_or_init_cudla() + if __cudlaMemUnregister == NULL: + with gil: + raise FunctionNotFoundError("function cudlaMemUnregister is not found") + return (__cudlaMemUnregister)( + devHandle, devPtr) + + +cdef cudlaStatus _cudlaGetLastError(const cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaGetLastError + _check_or_init_cudla() + if __cudlaGetLastError == NULL: + with gil: + raise FunctionNotFoundError("function cudlaGetLastError is not found") + return (__cudlaGetLastError)( + devHandle) + + +cdef cudlaStatus _cudlaDestroyDevice(const cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaDestroyDevice + _check_or_init_cudla() + if __cudlaDestroyDevice == NULL: + with gil: + raise FunctionNotFoundError("function cudlaDestroyDevice is not found") + return (__cudlaDestroyDevice)( + devHandle) + + +cdef cudlaStatus _cudlaSetTaskTimeoutInMs(const cudlaDevHandle devHandle, const uint32_t timeout) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + global __cudlaSetTaskTimeoutInMs + _check_or_init_cudla() + if __cudlaSetTaskTimeoutInMs == NULL: + with gil: + raise FunctionNotFoundError("function cudlaSetTaskTimeoutInMs is not found") + return (__cudlaSetTaskTimeoutInMs)( + devHandle, timeout) diff --git a/cuda_bindings/cuda/bindings/cudla.pxd b/cuda_bindings/cuda/bindings/cudla.pxd new file mode 100644 index 00000000000..09b55c579b6 --- /dev/null +++ b/cuda_bindings/cuda/bindings/cudla.pxd @@ -0,0 +1,49 @@ +# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. 
+ +from libc.stdint cimport intptr_t + +from .cycudla cimport * + + + + +############################################################################### +# Types +############################################################################### + +ctypedef cudlaDevHandle DevHandle +ctypedef cudlaModule Module + + +############################################################################### +# Enum +############################################################################### + +ctypedef cudlaStatus _Status +ctypedef cudlaMode _Mode +ctypedef cudlaModuleAttributeType _ModuleAttributeType +ctypedef cudlaFenceType _FenceType +ctypedef cudlaModuleLoadFlags _ModuleLoadFlags +ctypedef cudlaSubmissionFlags _SubmissionFlags +ctypedef cudlaAccessPermissionFlags _AccessPermissionFlags +ctypedef cudlaDevAttributeType _DevAttributeType + + +############################################################################### +# Functions +############################################################################### + +cpdef uint64_t get_version() except? -1 +cpdef uint64_t device_get_count() except? -1 +cpdef intptr_t create_device(uint64_t device, uint32_t flags) except * +cpdef intptr_t mem_register(intptr_t dev_handle, intptr_t ptr, size_t size, uint32_t flags) except * +cpdef intptr_t module_load_from_memory(intptr_t dev_handle, p_module, size_t module_size, uint32_t flags) except * +cpdef module_unload(intptr_t h_module, uint32_t flags) +cpdef submit_task(intptr_t dev_handle, intptr_t ptr_to_tasks, uint32_t num_tasks, intptr_t stream, uint32_t flags) +cpdef object device_get_attribute(intptr_t dev_handle, int attrib) except * +cpdef mem_unregister(intptr_t dev_handle, intptr_t dev_ptr) +cpdef int get_last_error(intptr_t dev_handle) except? 
0 +cpdef destroy_device(intptr_t dev_handle) +cpdef set_task_timeout_in_ms(intptr_t dev_handle, uint32_t timeout) + +cpdef module_get_attributes(intptr_t h_module, int attr_type) except * diff --git a/cuda_bindings/cuda/bindings/cudla.pyx b/cuda_bindings/cuda/bindings/cudla.pyx new file mode 100644 index 00000000000..c11b2494e1b --- /dev/null +++ b/cuda_bindings/cuda/bindings/cudla.pyx @@ -0,0 +1,1826 @@ +# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. + +cimport cython # NOQA +from libc.stdint cimport intptr_t, uintptr_t + +from ._internal.utils cimport get_buffer_pointer + +from enum import IntEnum as _IntEnum + +from libc.stdlib cimport calloc, free, malloc +from cython cimport view +cimport cpython.buffer +cimport cpython.memoryview +cimport cpython +from libc.string cimport memcmp, memcpy +import numpy as _numpy + + +cdef __from_data(data, dtype_name, expected_dtype, lowpp_type): + # _numpy.recarray is a subclass of _numpy.ndarray, so implicitly handled here. 
+ if isinstance(data, lowpp_type): + return data + if not isinstance(data, _numpy.ndarray): + raise TypeError("data argument must be a NumPy ndarray") + if data.size != 1: + raise ValueError("data array must have a size of 1") + if data.dtype != expected_dtype: + raise ValueError(f"data array must be of dtype {dtype_name}") + return lowpp_type.from_ptr(data.ctypes.data, not data.flags.writeable, data) + + +cdef __from_buffer(buffer, size, lowpp_type): + cdef Py_buffer view + if cpython.PyObject_GetBuffer(buffer, &view, cpython.PyBUF_SIMPLE) != 0: + raise TypeError("buffer argument does not support the buffer protocol") + try: + if view.itemsize != 1: + raise ValueError("buffer itemsize must be 1 byte") + if view.len != size: + raise ValueError(f"buffer length must be {size} bytes") + return lowpp_type.from_ptr(view.buf, not view.readonly, buffer) + finally: + cpython.PyBuffer_Release(&view) + + +cdef __getbuffer(object self, cpython.Py_buffer *buffer, void *ptr, int size, bint readonly): + buffer.buf = ptr + buffer.format = 'b' + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = size + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = readonly + buffer.shape = &buffer.len + buffer.strides = &buffer.itemsize + buffer.suboffsets = NULL + + + + +############################################################################### +# POD +############################################################################### + +cdef _get_external_memory_handle_desc_dtype_offsets(): + cdef cudlaExternalMemoryHandleDesc_t pod = cudlaExternalMemoryHandleDesc_t() + return _numpy.dtype({ + 'names': ['ext_buf_object', 'size_'], + 'formats': [_numpy.intp, _numpy.uint64], + 'offsets': [ + (&(pod.extBufObject)) - (&pod), + (&(pod.size)) - (&pod), + ], + 'itemsize': sizeof(cudlaExternalMemoryHandleDesc_t), + }) + +external_memory_handle_desc_dtype = _get_external_memory_handle_desc_dtype_offsets() + +cdef class ExternalMemoryHandleDesc: + """Empty-initialize an instance of 
`cudlaExternalMemoryHandleDesc_t`. + + + .. seealso:: `cudlaExternalMemoryHandleDesc_t` + """ + cdef: + cudlaExternalMemoryHandleDesc_t *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(cudlaExternalMemoryHandleDesc_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating ExternalMemoryHandleDesc") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef cudlaExternalMemoryHandleDesc_t *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.ExternalMemoryHandleDesc object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef ExternalMemoryHandleDesc other_ + if not isinstance(other, ExternalMemoryHandleDesc): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(cudlaExternalMemoryHandleDesc_t)) == 0) + + def __getbuffer__(self, Py_buffer *buffer, int flags): + __getbuffer(self, buffer, self._ptr, sizeof(cudlaExternalMemoryHandleDesc_t), self._readonly) + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(cudlaExternalMemoryHandleDesc_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating ExternalMemoryHandleDesc") + memcpy(self._ptr, val.ctypes.data, sizeof(cudlaExternalMemoryHandleDesc_t)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def ext_buf_object(self): + """int: """ + return (self._ptr[0].extBufObject) + + @ext_buf_object.setter + def ext_buf_object(self, val): + if self._readonly: + raise 
ValueError("This ExternalMemoryHandleDesc instance is read-only") + self._ptr[0].extBufObject = val + + @property + def size_(self): + """int: """ + return self._ptr[0].size + + @size_.setter + def size_(self, val): + if self._readonly: + raise ValueError("This ExternalMemoryHandleDesc instance is read-only") + self._ptr[0].size = val + + @staticmethod + def from_buffer(buffer): + """Create an ExternalMemoryHandleDesc instance with the memory from the given buffer.""" + return __from_buffer(buffer, sizeof(cudlaExternalMemoryHandleDesc_t), ExternalMemoryHandleDesc) + + @staticmethod + def from_data(data): + """Create an ExternalMemoryHandleDesc instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `external_memory_handle_desc_dtype` holding the data. + """ + return __from_data(data, "external_memory_handle_desc_dtype", external_memory_handle_desc_dtype, ExternalMemoryHandleDesc) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an ExternalMemoryHandleDesc instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef ExternalMemoryHandleDesc obj = ExternalMemoryHandleDesc.__new__(ExternalMemoryHandleDesc) + if owner is None: + obj._ptr = malloc(sizeof(cudlaExternalMemoryHandleDesc_t)) + if obj._ptr == NULL: + raise MemoryError("Error allocating ExternalMemoryHandleDesc") + memcpy((obj._ptr), ptr, sizeof(cudlaExternalMemoryHandleDesc_t)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + return obj + + +cdef _get_external_semaphore_handle_desc_dtype_offsets(): + cdef cudlaExternalSemaphoreHandleDesc_t pod = cudlaExternalSemaphoreHandleDesc_t() + return _numpy.dtype({ + 'names': ['ext_sync_object'], + 'formats': [_numpy.intp], + 'offsets': [ + (&(pod.extSyncObject)) - (&pod), + ], + 'itemsize': sizeof(cudlaExternalSemaphoreHandleDesc_t), + }) + +external_semaphore_handle_desc_dtype = _get_external_semaphore_handle_desc_dtype_offsets() + +cdef class ExternalSemaphoreHandleDesc: + """Empty-initialize an instance of `cudlaExternalSemaphoreHandleDesc_t`. + + + .. 
seealso:: `cudlaExternalSemaphoreHandleDesc_t` + """ + cdef: + cudlaExternalSemaphoreHandleDesc_t *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(cudlaExternalSemaphoreHandleDesc_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating ExternalSemaphoreHandleDesc") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef cudlaExternalSemaphoreHandleDesc_t *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.ExternalSemaphoreHandleDesc object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef ExternalSemaphoreHandleDesc other_ + if not isinstance(other, ExternalSemaphoreHandleDesc): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(cudlaExternalSemaphoreHandleDesc_t)) == 0) + + def __getbuffer__(self, Py_buffer *buffer, int flags): + __getbuffer(self, buffer, self._ptr, sizeof(cudlaExternalSemaphoreHandleDesc_t), self._readonly) + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(cudlaExternalSemaphoreHandleDesc_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating ExternalSemaphoreHandleDesc") + memcpy(self._ptr, val.ctypes.data, sizeof(cudlaExternalSemaphoreHandleDesc_t)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def ext_sync_object(self): + """int: """ + return (self._ptr[0].extSyncObject) + + @ext_sync_object.setter + def ext_sync_object(self, val): + if self._readonly: + raise 
ValueError("This ExternalSemaphoreHandleDesc instance is read-only") + self._ptr[0].extSyncObject = val + + @staticmethod + def from_buffer(buffer): + """Create an ExternalSemaphoreHandleDesc instance with the memory from the given buffer.""" + return __from_buffer(buffer, sizeof(cudlaExternalSemaphoreHandleDesc_t), ExternalSemaphoreHandleDesc) + + @staticmethod + def from_data(data): + """Create an ExternalSemaphoreHandleDesc instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `external_semaphore_handle_desc_dtype` holding the data. + """ + return __from_data(data, "external_semaphore_handle_desc_dtype", external_semaphore_handle_desc_dtype, ExternalSemaphoreHandleDesc) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an ExternalSemaphoreHandleDesc instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef ExternalSemaphoreHandleDesc obj = ExternalSemaphoreHandleDesc.__new__(ExternalSemaphoreHandleDesc) + if owner is None: + obj._ptr = malloc(sizeof(cudlaExternalSemaphoreHandleDesc_t)) + if obj._ptr == NULL: + raise MemoryError("Error allocating ExternalSemaphoreHandleDesc") + memcpy((obj._ptr), ptr, sizeof(cudlaExternalSemaphoreHandleDesc_t)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + return obj + + +cdef _get_module_tensor_descriptor_dtype_offsets(): + cdef cudlaModuleTensorDescriptor pod = cudlaModuleTensorDescriptor() + return _numpy.dtype({ + 'names': ['name', 'size_', 'n', 'c', 'h', 'w', 'data_format', 'data_type', 'data_category', 'pixel_format', 'pixel_mapping', 'stride'], + 'formats': [(_numpy.int8, 81), _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, (_numpy.uint32, 8)], + 'offsets': [ + (&(pod.name)) - (&pod), + (&(pod.size)) - (&pod), + (&(pod.n)) - (&pod), + (&(pod.c)) - (&pod), + (&(pod.h)) - (&pod), + (&(pod.w)) - (&pod), + (&(pod.dataFormat)) - (&pod), + (&(pod.dataType)) - (&pod), + (&(pod.dataCategory)) - (&pod), + (&(pod.pixelFormat)) - (&pod), + (&(pod.pixelMapping)) - (&pod), + (&(pod.stride)) - (&pod), + ], + 'itemsize': sizeof(cudlaModuleTensorDescriptor), + }) + +module_tensor_descriptor_dtype = _get_module_tensor_descriptor_dtype_offsets() + +cdef class ModuleTensorDescriptor: + """Empty-initialize an instance of `cudlaModuleTensorDescriptor`. + + + .. 
seealso:: `cudlaModuleTensorDescriptor` + """ + cdef: + cudlaModuleTensorDescriptor *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(cudlaModuleTensorDescriptor)) + if self._ptr == NULL: + raise MemoryError("Error allocating ModuleTensorDescriptor") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef cudlaModuleTensorDescriptor *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.ModuleTensorDescriptor object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef ModuleTensorDescriptor other_ + if not isinstance(other, ModuleTensorDescriptor): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(cudlaModuleTensorDescriptor)) == 0) + + def __getbuffer__(self, Py_buffer *buffer, int flags): + __getbuffer(self, buffer, self._ptr, sizeof(cudlaModuleTensorDescriptor), self._readonly) + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(cudlaModuleTensorDescriptor)) + if self._ptr == NULL: + raise MemoryError("Error allocating ModuleTensorDescriptor") + memcpy(self._ptr, val.ctypes.data, sizeof(cudlaModuleTensorDescriptor)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def name(self): + """~_numpy.int8: (array of length 81).""" + return cpython.PyUnicode_FromString(self._ptr[0].name) + + @name.setter + def name(self, val): + if self._readonly: + raise ValueError("This ModuleTensorDescriptor instance is read-only") + cdef 
bytes buf = val.encode()
+        if len(buf) >= 81:
+            raise ValueError("String too long for field name, max length is 80")
+        cdef char *ptr = buf
+        # Copy only the encoded bytes plus the NUL terminator. Copying a fixed
+        # 81 bytes would read past the end of `buf` (undefined behavior) for
+        # any string shorter than 80 characters; the length check above
+        # guarantees len(buf) + 1 <= 81, so the destination cannot overflow.
+        memcpy(<void*>(self._ptr[0].name), ptr, len(buf) + 1)
+
+    @property
+    def size_(self):
+        """int: """
+        return self._ptr[0].size
+
+    @size_.setter
+    def size_(self, val):
+        if self._readonly:
+            raise ValueError("This ModuleTensorDescriptor instance is read-only")
+        self._ptr[0].size = val
+
+    @property
+    def n(self):
+        """int: """
+        return self._ptr[0].n
+
+    @n.setter
+    def n(self, val):
+        if self._readonly:
+            raise ValueError("This ModuleTensorDescriptor instance is read-only")
+        self._ptr[0].n = val
+
+    @property
+    def c(self):
+        """int: """
+        return self._ptr[0].c
+
+    @c.setter
+    def c(self, val):
+        if self._readonly:
+            raise ValueError("This ModuleTensorDescriptor instance is read-only")
+        self._ptr[0].c = val
+
+    @property
+    def h(self):
+        """int: """
+        return self._ptr[0].h
+
+    @h.setter
+    def h(self, val):
+        if self._readonly:
+            raise ValueError("This ModuleTensorDescriptor instance is read-only")
+        self._ptr[0].h = val
+
+    @property
+    def w(self):
+        """int: """
+        return self._ptr[0].w
+
+    @w.setter
+    def w(self, val):
+        if self._readonly:
+            raise ValueError("This ModuleTensorDescriptor instance is read-only")
+        self._ptr[0].w = val
+
+    @property
+    def data_format(self):
+        """int: """
+        return self._ptr[0].dataFormat
+
+    @data_format.setter
+    def data_format(self, val):
+        if self._readonly:
+            raise ValueError("This ModuleTensorDescriptor instance is read-only")
+        self._ptr[0].dataFormat = val
+
+    @property
+    def data_type(self):
+        """int: """
+        return self._ptr[0].dataType
+
+    @data_type.setter
+    def data_type(self, val):
+        if self._readonly:
+            raise ValueError("This ModuleTensorDescriptor instance is read-only")
+        self._ptr[0].dataType = val
+
+    @property
+    def data_category(self):
+        """int: """
+        return self._ptr[0].dataCategory
+
+    @data_category.setter
+    def data_category(self, val):
+        if self._readonly:
+            raise ValueError("This 
ModuleTensorDescriptor instance is read-only") + self._ptr[0].dataCategory = val + + @property + def pixel_format(self): + """int: """ + return self._ptr[0].pixelFormat + + @pixel_format.setter + def pixel_format(self, val): + if self._readonly: + raise ValueError("This ModuleTensorDescriptor instance is read-only") + self._ptr[0].pixelFormat = val + + @property + def pixel_mapping(self): + """int: """ + return self._ptr[0].pixelMapping + + @pixel_mapping.setter + def pixel_mapping(self, val): + if self._readonly: + raise ValueError("This ModuleTensorDescriptor instance is read-only") + self._ptr[0].pixelMapping = val + + @property + def stride(self): + """~_numpy.uint32: (array of length 8).""" + cdef view.array arr = view.array(shape=(8,), itemsize=sizeof(uint32_t), format="I", mode="c", allocate_buffer=False) + arr.data = (&(self._ptr[0].stride)) + return _numpy.asarray(arr) + + @stride.setter + def stride(self, val): + if self._readonly: + raise ValueError("This ModuleTensorDescriptor instance is read-only") + if len(val) != 8: + raise ValueError(f"Expected length { 8 } for field stride, got {len(val)}") + cdef view.array arr = view.array(shape=(8,), itemsize=sizeof(uint32_t), format="I", mode="c") + arr[:] = _numpy.asarray(val, dtype=_numpy.uint32) + memcpy((&(self._ptr[0].stride)), (arr.data), sizeof(uint32_t) * len(val)) + + @staticmethod + def from_buffer(buffer): + """Create an ModuleTensorDescriptor instance with the memory from the given buffer.""" + return __from_buffer(buffer, sizeof(cudlaModuleTensorDescriptor), ModuleTensorDescriptor) + + @staticmethod + def from_data(data): + """Create an ModuleTensorDescriptor instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `module_tensor_descriptor_dtype` holding the data. 
+ """ + return __from_data(data, "module_tensor_descriptor_dtype", module_tensor_descriptor_dtype, ModuleTensorDescriptor) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an ModuleTensorDescriptor instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. + """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef ModuleTensorDescriptor obj = ModuleTensorDescriptor.__new__(ModuleTensorDescriptor) + if owner is None: + obj._ptr = malloc(sizeof(cudlaModuleTensorDescriptor)) + if obj._ptr == NULL: + raise MemoryError("Error allocating ModuleTensorDescriptor") + memcpy((obj._ptr), ptr, sizeof(cudlaModuleTensorDescriptor)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + return obj + + +cdef _get_fence_dtype_offsets(): + cdef CudlaFence pod = CudlaFence() + return _numpy.dtype({ + 'names': ['fence', 'type'], + 'formats': [_numpy.intp, _numpy.int32], + 'offsets': [ + (&(pod.fence)) - (&pod), + (&(pod.type)) - (&pod), + ], + 'itemsize': sizeof(CudlaFence), + }) + +fence_dtype = _get_fence_dtype_offsets() + +cdef class Fence: + """Empty-initialize an instance of `CudlaFence`. + + + .. 
seealso:: `CudlaFence` + """ + cdef: + CudlaFence *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(CudlaFence)) + if self._ptr == NULL: + raise MemoryError("Error allocating Fence") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef CudlaFence *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.Fence object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef Fence other_ + if not isinstance(other, Fence): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(CudlaFence)) == 0) + + def __getbuffer__(self, Py_buffer *buffer, int flags): + __getbuffer(self, buffer, self._ptr, sizeof(CudlaFence), self._readonly) + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(CudlaFence)) + if self._ptr == NULL: + raise MemoryError("Error allocating Fence") + memcpy(self._ptr, val.ctypes.data, sizeof(CudlaFence)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def fence(self): + """int: """ + return (self._ptr[0].fence) + + @fence.setter + def fence(self, val): + if self._readonly: + raise ValueError("This Fence instance is read-only") + self._ptr[0].fence = val + + @property + def type(self): + """int: """ + return (self._ptr[0].type) + + @type.setter + def type(self, val): + if self._readonly: + raise ValueError("This Fence instance is read-only") + self._ptr[0].type = val + + @staticmethod + def from_buffer(buffer): + """Create 
an Fence instance with the memory from the given buffer.""" + return __from_buffer(buffer, sizeof(CudlaFence), Fence) + + @staticmethod + def from_data(data): + """Create an Fence instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `fence_dtype` holding the data. + """ + return __from_data(data, "fence_dtype", fence_dtype, Fence) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an Fence instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. + """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef Fence obj = Fence.__new__(Fence) + if owner is None: + obj._ptr = malloc(sizeof(CudlaFence)) + if obj._ptr == NULL: + raise MemoryError("Error allocating Fence") + memcpy((obj._ptr), ptr, sizeof(CudlaFence)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + return obj + + +dev_attribute_dtype = _numpy.dtype(( + _numpy.dtype((_numpy.void, sizeof(cudlaDevAttribute))), + { + "unified_addressing_supported": (_numpy.uint8, 0), + "device_version": (_numpy.uint32, 0), + } + )) + + +cdef class DevAttribute: + """Empty-initialize an instance of `cudlaDevAttribute`. + + + .. 
seealso:: `cudlaDevAttribute` + """ + cdef: + cudlaDevAttribute *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(cudlaDevAttribute)) + if self._ptr == NULL: + raise MemoryError("Error allocating DevAttribute") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef cudlaDevAttribute *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.DevAttribute object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef DevAttribute other_ + if not isinstance(other, DevAttribute): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(cudlaDevAttribute)) == 0) + + def __getbuffer__(self, Py_buffer *buffer, int flags): + __getbuffer(self, buffer, self._ptr, sizeof(cudlaDevAttribute), self._readonly) + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(cudlaDevAttribute)) + if self._ptr == NULL: + raise MemoryError("Error allocating DevAttribute") + memcpy(self._ptr, val.ctypes.data, sizeof(cudlaDevAttribute)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def unified_addressing_supported(self): + """int: """ + return self._ptr[0].unifiedAddressingSupported + + @unified_addressing_supported.setter + def unified_addressing_supported(self, val): + if self._readonly: + raise ValueError("This DevAttribute instance is read-only") + self._ptr[0].unifiedAddressingSupported = val + + @property + def device_version(self): + """int: """ + return 
self._ptr[0].deviceVersion + + @device_version.setter + def device_version(self, val): + if self._readonly: + raise ValueError("This DevAttribute instance is read-only") + self._ptr[0].deviceVersion = val + + @staticmethod + def from_buffer(buffer): + """Create an DevAttribute instance with the memory from the given buffer.""" + return __from_buffer(buffer, sizeof(cudlaDevAttribute), DevAttribute) + + @staticmethod + def from_data(data): + """Create an DevAttribute instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `dev_attribute_dtype` holding the data. + """ + return __from_data(data, "dev_attribute_dtype", dev_attribute_dtype, DevAttribute) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an DevAttribute instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. + """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef DevAttribute obj = DevAttribute.__new__(DevAttribute) + if owner is None: + obj._ptr = malloc(sizeof(cudlaDevAttribute)) + if obj._ptr == NULL: + raise MemoryError("Error allocating DevAttribute") + memcpy((obj._ptr), ptr, sizeof(cudlaDevAttribute)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + return obj + + +module_attribute_dtype = _numpy.dtype(( + _numpy.dtype((_numpy.void, sizeof(cudlaModuleAttribute))), + { + "num_input_tensors": (_numpy.uint32, 0), + "num_output_tensors": (_numpy.uint32, 0), + "input_tensor_desc": (_numpy.intp, 0), + "output_tensor_desc": (_numpy.intp, 0), + } + )) + + +cdef class ModuleAttribute: + """Empty-initialize an instance of `cudlaModuleAttribute`. + + + .. 
seealso:: `cudlaModuleAttribute` + """ + cdef: + cudlaModuleAttribute *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(cudlaModuleAttribute)) + if self._ptr == NULL: + raise MemoryError("Error allocating ModuleAttribute") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef cudlaModuleAttribute *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.ModuleAttribute object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef ModuleAttribute other_ + if not isinstance(other, ModuleAttribute): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(cudlaModuleAttribute)) == 0) + + def __getbuffer__(self, Py_buffer *buffer, int flags): + __getbuffer(self, buffer, self._ptr, sizeof(cudlaModuleAttribute), self._readonly) + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(cudlaModuleAttribute)) + if self._ptr == NULL: + raise MemoryError("Error allocating ModuleAttribute") + memcpy(self._ptr, val.ctypes.data, sizeof(cudlaModuleAttribute)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def num_input_tensors(self): + """int: """ + return self._ptr[0].numInputTensors + + @num_input_tensors.setter + def num_input_tensors(self, val): + if self._readonly: + raise ValueError("This ModuleAttribute instance is read-only") + self._ptr[0].numInputTensors = val + + @property + def num_output_tensors(self): + """int: """ + return 
self._ptr[0].numOutputTensors + + @num_output_tensors.setter + def num_output_tensors(self, val): + if self._readonly: + raise ValueError("This ModuleAttribute instance is read-only") + self._ptr[0].numOutputTensors = val + + @property + def input_tensor_desc(self): + """int: """ + return (self._ptr[0].inputTensorDesc) + + @input_tensor_desc.setter + def input_tensor_desc(self, val): + if self._readonly: + raise ValueError("This ModuleAttribute instance is read-only") + self._ptr[0].inputTensorDesc = val + + @property + def output_tensor_desc(self): + """int: """ + return (self._ptr[0].outputTensorDesc) + + @output_tensor_desc.setter + def output_tensor_desc(self, val): + if self._readonly: + raise ValueError("This ModuleAttribute instance is read-only") + self._ptr[0].outputTensorDesc = val + + @staticmethod + def from_buffer(buffer): + """Create an ModuleAttribute instance with the memory from the given buffer.""" + return __from_buffer(buffer, sizeof(cudlaModuleAttribute), ModuleAttribute) + + @staticmethod + def from_data(data): + """Create an ModuleAttribute instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `module_attribute_dtype` holding the data. + """ + return __from_data(data, "module_attribute_dtype", module_attribute_dtype, ModuleAttribute) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an ModuleAttribute instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef ModuleAttribute obj = ModuleAttribute.__new__(ModuleAttribute) + if owner is None: + obj._ptr = malloc(sizeof(cudlaModuleAttribute)) + if obj._ptr == NULL: + raise MemoryError("Error allocating ModuleAttribute") + memcpy((obj._ptr), ptr, sizeof(cudlaModuleAttribute)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + return obj + + +cdef _get_wait_events_dtype_offsets(): + cdef cudlaWaitEvents pod = cudlaWaitEvents() + return _numpy.dtype({ + 'names': ['pre_fences', 'num_events'], + 'formats': [_numpy.intp, _numpy.uint32], + 'offsets': [ + (&(pod.preFences)) - (&pod), + (&(pod.numEvents)) - (&pod), + ], + 'itemsize': sizeof(cudlaWaitEvents), + }) + +wait_events_dtype = _get_wait_events_dtype_offsets() + +cdef class WaitEvents: + """Empty-initialize an instance of `cudlaWaitEvents`. + + + .. seealso:: `cudlaWaitEvents` + """ + cdef: + cudlaWaitEvents *_ptr + object _owner + bint _owned + bint _readonly + dict _refs + + def __init__(self): + self._ptr = calloc(1, sizeof(cudlaWaitEvents)) + if self._ptr == NULL: + raise MemoryError("Error allocating WaitEvents") + self._owner = None + self._owned = True + self._readonly = False + self._refs = {} + + def __dealloc__(self): + cdef cudlaWaitEvents *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.WaitEvents object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef WaitEvents other_ + if not isinstance(other, WaitEvents): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(cudlaWaitEvents)) == 0) + + def __getbuffer__(self, 
Py_buffer *buffer, int flags): + __getbuffer(self, buffer, self._ptr, sizeof(cudlaWaitEvents), self._readonly) + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(cudlaWaitEvents)) + if self._ptr == NULL: + raise MemoryError("Error allocating WaitEvents") + memcpy(self._ptr, val.ctypes.data, sizeof(cudlaWaitEvents)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def pre_fences(self): + """int: """ + if self._ptr[0].preFences == NULL or self._ptr[0].numEvents == 0: + return [] + return Fence.from_ptr((self._ptr[0].preFences), self._ptr[0].numEvents) + + @pre_fences.setter + def pre_fences(self, val): + if self._readonly: + raise ValueError("This WaitEvents instance is read-only") + cdef Fence arr = val + self._ptr[0].preFences = (arr._get_ptr()) + self._ptr[0].numEvents = len(arr) + self._refs["pre_fences"] = arr + + @staticmethod + def from_buffer(buffer): + """Create an WaitEvents instance with the memory from the given buffer.""" + return __from_buffer(buffer, sizeof(cudlaWaitEvents), WaitEvents) + + @staticmethod + def from_data(data): + """Create an WaitEvents instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `wait_events_dtype` holding the data. + """ + return __from_data(data, "wait_events_dtype", wait_events_dtype, WaitEvents) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an WaitEvents instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef WaitEvents obj = WaitEvents.__new__(WaitEvents) + if owner is None: + obj._ptr = malloc(sizeof(cudlaWaitEvents)) + if obj._ptr == NULL: + raise MemoryError("Error allocating WaitEvents") + memcpy((obj._ptr), ptr, sizeof(cudlaWaitEvents)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + obj._refs = {} + return obj + + +cdef _get_signal_events_dtype_offsets(): + cdef cudlaSignalEvents pod = cudlaSignalEvents() + return _numpy.dtype({ + 'names': ['dev_ptrs', 'eof_fences', 'num_events'], + 'formats': [_numpy.intp, _numpy.intp, _numpy.uint32], + 'offsets': [ + (&(pod.devPtrs)) - (&pod), + (&(pod.eofFences)) - (&pod), + (&(pod.numEvents)) - (&pod), + ], + 'itemsize': sizeof(cudlaSignalEvents), + }) + +signal_events_dtype = _get_signal_events_dtype_offsets() + +cdef class SignalEvents: + """Empty-initialize an instance of `cudlaSignalEvents`. + + + .. 
seealso:: `cudlaSignalEvents` + """ + cdef: + cudlaSignalEvents *_ptr + object _owner + bint _owned + bint _readonly + dict _refs + + def __init__(self): + self._ptr = calloc(1, sizeof(cudlaSignalEvents)) + if self._ptr == NULL: + raise MemoryError("Error allocating SignalEvents") + self._owner = None + self._owned = True + self._readonly = False + self._refs = {} + + def __dealloc__(self): + cdef cudlaSignalEvents *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.SignalEvents object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef SignalEvents other_ + if not isinstance(other, SignalEvents): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(cudlaSignalEvents)) == 0) + + def __getbuffer__(self, Py_buffer *buffer, int flags): + __getbuffer(self, buffer, self._ptr, sizeof(cudlaSignalEvents), self._readonly) + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(cudlaSignalEvents)) + if self._ptr == NULL: + raise MemoryError("Error allocating SignalEvents") + memcpy(self._ptr, val.ctypes.data, sizeof(cudlaSignalEvents)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def dev_ptrs(self): + """int: """ + if self._ptr[0].devPtrs == NULL or self._ptr[0].numEvents == 0: + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="q", mode="c")[:0] + cdef view.array arr = view.array(shape=(self._ptr[0].numEvents,), itemsize=sizeof(intptr_t), format="q", mode="c", allocate_buffer=False) + arr.data = 
(self._ptr[0].devPtrs) + return arr + + @dev_ptrs.setter + def dev_ptrs(self, val): + if self._readonly: + raise ValueError("This SignalEvents instance is read-only") + cdef Py_ssize_t _n = len(val) + self._ptr[0].numEvents = _n + if _n == 0: + return + cdef view.array arr = view.array(shape=(_n,), itemsize=sizeof(intptr_t), format="q", mode="c") + cdef intptr_t[:] mv = arr + cdef Py_ssize_t i + for i in range(_n): + mv[i] = val[i] + self._ptr[0].devPtrs = (arr.data) + self._refs["dev_ptrs"] = arr + + @property + def eof_fences(self): + """int: """ + if self._ptr[0].eofFences == NULL or self._ptr[0].numEvents == 0: + return [] + return Fence.from_ptr((self._ptr[0].eofFences), self._ptr[0].numEvents) + + @eof_fences.setter + def eof_fences(self, val): + if self._readonly: + raise ValueError("This SignalEvents instance is read-only") + cdef Fence arr = val + self._ptr[0].eofFences = (arr._get_ptr()) + self._ptr[0].numEvents = len(arr) + self._refs["eof_fences"] = arr + + @staticmethod + def from_buffer(buffer): + """Create an SignalEvents instance with the memory from the given buffer.""" + return __from_buffer(buffer, sizeof(cudlaSignalEvents), SignalEvents) + + @staticmethod + def from_data(data): + """Create an SignalEvents instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `signal_events_dtype` holding the data. + """ + return __from_data(data, "signal_events_dtype", signal_events_dtype, SignalEvents) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an SignalEvents instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef SignalEvents obj = SignalEvents.__new__(SignalEvents) + if owner is None: + obj._ptr = malloc(sizeof(cudlaSignalEvents)) + if obj._ptr == NULL: + raise MemoryError("Error allocating SignalEvents") + memcpy((obj._ptr), ptr, sizeof(cudlaSignalEvents)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + obj._refs = {} + return obj + + +cdef _get_task_dtype_offsets(): + cdef cudlaTask pod = cudlaTask() + return _numpy.dtype({ + 'names': ['module_handle', 'output_tensor', 'num_output_tensors', 'num_input_tensors', 'input_tensor', 'wait_events', 'signal_events'], + 'formats': [_numpy.intp, _numpy.intp, _numpy.uint32, _numpy.uint32, _numpy.intp, _numpy.intp, _numpy.intp], + 'offsets': [ + (&(pod.moduleHandle)) - (&pod), + (&(pod.outputTensor)) - (&pod), + (&(pod.numOutputTensors)) - (&pod), + (&(pod.numInputTensors)) - (&pod), + (&(pod.inputTensor)) - (&pod), + (&(pod.waitEvents)) - (&pod), + (&(pod.signalEvents)) - (&pod), + ], + 'itemsize': sizeof(cudlaTask), + }) + +task_dtype = _get_task_dtype_offsets() + +cdef class Task: + """Empty-initialize an instance of `cudlaTask`. + + + .. 
seealso:: `cudlaTask` + """ + cdef: + cudlaTask *_ptr + object _owner + bint _owned + bint _readonly + dict _refs + + def __init__(self): + self._ptr = calloc(1, sizeof(cudlaTask)) + if self._ptr == NULL: + raise MemoryError("Error allocating Task") + self._owner = None + self._owned = True + self._readonly = False + self._refs = {} + + def __dealloc__(self): + cdef cudlaTask *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.Task object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef Task other_ + if not isinstance(other, Task): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(cudlaTask)) == 0) + + def __getbuffer__(self, Py_buffer *buffer, int flags): + __getbuffer(self, buffer, self._ptr, sizeof(cudlaTask), self._readonly) + + def __releasebuffer__(self, Py_buffer *buffer): + pass + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(cudlaTask)) + if self._ptr == NULL: + raise MemoryError("Error allocating Task") + memcpy(self._ptr, val.ctypes.data, sizeof(cudlaTask)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def module_handle(self): + """int: """ + return (self._ptr[0].moduleHandle) + + @module_handle.setter + def module_handle(self, val): + if self._readonly: + raise ValueError("This Task instance is read-only") + self._ptr[0].moduleHandle = val + + @property + def output_tensor(self): + """int: """ + if self._ptr[0].outputTensor == NULL or self._ptr[0].numOutputTensors == 0: + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="q", mode="c")[:0] + 
cdef view.array arr = view.array(shape=(self._ptr[0].numOutputTensors,), itemsize=sizeof(intptr_t), format="q", mode="c", allocate_buffer=False) + arr.data = (self._ptr[0].outputTensor) + return arr + + @output_tensor.setter + def output_tensor(self, val): + if self._readonly: + raise ValueError("This Task instance is read-only") + cdef Py_ssize_t _n = len(val) + self._ptr[0].numOutputTensors = _n + if _n == 0: + return + cdef view.array arr = view.array(shape=(_n,), itemsize=sizeof(intptr_t), format="q", mode="c") + cdef intptr_t[:] mv = arr + cdef Py_ssize_t i + for i in range(_n): + mv[i] = val[i] + self._ptr[0].outputTensor = (arr.data) + self._refs["output_tensor"] = arr + + @property + def input_tensor(self): + """int: """ + if self._ptr[0].inputTensor == NULL or self._ptr[0].numInputTensors == 0: + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="q", mode="c")[:0] + cdef view.array arr = view.array(shape=(self._ptr[0].numInputTensors,), itemsize=sizeof(intptr_t), format="q", mode="c", allocate_buffer=False) + arr.data = (self._ptr[0].inputTensor) + return arr + + @input_tensor.setter + def input_tensor(self, val): + if self._readonly: + raise ValueError("This Task instance is read-only") + cdef Py_ssize_t _n = len(val) + self._ptr[0].numInputTensors = _n + if _n == 0: + return + cdef view.array arr = view.array(shape=(_n,), itemsize=sizeof(intptr_t), format="q", mode="c") + cdef intptr_t[:] mv = arr + cdef Py_ssize_t i + for i in range(_n): + mv[i] = val[i] + self._ptr[0].inputTensor = (arr.data) + self._refs["input_tensor"] = arr + + @property + def wait_events(self): + """int: """ + return (self._ptr[0].waitEvents) + + @wait_events.setter + def wait_events(self, val): + if self._readonly: + raise ValueError("This Task instance is read-only") + self._ptr[0].waitEvents = val + + @property + def signal_events(self): + """int: """ + return (self._ptr[0].signalEvents) + + @signal_events.setter + def signal_events(self, val): + if self._readonly: 
+ raise ValueError("This Task instance is read-only") + self._ptr[0].signalEvents = val + + @staticmethod + def from_buffer(buffer): + """Create an Task instance with the memory from the given buffer.""" + return __from_buffer(buffer, sizeof(cudlaTask), Task) + + @staticmethod + def from_data(data): + """Create an Task instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `task_dtype` holding the data. + """ + return __from_data(data, "task_dtype", task_dtype, Task) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an Task instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. + """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef Task obj = Task.__new__(Task) + if owner is None: + obj._ptr = malloc(sizeof(cudlaTask)) + if obj._ptr == NULL: + raise MemoryError("Error allocating Task") + memcpy((obj._ptr), ptr, sizeof(cudlaTask)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + obj._refs = {} + return obj + + + +############################################################################### +# Enum +############################################################################### + +class Status(_IntEnum): + """ + See `cudlaStatus`. 
+ """ + Success = cudlaSuccess + ErrorInvalidParam = cudlaErrorInvalidParam + ErrorOutOfResources = cudlaErrorOutOfResources + ErrorCreationFailed = cudlaErrorCreationFailed + ErrorInvalidAddress = cudlaErrorInvalidAddress + ErrorOs = cudlaErrorOs + ErrorCuda = cudlaErrorCuda + ErrorUmd = cudlaErrorUmd + ErrorInvalidDevice = cudlaErrorInvalidDevice + ErrorInvalidAttribute = cudlaErrorInvalidAttribute + ErrorIncompatibleDlaSWVersion = cudlaErrorIncompatibleDlaSWVersion + ErrorMemoryRegistered = cudlaErrorMemoryRegistered + ErrorInvalidModule = cudlaErrorInvalidModule + ErrorUnsupportedOperation = cudlaErrorUnsupportedOperation + ErrorNvSci = cudlaErrorNvSci + ErrorDlaErrInvalidInput = cudlaErrorDlaErrInvalidInput + ErrorDlaErrInvalidPreAction = cudlaErrorDlaErrInvalidPreAction + ErrorDlaErrNoMem = cudlaErrorDlaErrNoMem + ErrorDlaErrProcessorBusy = cudlaErrorDlaErrProcessorBusy + ErrorDlaErrTaskStatusMismatch = cudlaErrorDlaErrTaskStatusMismatch + ErrorDlaErrEngineTimeout = cudlaErrorDlaErrEngineTimeout + ErrorDlaErrDataMismatch = cudlaErrorDlaErrDataMismatch + ErrorUnknown = cudlaErrorUnknown + +class Mode(_IntEnum): + """ + See `cudlaMode`. + """ + CUDA_DLA = CUDLA_CUDA_DLA + STANDALONE = CUDLA_STANDALONE + +class ModuleAttributeType(_IntEnum): + """ + See `cudlaModuleAttributeType`. + """ + NUM_INPUT_TENSORS = CUDLA_NUM_INPUT_TENSORS + NUM_OUTPUT_TENSORS = CUDLA_NUM_OUTPUT_TENSORS + INPUT_TENSOR_DESCRIPTORS = CUDLA_INPUT_TENSOR_DESCRIPTORS + OUTPUT_TENSOR_DESCRIPTORS = CUDLA_OUTPUT_TENSOR_DESCRIPTORS + NUM_OUTPUT_TASK_STATISTICS = CUDLA_NUM_OUTPUT_TASK_STATISTICS + OUTPUT_TASK_STATISTICS_DESCRIPTORS = CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS + +class FenceType(_IntEnum): + """ + See `cudlaFenceType`. + """ + NVSCISYNC_FENCE = CUDLA_NVSCISYNC_FENCE + NVSCISYNC_FENCE_SOF = CUDLA_NVSCISYNC_FENCE_SOF + +class ModuleLoadFlags(_IntEnum): + """ + See `cudlaModuleLoadFlags`. 
+ """ + MODULE_DEFAULT = CUDLA_MODULE_DEFAULT + MODULE_ENABLE_FAULT_DIAGNOSTICS = CUDLA_MODULE_ENABLE_FAULT_DIAGNOSTICS + +class SubmissionFlags(_IntEnum): + """ + See `cudlaSubmissionFlags`. + """ + SUBMIT_NOOP = CUDLA_SUBMIT_NOOP + SUBMIT_SKIP_LOCK_ACQUIRE = CUDLA_SUBMIT_SKIP_LOCK_ACQUIRE + SUBMIT_DIAGNOSTICS_TASK = CUDLA_SUBMIT_DIAGNOSTICS_TASK + +class AccessPermissionFlags(_IntEnum): + """ + See `cudlaAccessPermissionFlags`. + """ + READ_WRITE_PERM = CUDLA_READ_WRITE_PERM + READ_ONLY_PERM = CUDLA_READ_ONLY_PERM + TASK_STATISTICS = CUDLA_TASK_STATISTICS + +class DevAttributeType(_IntEnum): + """ + See `cudlaDevAttributeType`. + """ + UNIFIED_ADDRESSING = CUDLA_UNIFIED_ADDRESSING + DEVICE_VERSION = CUDLA_DEVICE_VERSION + + +############################################################################### +# Error handling +############################################################################### + +class CudlaError(Exception): + + def __init__(self, status): + self.status = status + s = Status(status) + cdef str err = f"{s.name} ({s.value})" + super(CudlaError, self).__init__(err) + + def __reduce__(self): + return (type(self), (self.status,)) + + +@cython.profile(False) +cpdef inline check_status(int status): + if status != 0: + raise CudlaError(status) + + +############################################################################### +# Wrapper functions +############################################################################### + +cpdef uint64_t get_version() except? -1: + cdef uint64_t version + with nogil: + __status__ = cudlaGetVersion(&version) + check_status(__status__) + return version + + +cpdef uint64_t device_get_count() except? 
-1: + cdef uint64_t p_num_devices + with nogil: + __status__ = cudlaDeviceGetCount(&p_num_devices) + check_status(__status__) + return p_num_devices + + +cpdef intptr_t create_device(uint64_t device, uint32_t flags) except *: + cdef DevHandle dev_handle + if flags == CUDLA_STANDALONE: + raise CudlaError(cudlaErrorUnsupportedOperation) + with nogil: + __status__ = cudlaCreateDevice(device, &dev_handle, flags) + check_status(__status__) + return dev_handle + + +cpdef intptr_t mem_register(intptr_t dev_handle, intptr_t ptr, size_t size, uint32_t flags) except *: + cdef uint64_t* dev_ptr + with nogil: + __status__ = cudlaMemRegister(dev_handle, ptr, size, &dev_ptr, flags) + check_status(__status__) + return dev_ptr + + +cpdef intptr_t module_load_from_memory(intptr_t dev_handle, p_module, size_t module_size, uint32_t flags) except *: + cdef void* _p_module_ = get_buffer_pointer(p_module, module_size, readonly=True) + cdef Module h_module + with nogil: + __status__ = cudlaModuleLoadFromMemory(dev_handle, _p_module_, module_size, &h_module, flags) + check_status(__status__) + return h_module + + +cpdef module_unload(intptr_t h_module, uint32_t flags): + with nogil: + __status__ = cudlaModuleUnload(h_module, flags) + check_status(__status__) + + +cpdef submit_task(intptr_t dev_handle, intptr_t ptr_to_tasks, uint32_t num_tasks, intptr_t stream, uint32_t flags): + with nogil: + __status__ = cudlaSubmitTask(dev_handle, ptr_to_tasks, num_tasks, stream, flags) + check_status(__status__) + + +cpdef object device_get_attribute(intptr_t dev_handle, int attrib) except *: + cdef DevAttribute p_attribute_py = DevAttribute() + cdef cudlaDevAttribute *p_attribute = (p_attribute_py._get_ptr()) + with nogil: + __status__ = cudlaDeviceGetAttribute(dev_handle, attrib, p_attribute) + check_status(__status__) + return p_attribute_py + + +cpdef mem_unregister(intptr_t dev_handle, intptr_t dev_ptr): + with nogil: + __status__ = cudlaMemUnregister(dev_handle, dev_ptr) + 
check_status(__status__) + + +cpdef int get_last_error(intptr_t dev_handle) except? 0: + return cudlaGetLastError(dev_handle) + + +cpdef destroy_device(intptr_t dev_handle): + with nogil: + __status__ = cudlaDestroyDevice(dev_handle) + check_status(__status__) + + +cpdef set_task_timeout_in_ms(intptr_t dev_handle, uint32_t timeout): + with nogil: + __status__ = cudlaSetTaskTimeoutInMs(dev_handle, timeout) + check_status(__status__) + + +cpdef module_get_attributes(intptr_t h_module, int attr_type) except *: + """Query module attributes, interpreting the cudlaModuleAttribute union + based on the requested attribute type. + + For count attributes (NUM_INPUT_TENSORS, NUM_OUTPUT_TENSORS, + NUM_OUTPUT_TASK_STATISTICS), returns an int. + + For descriptor attributes (INPUT_TENSOR_DESCRIPTORS, + OUTPUT_TENSOR_DESCRIPTORS, OUTPUT_TASK_STATISTICS_DESCRIPTORS), + returns a list of ModuleTensorDescriptor objects. + """ + cdef int _attr_type = attr_type + cdef cudlaModuleAttribute count_attr + cdef cudlaModuleAttribute num_attr + cdef cudlaModuleAttribute desc_attr + cdef uint32_t count + cdef cudlaModuleTensorDescriptor* desc_buf + cdef uint32_t i + cdef int num_attr_type + + if _attr_type == CUDLA_NUM_INPUT_TENSORS or _attr_type == CUDLA_NUM_OUTPUT_TENSORS or _attr_type == CUDLA_NUM_OUTPUT_TASK_STATISTICS: + with nogil: + __status__ = cudlaModuleGetAttributes(h_module, _attr_type, &count_attr) + check_status(__status__) + return (count_attr.numInputTensors) + elif _attr_type == CUDLA_INPUT_TENSOR_DESCRIPTORS or _attr_type == CUDLA_OUTPUT_TENSOR_DESCRIPTORS or _attr_type == CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS: + if _attr_type == CUDLA_INPUT_TENSOR_DESCRIPTORS: + num_attr_type = CUDLA_NUM_INPUT_TENSORS + elif _attr_type == CUDLA_OUTPUT_TENSOR_DESCRIPTORS: + num_attr_type = CUDLA_NUM_OUTPUT_TENSORS + else: + num_attr_type = CUDLA_NUM_OUTPUT_TASK_STATISTICS + with nogil: + __status__ = cudlaModuleGetAttributes(h_module, num_attr_type, &num_attr) + check_status(__status__) + 
count = num_attr.numInputTensors + desc_buf = malloc(count * sizeof(cudlaModuleTensorDescriptor)) + if desc_buf == NULL: + raise MemoryError("Failed to allocate descriptor buffer") + try: + desc_attr.inputTensorDesc = desc_buf + with nogil: + __status__ = cudlaModuleGetAttributes(h_module, _attr_type, &desc_attr) + check_status(__status__) + result = [] + for i in range(count): + result.append(ModuleTensorDescriptor.from_ptr(&desc_buf[i], readonly=True)) + return result + finally: + free(desc_buf) + else: + raise ValueError(f"Unknown attribute type: {attr_type}") diff --git a/cuda_bindings/cuda/bindings/cycudla.pxd b/cuda_bindings/cuda/bindings/cycudla.pxd new file mode 100644 index 00000000000..42578710de9 --- /dev/null +++ b/cuda_bindings/cuda/bindings/cycudla.pxd @@ -0,0 +1,155 @@ +# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. +# This layer exposes the C header to Cython as-is. + +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdint cimport intptr_t, uintptr_t +from libc.stddef cimport size_t + + + + +############################################################################### +# Types (structs, enums, ...) 
+############################################################################### + +# enums +ctypedef enum cudlaStatus "cudlaStatus": + cudlaSuccess "cudlaSuccess" = 0 + cudlaErrorInvalidParam "cudlaErrorInvalidParam" = 1 + cudlaErrorOutOfResources "cudlaErrorOutOfResources" = 2 + cudlaErrorCreationFailed "cudlaErrorCreationFailed" = 3 + cudlaErrorInvalidAddress "cudlaErrorInvalidAddress" = 4 + cudlaErrorOs "cudlaErrorOs" = 5 + cudlaErrorCuda "cudlaErrorCuda" = 6 + cudlaErrorUmd "cudlaErrorUmd" = 7 + cudlaErrorInvalidDevice "cudlaErrorInvalidDevice" = 8 + cudlaErrorInvalidAttribute "cudlaErrorInvalidAttribute" = 9 + cudlaErrorIncompatibleDlaSWVersion "cudlaErrorIncompatibleDlaSWVersion" = 10 + cudlaErrorMemoryRegistered "cudlaErrorMemoryRegistered" = 11 + cudlaErrorInvalidModule "cudlaErrorInvalidModule" = 12 + cudlaErrorUnsupportedOperation "cudlaErrorUnsupportedOperation" = 13 + cudlaErrorNvSci "cudlaErrorNvSci" = 14 + cudlaErrorDlaErrInvalidInput "cudlaErrorDlaErrInvalidInput" = 0x40000001 + cudlaErrorDlaErrInvalidPreAction "cudlaErrorDlaErrInvalidPreAction" = 0x40000002 + cudlaErrorDlaErrNoMem "cudlaErrorDlaErrNoMem" = 0x40000003 + cudlaErrorDlaErrProcessorBusy "cudlaErrorDlaErrProcessorBusy" = 0x40000004 + cudlaErrorDlaErrTaskStatusMismatch "cudlaErrorDlaErrTaskStatusMismatch" = 0x40000005 + cudlaErrorDlaErrEngineTimeout "cudlaErrorDlaErrEngineTimeout" = 0x40000006 + cudlaErrorDlaErrDataMismatch "cudlaErrorDlaErrDataMismatch" = 0x40000007 + cudlaErrorUnknown "cudlaErrorUnknown" = 0x7fffffff + _CUDLASTATUS_INTERNAL_LOADING_ERROR "_CUDLASTATUS_INTERNAL_LOADING_ERROR" = -42 + +ctypedef enum cudlaMode "cudlaMode": + CUDLA_CUDA_DLA "CUDLA_CUDA_DLA" = 0 + CUDLA_STANDALONE "CUDLA_STANDALONE" = 1 + +ctypedef enum cudlaModuleAttributeType "cudlaModuleAttributeType": + CUDLA_NUM_INPUT_TENSORS "CUDLA_NUM_INPUT_TENSORS" = 0 + CUDLA_NUM_OUTPUT_TENSORS "CUDLA_NUM_OUTPUT_TENSORS" = 1 + CUDLA_INPUT_TENSOR_DESCRIPTORS "CUDLA_INPUT_TENSOR_DESCRIPTORS" = 2 + 
CUDLA_OUTPUT_TENSOR_DESCRIPTORS "CUDLA_OUTPUT_TENSOR_DESCRIPTORS" = 3 + CUDLA_NUM_OUTPUT_TASK_STATISTICS "CUDLA_NUM_OUTPUT_TASK_STATISTICS" = 4 + CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS "CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS" = 5 + +ctypedef enum cudlaFenceType "cudlaFenceType": + CUDLA_NVSCISYNC_FENCE "CUDLA_NVSCISYNC_FENCE" = 1 + CUDLA_NVSCISYNC_FENCE_SOF "CUDLA_NVSCISYNC_FENCE_SOF" = 2 + +ctypedef enum cudlaModuleLoadFlags "cudlaModuleLoadFlags": + CUDLA_MODULE_DEFAULT "CUDLA_MODULE_DEFAULT" = 0 + CUDLA_MODULE_ENABLE_FAULT_DIAGNOSTICS "CUDLA_MODULE_ENABLE_FAULT_DIAGNOSTICS" = 1 + +ctypedef enum cudlaSubmissionFlags "cudlaSubmissionFlags": + CUDLA_SUBMIT_NOOP "CUDLA_SUBMIT_NOOP" = 1 + CUDLA_SUBMIT_SKIP_LOCK_ACQUIRE "CUDLA_SUBMIT_SKIP_LOCK_ACQUIRE" = (1 << 1) + CUDLA_SUBMIT_DIAGNOSTICS_TASK "CUDLA_SUBMIT_DIAGNOSTICS_TASK" = (1 << 2) + +ctypedef enum cudlaAccessPermissionFlags "cudlaAccessPermissionFlags": + CUDLA_READ_WRITE_PERM "CUDLA_READ_WRITE_PERM" = 0 + CUDLA_READ_ONLY_PERM "CUDLA_READ_ONLY_PERM" = 1 + CUDLA_TASK_STATISTICS "CUDLA_TASK_STATISTICS" = (1 << 1) + +ctypedef enum cudlaDevAttributeType "cudlaDevAttributeType": + CUDLA_UNIFIED_ADDRESSING "CUDLA_UNIFIED_ADDRESSING" = 0 + CUDLA_DEVICE_VERSION "CUDLA_DEVICE_VERSION" = 1 + +# types +ctypedef void* cudlaDevHandle 'cudlaDevHandle' +ctypedef void* cudlaModule 'cudlaModule' +ctypedef struct cudlaExternalMemoryHandleDesc_t 'cudlaExternalMemoryHandleDesc_t': + void* extBufObject + unsigned long long size + +ctypedef struct cudlaExternalSemaphoreHandleDesc_t 'cudlaExternalSemaphoreHandleDesc_t': + void* extSyncObject + +ctypedef struct cudlaModuleTensorDescriptor 'cudlaModuleTensorDescriptor': + char name[(80U + 1)] + uint64_t size + uint64_t n + uint64_t c + uint64_t h + uint64_t w + uint8_t dataFormat + uint8_t dataType + uint8_t dataCategory + uint8_t pixelFormat + uint8_t pixelMapping + uint32_t stride[8U] + +ctypedef struct CudlaFence 'CudlaFence': + void* fence + cudlaFenceType type + +ctypedef union 
cudlaDevAttribute 'cudlaDevAttribute': + uint8_t unifiedAddressingSupported + uint32_t deviceVersion + +ctypedef union cudlaModuleAttribute 'cudlaModuleAttribute': + uint32_t numInputTensors + uint32_t numOutputTensors + cudlaModuleTensorDescriptor* inputTensorDesc + cudlaModuleTensorDescriptor* outputTensorDesc + +ctypedef struct cudlaWaitEvents 'cudlaWaitEvents': + CudlaFence* preFences + uint32_t numEvents + +ctypedef struct cudlaSignalEvents 'cudlaSignalEvents': + uint64_t** devPtrs + CudlaFence* eofFences + uint32_t numEvents + +ctypedef struct cudlaTask 'cudlaTask': + cudlaModule moduleHandle + uint64_t** outputTensor + uint32_t numOutputTensors + uint32_t numInputTensors + uint64_t** inputTensor + cudlaWaitEvents* waitEvents + cudlaSignalEvents* signalEvents + + +# Typedef aliases for struct types (struct has _t, typedef doesn't) +ctypedef cudlaExternalMemoryHandleDesc_t cudlaExternalMemoryHandleDesc 'cudlaExternalMemoryHandleDesc' +ctypedef cudlaExternalSemaphoreHandleDesc_t cudlaExternalSemaphoreHandleDesc 'cudlaExternalSemaphoreHandleDesc' + + +############################################################################### +# Functions +############################################################################### + +cdef cudlaStatus cudlaGetVersion(uint64_t* const version) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaDeviceGetCount(uint64_t* const pNumDevices) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaCreateDevice(const uint64_t device, cudlaDevHandle* const devHandle, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaMemRegister(const cudlaDevHandle devHandle, const uint64_t* const ptr, const size_t size, uint64_t** const devPtr, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaModuleLoadFromMemory(const cudlaDevHandle devHandle, const uint8_t* const pModule, const size_t moduleSize, cudlaModule* const 
hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaModuleGetAttributes(const cudlaModule hModule, const cudlaModuleAttributeType attrType, cudlaModuleAttribute* const attribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaModuleUnload(const cudlaModule hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaSubmitTask(const cudlaDevHandle devHandle, const cudlaTask* const ptrToTasks, const uint32_t numTasks, void* const stream, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaDeviceGetAttribute(const cudlaDevHandle devHandle, const cudlaDevAttributeType attrib, cudlaDevAttribute* const pAttribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaMemUnregister(const cudlaDevHandle devHandle, const uint64_t* const devPtr) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaGetLastError(const cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaDestroyDevice(const cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil +cdef cudlaStatus cudlaSetTaskTimeoutInMs(const cudlaDevHandle devHandle, const uint32_t timeout) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil diff --git a/cuda_bindings/cuda/bindings/cycudla.pyx b/cuda_bindings/cuda/bindings/cycudla.pyx new file mode 100644 index 00000000000..d64e28f847d --- /dev/null +++ b/cuda_bindings/cuda/bindings/cycudla.pyx @@ -0,0 +1,61 @@ +# This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. 
+ +from ._internal cimport cudla as _cudla + + + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cudlaStatus cudlaGetVersion(uint64_t* const version) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaGetVersion(version) + + +cdef cudlaStatus cudlaDeviceGetCount(uint64_t* const pNumDevices) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaDeviceGetCount(pNumDevices) + + +cdef cudlaStatus cudlaCreateDevice(const uint64_t device, cudlaDevHandle* const devHandle, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaCreateDevice(device, devHandle, flags) + + +cdef cudlaStatus cudlaMemRegister(const cudlaDevHandle devHandle, const uint64_t* const ptr, const size_t size, uint64_t** const devPtr, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaMemRegister(devHandle, ptr, size, devPtr, flags) + + +cdef cudlaStatus cudlaModuleLoadFromMemory(const cudlaDevHandle devHandle, const uint8_t* const pModule, const size_t moduleSize, cudlaModule* const hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaModuleLoadFromMemory(devHandle, pModule, moduleSize, hModule, flags) + + +cdef cudlaStatus cudlaModuleGetAttributes(const cudlaModule hModule, const cudlaModuleAttributeType attrType, cudlaModuleAttribute* const attribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaModuleGetAttributes(hModule, attrType, attribute) + + +cdef cudlaStatus cudlaModuleUnload(const cudlaModule hModule, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaModuleUnload(hModule, flags) + + +cdef cudlaStatus cudlaSubmitTask(const cudlaDevHandle devHandle, const cudlaTask* const ptrToTasks, const uint32_t numTasks, void* 
const stream, const uint32_t flags) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaSubmitTask(devHandle, ptrToTasks, numTasks, stream, flags) + + +cdef cudlaStatus cudlaDeviceGetAttribute(const cudlaDevHandle devHandle, const cudlaDevAttributeType attrib, cudlaDevAttribute* const pAttribute) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaDeviceGetAttribute(devHandle, attrib, pAttribute) + + +cdef cudlaStatus cudlaMemUnregister(const cudlaDevHandle devHandle, const uint64_t* const devPtr) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaMemUnregister(devHandle, devPtr) + + +cdef cudlaStatus cudlaGetLastError(const cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaGetLastError(devHandle) + + +cdef cudlaStatus cudlaDestroyDevice(const cudlaDevHandle devHandle) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaDestroyDevice(devHandle) + + +cdef cudlaStatus cudlaSetTaskTimeoutInMs(const cudlaDevHandle devHandle, const uint32_t timeout) except?_CUDLASTATUS_INTERNAL_LOADING_ERROR nogil: + return _cudla._cudlaSetTaskTimeoutInMs(devHandle, timeout) diff --git a/cuda_bindings/docs/source/api.rst b/cuda_bindings/docs/source/api.rst index a47b65a9297..07eef44252f 100644 --- a/cuda_bindings/docs/source/api.rst +++ b/cuda_bindings/docs/source/api.rst @@ -15,6 +15,7 @@ CUDA Python API Reference module/nvjitlink module/nvvm module/nvfatbin + module/cudla module/cufile module/nvml module/utils diff --git a/cuda_bindings/docs/source/module/cudla.rst b/cuda_bindings/docs/source/module/cudla.rst new file mode 100644 index 00000000000..e1eb720568e --- /dev/null +++ b/cuda_bindings/docs/source/module/cudla.rst @@ -0,0 +1,67 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +.. 
default-role:: cpp:any + +cudla +===== + +Note +---- + +The cuDLA bindings require a Jetson platform with DLA hardware (Xavier or Orin). +cuDLA is not available on desktop GPUs. + +Functions +--------- + +cuDLA defines the following functions for DLA device management and inference. + +.. autofunction:: cuda.bindings.cudla.get_version +.. autofunction:: cuda.bindings.cudla.device_get_count +.. autofunction:: cuda.bindings.cudla.create_device +.. autofunction:: cuda.bindings.cudla.destroy_device +.. autofunction:: cuda.bindings.cudla.mem_register +.. autofunction:: cuda.bindings.cudla.mem_unregister +.. autofunction:: cuda.bindings.cudla.module_load_from_memory +.. autofunction:: cuda.bindings.cudla.module_get_attributes +.. autofunction:: cuda.bindings.cudla.module_unload +.. autofunction:: cuda.bindings.cudla.submit_task +.. autofunction:: cuda.bindings.cudla.device_get_attribute +.. autofunction:: cuda.bindings.cudla.get_last_error +.. autofunction:: cuda.bindings.cudla.import_external_memory +.. autofunction:: cuda.bindings.cudla.import_external_semaphore +.. autofunction:: cuda.bindings.cudla.get_nv_sci_sync_attributes +.. autofunction:: cuda.bindings.cudla.set_task_timeout_in_ms + +Types +----- + +.. autoclass:: cuda.bindings.cudla.ExternalMemoryHandleDesc +.. autoclass:: cuda.bindings.cudla.ExternalSemaphoreHandleDesc +.. autoclass:: cuda.bindings.cudla.ModuleTensorDescriptor +.. autoclass:: cuda.bindings.cudla.Fence +.. autoclass:: cuda.bindings.cudla.DevAttribute +.. autoclass:: cuda.bindings.cudla.ModuleAttribute +.. autoclass:: cuda.bindings.cudla.WaitEvents +.. autoclass:: cuda.bindings.cudla.SignalEvents +.. autoclass:: cuda.bindings.cudla.Task + +Enums +----- + +.. autoclass:: cuda.bindings.cudla.Status + + .. autoattribute:: cuda.bindings.cudla.Status.SUCCESS + +.. autoclass:: cuda.bindings.cudla.Mode + + .. autoattribute:: cuda.bindings.cudla.Mode.CUDA_DLA + .. autoattribute:: cuda.bindings.cudla.Mode.STANDALONE + +.. 
autoclass:: cuda.bindings.cudla.ModuleAttributeType +.. autoclass:: cuda.bindings.cudla.FenceType +.. autoclass:: cuda.bindings.cudla.ModuleLoadFlags +.. autoclass:: cuda.bindings.cudla.SubmissionFlags +.. autoclass:: cuda.bindings.cudla.AccessPermissionFlags +.. autoclass:: cuda.bindings.cudla.DevAttributeType diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index d72ab7f7750..bac72f5b9be 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -36,7 +36,7 @@ dependencies = ["cuda-pathfinder >=1.4.2"] [project.optional-dependencies] all = [ - "cuda-toolkit[nvrtc,nvjitlink,nvvm,nvfatbin]==13.*", + "cuda-toolkit[nvrtc,nvjitlink,nvvm,nvfatbin,cudla]==13.*", "cuda-toolkit[cufile]==13.*; sys_platform == 'linux'", ] diff --git a/cuda_bindings/tests/cudla/conftest.py b/cuda_bindings/tests/cudla/conftest.py new file mode 100644 index 00000000000..2d08058f735 --- /dev/null +++ b/cuda_bindings/tests/cudla/conftest.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import pytest + + +@pytest.fixture(scope="module", autouse=True) +def ctx(): + """Override the parent conftest's ``ctx`` fixture which creates a CUDA + context. cuDLA tests do not require a CUDA context, so this no-op + prevents ``cuInit`` / ``cuCtxCreate`` from running (and failing on + machines without a CUDA-capable GPU).""" + yield None diff --git a/cuda_bindings/tests/cudla/test_cudla_bindings.py b/cuda_bindings/tests/cudla/test_cudla_bindings.py new file mode 100644 index 00000000000..4caca695840 --- /dev/null +++ b/cuda_bindings/tests/cudla/test_cudla_bindings.py @@ -0,0 +1,305 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import pytest + +cudla = pytest.importorskip("cuda.bindings.cudla") + + +def _cudla_library_available(): + """Check if the cuDLA shared library is loaded and usable.""" + try: + from cuda.bindings._internal import cudla as _inner + + return _inner._inspect_function_pointer("__cudlaGetVersion") != 0 + except Exception: + return False + + +requires_cudla_library = pytest.mark.skipif( + not _cudla_library_available(), + reason="cuDLA library not available (requires NVIDIA Orin with DLA)", +) + + +# --------------------------------------------------------------------------- +# Enum tests (always run -- no library needed) +# --------------------------------------------------------------------------- + + +class TestEnums: + def test_status_values(self): + assert cudla.Status.Success == 0 + assert cudla.Status.ErrorInvalidParam == 1 + assert cudla.Status.ErrorOutOfResources == 2 + assert cudla.Status.ErrorCreationFailed == 3 + assert cudla.Status.ErrorInvalidAddress == 4 + assert cudla.Status.ErrorOs == 5 + assert cudla.Status.ErrorCuda == 6 + assert cudla.Status.ErrorUmd == 7 + assert cudla.Status.ErrorInvalidDevice == 8 + assert cudla.Status.ErrorInvalidAttribute == 9 + assert cudla.Status.ErrorIncompatibleDlaSWVersion == 10 + assert cudla.Status.ErrorMemoryRegistered == 11 + assert cudla.Status.ErrorInvalidModule == 12 + assert cudla.Status.ErrorUnsupportedOperation == 13 + assert cudla.Status.ErrorNvSci == 14 + assert cudla.Status.ErrorDlaErrInvalidInput == 0x40000001 + assert cudla.Status.ErrorDlaErrInvalidPreAction == 0x40000002 + assert cudla.Status.ErrorDlaErrNoMem == 0x40000003 + assert cudla.Status.ErrorDlaErrProcessorBusy == 0x40000004 + assert cudla.Status.ErrorDlaErrTaskStatusMismatch == 0x40000005 + assert cudla.Status.ErrorDlaErrEngineTimeout == 0x40000006 + assert cudla.Status.ErrorDlaErrDataMismatch == 0x40000007 + assert cudla.Status.ErrorUnknown == 0x7FFFFFFF + + def 
test_status_member_count(self): + assert len(cudla.Status) == 23 + + def test_mode_values(self): + assert cudla.Mode.CUDA_DLA == 0 + assert cudla.Mode.STANDALONE == 1 + + def test_module_attribute_type_values(self): + assert cudla.ModuleAttributeType.NUM_INPUT_TENSORS == 0 + assert cudla.ModuleAttributeType.NUM_OUTPUT_TENSORS == 1 + assert cudla.ModuleAttributeType.INPUT_TENSOR_DESCRIPTORS == 2 + assert cudla.ModuleAttributeType.OUTPUT_TENSOR_DESCRIPTORS == 3 + assert cudla.ModuleAttributeType.NUM_OUTPUT_TASK_STATISTICS == 4 + assert cudla.ModuleAttributeType.OUTPUT_TASK_STATISTICS_DESCRIPTORS == 5 + + def test_fence_type_values(self): + assert cudla.FenceType.NVSCISYNC_FENCE == 1 + assert cudla.FenceType.NVSCISYNC_FENCE_SOF == 2 + + def test_module_load_flags(self): + assert cudla.ModuleLoadFlags.MODULE_DEFAULT == 0 + assert cudla.ModuleLoadFlags.MODULE_ENABLE_FAULT_DIAGNOSTICS == 1 + + def test_submission_flags(self): + assert cudla.SubmissionFlags.SUBMIT_NOOP == 1 + assert cudla.SubmissionFlags.SUBMIT_SKIP_LOCK_ACQUIRE == 2 + assert cudla.SubmissionFlags.SUBMIT_DIAGNOSTICS_TASK == 4 + + def test_access_permission_flags(self): + assert cudla.AccessPermissionFlags.READ_WRITE_PERM == 0 + assert cudla.AccessPermissionFlags.READ_ONLY_PERM == 1 + assert cudla.AccessPermissionFlags.TASK_STATISTICS == 2 + + def test_dev_attribute_type(self): + assert cudla.DevAttributeType.UNIFIED_ADDRESSING == 0 + assert cudla.DevAttributeType.DEVICE_VERSION == 1 + + +# --------------------------------------------------------------------------- +# POD type tests (always run -- no library needed) +# --------------------------------------------------------------------------- + + +class TestPodTypes: + def test_external_memory_handle_desc(self): + desc = cudla.ExternalMemoryHandleDesc() + desc.size_ = 4096 + assert desc.size_ == 4096 + desc.ext_buf_object = 0xABCD + assert desc.ext_buf_object == 0xABCD + + def test_external_semaphore_handle_desc(self): + desc = 
cudla.ExternalSemaphoreHandleDesc() + desc.ext_sync_object = 0x1234 + assert desc.ext_sync_object == 0x1234 + + def test_module_tensor_descriptor_fields(self): + desc = cudla.ModuleTensorDescriptor() + assert desc.size_ == 0 + assert desc.n == 0 + assert desc.c == 0 + assert desc.h == 0 + assert desc.w == 0 + assert desc.data_format == 0 + assert desc.data_type == 0 + assert desc.data_category == 0 + assert desc.pixel_format == 0 + assert desc.pixel_mapping == 0 + + def test_module_tensor_descriptor_name(self): + desc = cudla.ModuleTensorDescriptor() + name = desc.name + assert isinstance(name, (str, bytes)) + + def test_module_tensor_descriptor_stride(self): + desc = cudla.ModuleTensorDescriptor() + stride = desc.stride + assert len(stride) == 8 + + def test_fence(self): + fence = cudla.Fence() + fence.fence = 0xBEEF + assert fence.fence == 0xBEEF + fence.type = int(cudla.FenceType.NVSCISYNC_FENCE) + assert fence.type == 1 + + def test_dev_attribute(self): + attr = cudla.DevAttribute() + assert attr.unified_addressing_supported == 0 + assert attr.device_version == 0 + attr.unified_addressing_supported = 1 + assert attr.unified_addressing_supported == 1 + attr.device_version = 0x20 + assert attr.device_version == 0x20 + + def test_module_attribute(self): + attr = cudla.ModuleAttribute() + assert attr.num_input_tensors == 0 + assert attr.num_output_tensors == 0 + attr.num_input_tensors = 3 + assert attr.num_input_tensors == 3 + attr.num_output_tensors = 1 + assert attr.num_output_tensors == 1 + + def test_wait_events(self): + we = cudla.WaitEvents() + assert we.pre_fences == [] + + def test_signal_events(self): + se = cudla.SignalEvents() + assert se.eof_fences == [] + + def test_task_construction(self): + task = cudla.Task() + task.module_handle = 0xDEAD + assert task.module_handle == 0xDEAD + + def test_task_input_tensor_auto_size(self): + task = cudla.Task() + task.input_tensor = [0x1000, 0x2000, 0x3000] + assert len(task.input_tensor) == 3 + + def 
test_task_output_tensor_auto_size(self): + task = cudla.Task() + task.output_tensor = [0x4000] + assert len(task.output_tensor) == 1 + + def test_task_combined(self): + task = cudla.Task() + task.module_handle = 0xABCD + task.input_tensor = [0x1000, 0x2000] + task.output_tensor = [0x3000] + task.wait_events = 0 + task.signal_events = 0 + assert task.module_handle == 0xABCD + assert len(task.input_tensor) == 2 + assert len(task.output_tensor) == 1 + + def test_pod_ptr_is_nonzero(self): + """Verify that int(pod) returns a nonzero pointer (memory is allocated).""" + task = cudla.Task() + assert int(task) != 0 + desc = cudla.ModuleTensorDescriptor() + assert int(desc) != 0 + + +# --------------------------------------------------------------------------- +# Error type tests (always run -- no library needed) +# --------------------------------------------------------------------------- + + +class TestErrorType: + def test_cudla_error_is_exception(self): + assert issubclass(cudla.CudlaError, Exception) + + def test_cudla_error_stores_status(self): + err = cudla.CudlaError(int(cudla.Status.ErrorInvalidParam)) + assert err.status == int(cudla.Status.ErrorInvalidParam) + + def test_cudla_error_str(self): + err = cudla.CudlaError(int(cudla.Status.ErrorInvalidParam)) + assert "ErrorInvalidParam" in str(err) + + +# --------------------------------------------------------------------------- +# API surface tests (always run -- no library needed) +# --------------------------------------------------------------------------- + + +class TestApiSurface: + """Verify that all expected functions exist as callable attributes.""" + + @pytest.mark.parametrize( + "func_name", + [ + "get_version", + "device_get_count", + "create_device", + "destroy_device", + "mem_register", + "mem_unregister", + "module_load_from_memory", + "module_get_attributes", + "module_unload", + "submit_task", + "device_get_attribute", + "get_last_error", + "set_task_timeout_in_ms", + ], + ) + def 
test_function_exists(self, func_name): + assert callable(getattr(cudla, func_name)) + + +# --------------------------------------------------------------------------- +# Function tests (hardware-gated -- skipped when libcudla.so is unavailable) +# --------------------------------------------------------------------------- + + +@requires_cudla_library +class TestFunctions: + def test_get_version(self): + version = cudla.get_version() + assert version > 0 + + def test_device_get_count(self): + count = cudla.device_get_count() + assert count >= 0 + + def test_create_destroy_device(self): + from cuda.bindings import driver, runtime + + driver.cuInit(0) + runtime.cudaSetDevice(0) + + handle = cudla.create_device(0, int(cudla.Mode.CUDA_DLA)) + try: + assert handle != 0 + finally: + cudla.destroy_device(handle) + + def test_create_device_rejects_standalone(self): + with pytest.raises(cudla.CudlaError, match="ErrorUnsupportedOperation"): + cudla.create_device(0, int(cudla.Mode.STANDALONE)) + + def test_create_device_rejects_standalone_raw_int(self): + with pytest.raises(cudla.CudlaError, match="ErrorUnsupportedOperation"): + cudla.create_device(0, 1) + + def test_mem_register_unregister(self): + from cuda.bindings import driver, runtime + + driver.cuInit(0) + runtime.cudaSetDevice(0) + + dev_handle = cudla.create_device(0, int(cudla.Mode.CUDA_DLA)) + try: + buf_size = 1024 + err, gpu_ptr = runtime.cudaMalloc(buf_size) + assert err.value == 0, f"cudaMalloc failed: {err}" + try: + registered_ptr = cudla.mem_register( + dev_handle, int(gpu_ptr), buf_size, 0 + ) + assert registered_ptr != 0 + cudla.mem_unregister(dev_handle, registered_ptr) + finally: + runtime.cudaFree(gpu_ptr) + finally: + cudla.destroy_device(dev_handle) From dcd089e4ec0b04571b845e2f4fbbe205e82488ca Mon Sep 17 00:00:00 2001 From: Nikshay Shrivastava Date: Wed, 6 May 2026 07:29:34 -0700 Subject: [PATCH 2/4] fixup: ruff lint fixes --- cuda_bindings/tests/cudla/conftest.py | 2 +- 
cuda_bindings/tests/cudla/test_cudla_bindings.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cuda_bindings/tests/cudla/conftest.py b/cuda_bindings/tests/cudla/conftest.py index 2d08058f735..c22097fce97 100644 --- a/cuda_bindings/tests/cudla/conftest.py +++ b/cuda_bindings/tests/cudla/conftest.py @@ -10,4 +10,4 @@ def ctx(): context. cuDLA tests do not require a CUDA context, so this no-op prevents ``cuInit`` / ``cuCtxCreate`` from running (and failing on machines without a CUDA-capable GPU).""" - yield None + return None diff --git a/cuda_bindings/tests/cudla/test_cudla_bindings.py b/cuda_bindings/tests/cudla/test_cudla_bindings.py index 4caca695840..4a756329d75 100644 --- a/cuda_bindings/tests/cudla/test_cudla_bindings.py +++ b/cuda_bindings/tests/cudla/test_cudla_bindings.py @@ -294,9 +294,7 @@ def test_mem_register_unregister(self): err, gpu_ptr = runtime.cudaMalloc(buf_size) assert err.value == 0, f"cudaMalloc failed: {err}" try: - registered_ptr = cudla.mem_register( - dev_handle, int(gpu_ptr), buf_size, 0 - ) + registered_ptr = cudla.mem_register(dev_handle, int(gpu_ptr), buf_size, 0) assert registered_ptr != 0 cudla.mem_unregister(dev_handle, registered_ptr) finally: From 285eeb59d548799dbdf341bf55695bd9ef2b6f55 Mon Sep 17 00:00:00 2001 From: Nikshay Shrivastava Date: Wed, 6 May 2026 11:11:37 -0700 Subject: [PATCH 3/4] Add SPDX license headers to cuDLA binding files --- cuda_bindings/cuda/bindings/_internal/cudla.pxd | 4 ++++ cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx | 4 ++++ cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx | 4 ++++ cuda_bindings/cuda/bindings/cudla.pxd | 4 ++++ cuda_bindings/cuda/bindings/cudla.pyx | 7 ++++--- cuda_bindings/cuda/bindings/cycudla.pxd | 7 ++++++- cuda_bindings/cuda/bindings/cycudla.pyx | 4 ++++ 7 files changed, 30 insertions(+), 4 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_internal/cudla.pxd b/cuda_bindings/cuda/bindings/_internal/cudla.pxd index 
57fef6a7323..2cbdc706f5a 100644 --- a/cuda_bindings/cuda/bindings/_internal/cudla.pxd +++ b/cuda_bindings/cuda/bindings/_internal/cudla.pxd @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. from ..cycudla cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx b/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx index 6a0f4f2984b..10202ec3e36 100644 --- a/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx b/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx index 2cddfd6d19e..5baf53efb45 100644 --- a/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. 
from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/cudla.pxd b/cuda_bindings/cuda/bindings/cudla.pxd index 09b55c579b6..484e17bd50b 100644 --- a/cuda_bindings/cuda/bindings/cudla.pxd +++ b/cuda_bindings/cuda/bindings/cudla.pxd @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/cudla.pyx b/cuda_bindings/cuda/bindings/cudla.pyx index c11b2494e1b..1ade8672599 100644 --- a/cuda_bindings/cuda/bindings/cudla.pyx +++ b/cuda_bindings/cuda/bindings/cudla.pyx @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. cimport cython # NOQA @@ -761,7 +765,6 @@ dev_attribute_dtype = _numpy.dtype(( } )) - cdef class DevAttribute: """Empty-initialize an instance of `cudlaDevAttribute`. @@ -901,7 +904,6 @@ module_attribute_dtype = _numpy.dtype(( } )) - cdef class ModuleAttribute: """Empty-initialize an instance of `cudlaModuleAttribute`. @@ -1571,7 +1573,6 @@ cdef class Task: return obj - ############################################################################### # Enum ############################################################################### diff --git a/cuda_bindings/cuda/bindings/cycudla.pxd b/cuda_bindings/cuda/bindings/cycudla.pxd index 42578710de9..50410519246 100644 --- a/cuda_bindings/cuda/bindings/cycudla.pxd +++ b/cuda_bindings/cuda/bindings/cycudla.pxd @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. # This layer exposes the C header to Cython as-is. @@ -76,7 +80,9 @@ ctypedef enum cudlaDevAttributeType "cudlaDevAttributeType": # types ctypedef void* cudlaDevHandle 'cudlaDevHandle' + ctypedef void* cudlaModule 'cudlaModule' + ctypedef struct cudlaExternalMemoryHandleDesc_t 'cudlaExternalMemoryHandleDesc_t': void* extBufObject unsigned long long size @@ -130,7 +136,6 @@ ctypedef struct cudlaTask 'cudlaTask': cudlaWaitEvents* waitEvents cudlaSignalEvents* signalEvents - # Typedef aliases for struct types (struct has _t, typedef doesn't) ctypedef cudlaExternalMemoryHandleDesc_t cudlaExternalMemoryHandleDesc 'cudlaExternalMemoryHandleDesc' ctypedef cudlaExternalSemaphoreHandleDesc_t cudlaExternalSemaphoreHandleDesc 'cudlaExternalSemaphoreHandleDesc' diff --git a/cuda_bindings/cuda/bindings/cycudla.pyx b/cuda_bindings/cuda/bindings/cycudla.pyx index d64e28f847d..7fd88c17648 100644 --- a/cuda_bindings/cuda/bindings/cycudla.pyx +++ b/cuda_bindings/cuda/bindings/cycudla.pyx @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. 
from ._internal cimport cudla as _cudla From 04de604cbc386a98ca6fe5efdd4dd11bfe6ff543 Mon Sep 17 00:00:00 2001 From: Nikshay Shrivastava Date: Wed, 6 May 2026 11:27:05 -0700 Subject: [PATCH 4/4] fixed SPDX license headers format --- cuda_bindings/cuda/bindings/_internal/cudla.pxd | 5 ++--- cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx | 5 ++--- cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx | 5 ++--- cuda_bindings/cuda/bindings/cudla.pxd | 5 ++--- cuda_bindings/cuda/bindings/cudla.pyx | 5 ++--- cuda_bindings/cuda/bindings/cycudla.pxd | 5 ++--- cuda_bindings/cuda/bindings/cycudla.pyx | 5 ++--- 7 files changed, 14 insertions(+), 21 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_internal/cudla.pxd b/cuda_bindings/cuda/bindings/_internal/cudla.pxd index 2cbdc706f5a..beca59f3e6b 100644 --- a/cuda_bindings/cuda/bindings/_internal/cudla.pxd +++ b/cuda_bindings/cuda/bindings/_internal/cudla.pxd @@ -1,7 +1,6 @@ -# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# + # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. from ..cycudla cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx b/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx index 10202ec3e36..ccc53f32ca8 100644 --- a/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/cudla_linux.pyx @@ -1,7 +1,6 @@ -# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# + # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx b/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx index 5baf53efb45..2d91c16ee99 100644 --- a/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/cudla_windows.pyx @@ -1,7 +1,6 @@ -# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# + # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/cudla.pxd b/cuda_bindings/cuda/bindings/cudla.pxd index 484e17bd50b..786622ac9a9 100644 --- a/cuda_bindings/cuda/bindings/cudla.pxd +++ b/cuda_bindings/cuda/bindings/cudla.pxd @@ -1,7 +1,6 @@ -# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# + # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/cudla.pyx b/cuda_bindings/cuda/bindings/cudla.pyx index 1ade8672599..ff7569b9b7a 100644 --- a/cuda_bindings/cuda/bindings/cudla.pyx +++ b/cuda_bindings/cuda/bindings/cudla.pyx @@ -1,7 +1,6 @@ -# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
-# +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# + # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. cimport cython # NOQA diff --git a/cuda_bindings/cuda/bindings/cycudla.pxd b/cuda_bindings/cuda/bindings/cycudla.pxd index 50410519246..5bcbe623469 100644 --- a/cuda_bindings/cuda/bindings/cycudla.pxd +++ b/cuda_bindings/cuda/bindings/cycudla.pxd @@ -1,7 +1,6 @@ -# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# + # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. # This layer exposes the C header to Cython as-is. diff --git a/cuda_bindings/cuda/bindings/cycudla.pyx b/cuda_bindings/cuda/bindings/cycudla.pyx index 7fd88c17648..8d0cbdc5111 100644 --- a/cuda_bindings/cuda/bindings/cycudla.pyx +++ b/cuda_bindings/cuda/bindings/cycudla.pyx @@ -1,7 +1,6 @@ -# Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# + # This code was automatically generated with version 1.5.0, generator version 0.3.1.dev1465+gc5c5c8652. Do not modify it directly. from ._internal cimport cudla as _cudla