
Add support for thread block clusters #3017

Merged
maleadt merged 61 commits into JuliaGPU:master from eschnett:eschnett/clusters
Mar 17, 2026


Conversation

@eschnett
Contributor

No description provided.

@codecov

codecov Bot commented Jan 13, 2026

Codecov Report

❌ Patch coverage is 93.33333% with 2 lines in your changes missing coverage. Please review.
✅ Project coverage is 89.48%. Comparing base (5472295) to head (23d28a6).
⚠️ Report is 5 commits behind head on master.

Files with missing lines Patch % Lines
lib/cudadrv/execution.jl 93.33% 2 Missing ⚠️
Additional details and impacted files
@@           Coverage Diff           @@
##           master    #3017   +/-   ##
=======================================
  Coverage   89.48%   89.48%           
=======================================
  Files         148      148           
  Lines       13043    13066   +23     
=======================================
+ Hits        11671    11692   +21     
- Misses       1372     1374    +2     


@eschnett
Contributor Author

For the record, this PR addresses #1989.

@eschnett
Contributor Author

I'll still have to clean up the patch, but apart from that it's finished:

  • Kernels can be launched with non-trivial cluster sizes (via a new `clustersize` keyword)
  • New device functions query cluster indices and dimensions
  • Distributed shared memory is supported
  • There is a `cluster_wait` function

Missing cleanups:

  • Probably needs more `device_capability >= v"9.0"` checks
  • Requires LLVM >= 20, i.e. Julia >= 1.13; will need to add `@static if` statements for this
  • Need to make the tests pass
  • CUDA.jl is broken on Julia 1.13. I have a workaround, but I assume someone else will clean this up, and I'll then have to merge it
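Put together, the pieces above can be exercised as follows. This is a minimal sketch adapted from the test this PR adds in test/core/device/intrinsics/clusters.jl; it assumes a device with compute capability >= 9.0 (Hopper or newer), since earlier devices do not support thread block clusters.

```julia
using CUDA

# Each block publishes its in-cluster index into every other block's shared
# memory via distributed shared memory, then reads back what its peers wrote.
function kernel(A)
    bi = blockIdxInCluster().x   # block index within the cluster (1-based)
    nb = clusterDim().x          # number of blocks per cluster
    ci = clusterIdx().x          # cluster index within the grid

    sm = CuStaticSharedArray(Int32, 8)
    for i in 1:nb
        sm[i] = -1
    end
    cluster_wait()               # cluster-wide barrier

    for i in 1:nb
        # Map block i's copy of `sm` into this block's address space
        # (distributed shared memory) and write into it.
        dsm = CuDistributedSharedArray(sm, i)
        dsm[bi] = bi
    end
    cluster_wait()

    for i in 1:nb
        A[i, bi, ci] = sm[i]
    end
    return nothing
end

A = CUDA.zeros(Int32, 4, 4, 4)
# `clustersize` must evenly divide `blocks`.
@cuda threads=1 blocks=16 clustersize=4 kernel(A)
```

After the launch, every `A[i, bi, ci]` holds `i`, confirming that each block observed the writes its cluster peers made into its shared memory.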

@eschnett eschnett marked this pull request as ready for review March 10, 2026 20:21
@github-actions
Contributor

github-actions Bot commented Mar 10, 2026

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Suggested changes:
diff --git a/lib/cudadrv/execution.jl b/lib/cudadrv/execution.jl
index 693dddc72..37f07001b 100644
--- a/lib/cudadrv/execution.jl
+++ b/lib/cudadrv/execution.jl
@@ -59,7 +59,7 @@ internal kernel parameter buffer, or a pointer to device memory.
 This is a low-level call, prefer to use [`cudacall`](@ref) instead.
 """
 function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::CuDim=1,
-                clustersize::CuDim=1, cooperative::Bool=false, shmem::Integer=0,
+        clustersize::CuDim = 1, cooperative::Bool = false, shmem::Integer = 0,
                 stream::CuStream=stream()) where {N}
     blockdim = CuDim3(blocks)
     threaddim = CuDim3(threads)
@@ -67,13 +67,13 @@ function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::Cu
 
     attrs = CUDA.CUlaunchAttribute[]
     if cooperative
-        resize!(attrs, length(attrs)+1)
+        resize!(attrs, length(attrs) + 1)
         attr = pointer(attrs, length(attrs))
         attr.id = CUDA.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
-        attr.value.cooperative = 1;
+        attr.value.cooperative = 1
     end
     if clusterdim.x != 1 || clusterdim.y != 1 || clusterdim.z != 1
-        resize!(attrs, length(attrs)+1)
+        resize!(attrs, length(attrs) + 1)
         attr = pointer(attrs, length(attrs))
         attr.id = CUDA.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
         attr.value.clusterDim.x = clusterdim.x
@@ -81,10 +81,14 @@ function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::Cu
         attr.value.clusterDim.z = clusterdim.z
     end
 
-    GC.@preserve attrs stream begin
-        config = Ref(CUlaunchConfig(blockdim.x, blockdim.y, blockdim.z,
-                                    threaddim.x, threaddim.y, threaddim.z,
-                                    shmem, stream.handle, pointer(attrs), length(attrs)))
+    return GC.@preserve attrs stream begin
+        config = Ref(
+            CUlaunchConfig(
+                blockdim.x, blockdim.y, blockdim.z,
+                threaddim.x, threaddim.y, threaddim.z,
+                shmem, stream.handle, pointer(attrs), length(attrs)
+            )
+        )
         try
             pack_arguments(args...) do kernelParams
                 cuLaunchKernelEx(config, f, kernelParams, C_NULL)
@@ -106,7 +110,7 @@ end
         error("Grid dimensions $blockdim are not positive")
     (threaddim.x>0 && threaddim.y>0 && threaddim.z>0) ||
         error("Block dimensions $threaddim are not positive")
-    (clusterdim.x>0 && clusterdim.y>0 && clusterdim.z>0) ||
+    (clusterdim.x > 0 && clusterdim.y > 0 && clusterdim.z > 0) ||
         error("Cluster dimensions $clusterdim are not positive")
     (blockdim.x % clusterdim.x == 0 && blockdim.y % clusterdim.y == 0 && blockdim.z % clusterdim.z == 0) ||
         error("Block dimensions $blockdim are not multiples of the cluster dimensions $clusterdim")
@@ -147,9 +151,9 @@ end
         # which reports a value that depends on the function's attributes.
     else
         # Thread block clusters are not supported
-         if active_clusters > 1
-             error("Thread block cluster dimensions exceed device limit ($(clusterdim.x) * $(clusterdim.y) * $(clusterdim.z) > 1). (The device does not support thread block clusters.)")
-         end
+        if active_clusters > 1
+            error("Thread block cluster dimensions exceed device limit ($(clusterdim.x) * $(clusterdim.y) * $(clusterdim.z) > 1). (The device does not support thread block clusters.)")
+        end
     end
     ## shared memory limit
     shmem_lim = attribute(dev, DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
index 7fb9ad60f..458423d02 100644
--- a/src/device/intrinsics/indexing.jl
+++ b/src/device/intrinsics/indexing.jl
@@ -24,8 +24,12 @@ export
             idx = call!(builder, intr_typ, intr)
 
             # attach range metadata
-            range_metadata = MDNode([ConstantInt(range.start % Int32),
-                                     ConstantInt((range.stop + 1) % Int32)])
+            range_metadata = MDNode(
+                [
+                    ConstantInt(range.start % Int32),
+                    ConstantInt((range.stop + 1) % Int32),
+                ]
+            )
             metadata(idx)[LLVM.MD_range] = range_metadata
 
             ret!(builder, idx)
@@ -42,7 +46,7 @@ const max_block_length = 1024
 const max_grid_size  = (x=2^31-1, y=65535, z=65535)
 # maximum guaranteed linear dimension is 8, but 16 is possible on Hopper
 # https://forums.developer.nvidia.com/t/cluster-size-limitation/279795
-const max_cluster_size = (x=16, y=16, z=16)
+const max_cluster_size = (x = 16, y = 16, z = 16)
 const max_cluster_length = 16
 
 for dim in (:x, :y, :z)
@@ -69,7 +73,7 @@ for dim in (:x, :y, :z)
     # Block index in cluster
     fn = Symbol("blockIdxInCluster_$dim")
     intr = Symbol("cluster.ctaid.$dim")
-    @eval @inline $fn() = _index($(Val(intr)), $(Val(0:max_cluster_size[dim]-1))) + 1i32
+    @eval @inline $fn() = _index($(Val(intr)), $(Val(0:(max_cluster_size[dim] - 1)))) + 1i32
 
     # Cluster size (#blocks per cluster)
     fn = Symbol("clusterDim_$dim")
@@ -79,7 +83,7 @@ for dim in (:x, :y, :z)
     # Cluster index in grid
     fn = Symbol("clusterIdx_$dim")
     intr = Symbol("clusterid.$dim")
-    @eval @inline $fn() = _index($(Val(intr)), $(Val(0:max_grid_size[dim]-1))) + 1i32
+    @eval @inline $fn() = _index($(Val(intr)), $(Val(0:(max_grid_size[dim] - 1)))) + 1i32
 
     # Grid size in clusters (#clusters per grid)
     fn = Symbol("gridClusterDim_$dim")
@@ -117,47 +121,47 @@ Returns the dimensions (in blocks) of the grid.
 """ gridDim
 @inline gridDim() = (x=gridDim_x(), y=gridDim_y(), z=gridDim_z())
 
-@doc """
-    blockIdxInCluster()::NamedTuple
+    @doc """
+        blockIdxInCluster()::NamedTuple
 
-Returns the block index within the cluster.
-""" blockIdxInCluster
-@inline blockIdxInCluster() = (x=blockIdxInCluster_x(), y=blockIdxInCluster_y(), z=blockIdxInCluster_z())
+    Returns the block index within the cluster.
+    """ blockIdxInCluster
+    @inline blockIdxInCluster() = (x = blockIdxInCluster_x(), y = blockIdxInCluster_y(), z = blockIdxInCluster_z())
 
-@doc """
-    clusterDim()::NamedTuple
+    @doc """
+        clusterDim()::NamedTuple
 
-Returns the dimensions (in blocks) of the cluster
-""" clusterDim
-@inline clusterDim() = (x=clusterDim_x(), y=clusterDim_y(), z=clusterDim_z())
+    Returns the dimensions (in blocks) of the cluster
+    """ clusterDim
+    @inline clusterDim() = (x = clusterDim_x(), y = clusterDim_y(), z = clusterDim_z())
 
-@doc """
-    clusterIdx()::NamedTuple
+    @doc """
+        clusterIdx()::NamedTuple
 
-Returns the cluster index within the grid.
-""" clusterIdx
-@inline clusterIdx() = (x=clusterIdx_x(), y=clusterIdx_y(), z=clusterIdx_z())
+    Returns the cluster index within the grid.
+    """ clusterIdx
+    @inline clusterIdx() = (x = clusterIdx_x(), y = clusterIdx_y(), z = clusterIdx_z())
 
-@doc """
-    gridClusterDim()::NamedTuple
+    @doc """
+        gridClusterDim()::NamedTuple
 
-Returns the dimensions (in clusters) of the grid
-""" gridClusterDim
-@inline gridClusterDim() = (x=gridClusterDim_x(), y=gridClusterDim_y(), z=gridClusterDim_z())
+    Returns the dimensions (in clusters) of the grid
+    """ gridClusterDim
+    @inline gridClusterDim() = (x = gridClusterDim_x(), y = gridClusterDim_y(), z = gridClusterDim_z())
 
-@doc """
-    linearBlockIdxInCluster()::Int32
+    @doc """
+        linearBlockIdxInCluster()::Int32
 
-Returns the linear block index within the cluster.
-""" linearBlockIdxInCluster
-@eval @inline $(:linearBlockIdxInCluster)() = _index($(Val(Symbol("cluster.ctarank"))), $(Val(0:max_cluster_length-1))) + 1i32
+    Returns the linear block index within the cluster.
+    """ linearBlockIdxInCluster
+    @eval @inline $(:linearBlockIdxInCluster)() = _index($(Val(Symbol("cluster.ctarank"))), $(Val(0:(max_cluster_length - 1)))) + 1i32
 
-@doc """
-    linearClusterSize()::Int32
+    @doc """
+        linearClusterSize()::Int32
 
-Returns the linear cluster size (in blocks).
-""" linearClusterSize
-@eval @inline $(:linearClusterSize)() = _index($(Val(Symbol("cluster.nctarank"))), $(Val(1:max_cluster_length)))
+    Returns the linear cluster size (in blocks).
+    """ linearClusterSize
+    @eval @inline $(:linearClusterSize)() = _index($(Val(Symbol("cluster.nctarank"))), $(Val(1:max_cluster_length)))
 
 @doc """
     warpsize()::Int32
diff --git a/src/device/intrinsics/shared_memory.jl b/src/device/intrinsics/shared_memory.jl
index e3c89d168..a5d848c8f 100644
--- a/src/device/intrinsics/shared_memory.jl
+++ b/src/device/intrinsics/shared_memory.jl
@@ -71,7 +71,7 @@ end
 dynamic_smem_size() =
     @asmcall("mov.u32 \$0, %dynamic_smem_size;", "=r", true, UInt32, Tuple{})
 
-@inline function CuDistributedSharedArray(shared_array::CuDeviceArray{T,N,AS.Shared}, blockidx::Integer) where {T,N}
+@inline function CuDistributedSharedArray(shared_array::CuDeviceArray{T, N, AS.Shared}, blockidx::Integer) where {T, N}
     # Distributed shared memory has address space 7 (SharedCluster).
     # This is only supported in LLVM >= 21 which we can't yet use with
     # Julia. We therefore need to map it to address space 0 (Generic).
@@ -80,23 +80,23 @@ dynamic_smem_size() =
     # we're using LLVM >=21.
 
     ptr = map_shared_rank(shared_array.ptr, blockidx)
-    CuDeviceArray{T,N,AS.Generic}(ptr, shared_array.dims, shared_array.maxsize)
+    return CuDeviceArray{T, N, AS.Generic}(ptr, shared_array.dims, shared_array.maxsize)
 end
 
-@inline function map_shared_rank(ptr_shared::LLVMPtr{T,AS.Shared}, rank::Integer) where {T}
+@inline function map_shared_rank(ptr_shared::LLVMPtr{T, AS.Shared}, rank::Integer) where {T}
     # This requires LLVM >=20 (i.e. Julia >= 1.13)
     ptr7 = @asmcall(
         "mapa.shared::cluster.u64 \$0, \$1, \$2;",
         "=l,l,r",
-        LLVMPtr{T,AS.SharedCluster},
-        Tuple{Core.LLVMPtr{T,AS.Shared}, Int32},
+        LLVMPtr{T, AS.SharedCluster},
+        Tuple{Core.LLVMPtr{T, AS.Shared}, Int32},
         ptr_shared, Int32(rank - 1i32),
     )
     ptr0 = @asmcall(
         "cvta.shared::cluster.u64 \$0, \$1;",
         "=l,l",
-        LLVMPtr{T,AS.Generic},
-        Tuple{Core.LLVMPtr{T,AS.SharedCluster}},
+        LLVMPtr{T, AS.Generic},
+        Tuple{Core.LLVMPtr{T, AS.SharedCluster}},
         ptr7,
     )
     return ptr0
diff --git a/src/device/pointer.jl b/src/device/pointer.jl
index 5b6d38d9b..e75d4bf0f 100644
--- a/src/device/pointer.jl
+++ b/src/device/pointer.jl
@@ -6,12 +6,12 @@ export AS
 
 module AS
 
-const Generic       = 0
-const Global        = 1
-const Shared        = 3
-const Constant      = 4
-const Local         = 5
-const SharedCluster = 7
+    const Generic = 0
+    const Global = 1
+    const Shared = 3
+    const Constant = 4
+    const Local = 5
+    const SharedCluster = 7
 
 end
 
diff --git a/test/core/cudadrv.jl b/test/core/cudadrv.jl
index 382a01eb2..33a432d86 100644
--- a/test/core/cudadrv.jl
+++ b/test/core/cudadrv.jl
@@ -269,21 +269,21 @@ let
     cudacall(dummy, Tuple{})
     cudacall(dummy, Tuple{}; threads=1)
     cudacall(dummy, Tuple{}; threads=1, blocks=1)
-    cudacall(dummy, Tuple{}; threads=1, blocks=1, clustersize=1)
-    cudacall(dummy, Tuple{}; threads=1, blocks=1, clustersize=1, shmem=0)
-    cudacall(dummy, Tuple{}; threads=1, blocks=1, clustersize=1, shmem=0, stream=stream())
-    cudacall(dummy, Tuple{}; threads=1, blocks=1, clustersize=1, shmem=0, stream=stream(), cooperative=false)
+            cudacall(dummy, Tuple{}; threads = 1, blocks = 1, clustersize = 1)
+            cudacall(dummy, Tuple{}; threads = 1, blocks = 1, clustersize = 1, shmem = 0)
+            cudacall(dummy, Tuple{}; threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream())
+            cudacall(dummy, Tuple{}; threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream(), cooperative = false)
     cudacall(dummy, ())
-    cudacall(dummy, (); threads=1, blocks=1, clustersize=1, shmem=0, stream=stream(), cooperative=false)
+            cudacall(dummy, (); threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream(), cooperative = false)
 
     # different launch syntaxes
     CUDA.launch(dummy)
     CUDA.launch(dummy; threads=1)
     CUDA.launch(dummy; threads=1, blocks=1)
-    CUDA.launch(dummy; threads=1, blocks=1, clustersize=1)
-    CUDA.launch(dummy; threads=1, blocks=1, clustersize=1, shmem=0)
-    CUDA.launch(dummy; threads=1, blocks=1, clustersize=1, shmem=0, stream=stream())
-    CUDA.launch(dummy; threads=1, blocks=1, clustersize=1, shmem=0, stream=stream(), cooperative=false)
+            CUDA.launch(dummy; threads = 1, blocks = 1, clustersize = 1)
+            CUDA.launch(dummy; threads = 1, blocks = 1, clustersize = 1, shmem = 0)
+            CUDA.launch(dummy; threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream())
+            CUDA.launch(dummy; threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream(), cooperative = false)
 end
 
 let
diff --git a/test/core/device/intrinsics/clusters.jl b/test/core/device/intrinsics/clusters.jl
index 9b241e550..911a89c6b 100644
--- a/test/core/device/intrinsics/clusters.jl
+++ b/test/core/device/intrinsics/clusters.jl
@@ -1,78 +1,78 @@
 @testset "thread block clusters" begin
-if capability(device()) >= v"9.0"
+    if capability(device()) >= v"9.0"
 
-###########################################################################################
+        ###########################################################################################
 
-@testset "indexing" begin
-    function f(A::AbstractArray{Int32,9})
-        ti = threadIdx().x
-        tj = threadIdx().y
-        tk = threadIdx().z
-        bi = blockIdxInCluster().x
-        bj = blockIdxInCluster().y
-        bk = blockIdxInCluster().z
-        ci = clusterIdx().x
-        cj = clusterIdx().y
-        ck = clusterIdx().z
-        A[ti,tj,tk,bi,bj,bk,ci,cj,ck] = 1
-        nothing
-    end
+        @testset "indexing" begin
+            function f(A::AbstractArray{Int32, 9})
+                ti = threadIdx().x
+                tj = threadIdx().y
+                tk = threadIdx().z
+                bi = blockIdxInCluster().x
+                bj = blockIdxInCluster().y
+                bk = blockIdxInCluster().z
+                ci = clusterIdx().x
+                cj = clusterIdx().y
+                ck = clusterIdx().z
+                A[ti, tj, tk, bi, bj, bk, ci, cj, ck] = 1
+                nothing
+            end
 
-    A = CUDA.zeros(Int32, threads..., clustersize..., (blocks .÷ clustersize)...)
+            A = CUDA.zeros(Int32, threads..., clustersize..., (blocks .÷ clustersize)...)
 
-    threads = (3,5,7)
-    clustersize = (2,2,2)
-    blocks = (4,6,8)
-    @cuda threads=threads blocks=blocks clustersize=clustersize f(A)
+            threads = (3, 5, 7)
+            clustersize = (2, 2, 2)
+            blocks = (4, 6, 8)
+            @cuda threads = threads blocks = blocks clustersize = clustersize f(A)
 
-    @test all(==(1), Array(A))
-end
+            @test all(==(1), Array(A))
+        end
 
-###########################################################################################
+        ###########################################################################################
 
-@testset "distributed shared memory" begin
-    function f(A::AbstractArray{Int32,3})
-        ti = threadIdx().x
-        nt = blockDim().x
-        @assert 1<=ti<=nt
-        bi = blockIdxInCluster().x
-        nb = clusterDim().x
-        @assert 1<=bi<=nb
-        ci = clusterIdx().x
-        nc = gridClusterDim().x
-        @assert 1<=ci<=nc
+        @testset "distributed shared memory" begin
+            function f(A::AbstractArray{Int32, 3})
+                ti = threadIdx().x
+                nt = blockDim().x
+                @assert 1 <= ti <= nt
+                bi = blockIdxInCluster().x
+                nb = clusterDim().x
+                @assert 1 <= bi <= nb
+                ci = clusterIdx().x
+                nc = gridClusterDim().x
+                @assert 1 <= ci <= nc
 
-        sm = CuStaticSharedArray(Int32, 8)
-        for i in 1:nb
-            sm[i] = -1
-        end
-        cluster_wait()
+                sm = CuStaticSharedArray(Int32, 8)
+                for i in 1:nb
+                    sm[i] = -1
+                end
+                cluster_wait()
 
-        for i in 1:nb
-            dsm = CuDistributedSharedArray(sm, i)
-            dsm[bi] = bi
-        end
-        cluster_wait()
+                for i in 1:nb
+                    dsm = CuDistributedSharedArray(sm, i)
+                    dsm[bi] = bi
+                end
+                cluster_wait()
 
-        for i in 1:nb
-            A[i,bi,ci] = sm[i]
-        end
-        return nothing
-    end
+                for i in 1:nb
+                    A[i, bi, ci] = sm[i]
+                end
+                return nothing
+            end
 
-    A = CUDA.zeros(Int32, clustersize, clustersize, blocks ÷ clustersize)
+            A = CUDA.zeros(Int32, clustersize, clustersize, blocks ÷ clustersize)
 
-    threads = 1
-    clustersize = 4
-    blocks = 16
-    @cuda threads=threads blocks=blocks clustersize=clustersize f(A)
+            threads = 1
+            clustersize = 4
+            blocks = 16
+            @cuda threads = threads blocks = blocks clustersize = clustersize f(A)
 
-    B = Array(A)
-    goodB = [i for i in 1:clustersize, bi in 1:clustersize, ci in 1:blocks ÷ clustersize]
-    @test B == goodB
-end
+            B = Array(A)
+            goodB = [i for i in 1:clustersize, bi in 1:clustersize, ci in 1:(blocks ÷ clustersize)]
+            @test B == goodB
+        end
 
-###########################################################################################
+        ###########################################################################################
 
-end
+    end
 end
diff --git a/test/core/execution.jl b/test/core/execution.jl
index 3a7ef7783..8b6622534 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -21,11 +21,11 @@ dummy() = return
     @cuda blocks=(1,1) dummy()
     @cuda blocks=(1,1,1) dummy()
 
-    clustersize = 1
-    @cuda clustersize dummy()
-    @cuda clustersize=1 dummy()
-    @cuda clustersize=(1,1) dummy()
-    @cuda clustersize=(1,1,1) dummy()
+        clustersize = 1
+        @cuda clustersize dummy()
+        @cuda clustersize = 1 dummy()
+        @cuda clustersize = (1, 1) dummy()
+        @cuda clustersize = (1, 1, 1) dummy()
 end
 
 
@@ -165,13 +165,13 @@ end
     @cuda stream=s dummy()
 end
 
-@testset "clusters" begin
-    if CUDA.capability(device()) >= v"9.0"
-        @cuda threads=64 clustersize=2 dummy()
-    else
-        @test_throws CuError @cuda threads=64 clustersize=2 dummy()
+    @testset "clusters" begin
+        if CUDA.capability(device()) >= v"9.0"
+            @cuda threads = 64 clustersize = 2 dummy()
+        else
+            @test_throws CuError @cuda threads = 64 clustersize = 2 dummy()
+        end
     end
-end
 
 @testset "external kernels" begin
     @eval module KernelModule

Member

@maleadt maleadt left a comment


LGTM. Couple of minor nits.

Contributor

@github-actions github-actions Bot left a comment


CUDA.jl Benchmarks

Details
Benchmark suite Current: 23d28a6 Previous: 5472295 Ratio
latency/precompile 44374924006.5 ns 43893707632 ns 1.01
latency/ttfp 12775535247 ns 12909648613 ns 0.99
latency/import 3520832716 ns 3506340040 ns 1.00
integration/volumerhs 9437708.5 ns 9441122 ns 1.00
integration/byval/slices=1 146137 ns 145704 ns 1.00
integration/byval/slices=3 423188 ns 422873 ns 1.00
integration/byval/reference 144284 ns 143887 ns 1.00
integration/byval/slices=2 284721 ns 284218 ns 1.00
integration/cudadevrt 102779 ns 102447 ns 1.00
kernel/indexing 13414 ns 13397 ns 1.00
kernel/indexing_checked 14430 ns 14106 ns 1.02
kernel/occupancy 705.7062937062936 ns 642.1951219512196 ns 1.10
kernel/launch 2145.6666666666665 ns 2182.9444444444443 ns 0.98
kernel/rand 14518 ns 15509 ns 0.94
array/reverse/1d 18638 ns 19015.5 ns 0.98
array/reverse/2dL_inplace 66048 ns 66294 ns 1.00
array/reverse/1dL 69192 ns 69264 ns 1.00
array/reverse/2d 20686 ns 21406 ns 0.97
array/reverse/1d_inplace 10364 ns 10627.666666666666 ns 0.98
array/reverse/2d_inplace 10332 ns 10996 ns 0.94
array/reverse/2dL 72682 ns 73336 ns 0.99
array/reverse/1dL_inplace 66112 ns 66391 ns 1.00
array/copy 18710 ns 18444 ns 1.01
array/iteration/findall/int 149278.5 ns 145423 ns 1.03
array/iteration/findall/bool 131915 ns 130731 ns 1.01
array/iteration/findfirst/int 83253 ns 84638 ns 0.98
array/iteration/findfirst/bool 81773 ns 81595 ns 1.00
array/iteration/scalar 68503 ns 67259 ns 1.02
array/iteration/logical 200037.5 ns 200755.5 ns 1.00
array/iteration/findmin/1d 87057.5 ns 86165.5 ns 1.01
array/iteration/findmin/2d 117119 ns 117304.5 ns 1.00
array/reductions/reduce/Int64/1d 42692 ns 39371 ns 1.08
array/reductions/reduce/Int64/dims=1 42332.5 ns 44367 ns 0.95
array/reductions/reduce/Int64/dims=2 59660 ns 59429 ns 1.00
array/reductions/reduce/Int64/dims=1L 87812 ns 87550 ns 1.00
array/reductions/reduce/Int64/dims=2L 84836 ns 85113.5 ns 1.00
array/reductions/reduce/Float32/1d 35298 ns 34268 ns 1.03
array/reductions/reduce/Float32/dims=1 40506 ns 48865.5 ns 0.83
array/reductions/reduce/Float32/dims=2 57009 ns 56196 ns 1.01
array/reductions/reduce/Float32/dims=1L 51926 ns 51777 ns 1.00
array/reductions/reduce/Float32/dims=2L 70017 ns 69644 ns 1.01
array/reductions/mapreduce/Int64/1d 42948 ns 39157 ns 1.10
array/reductions/mapreduce/Int64/dims=1 42894 ns 51282.5 ns 0.84
array/reductions/mapreduce/Int64/dims=2 59948 ns 59191 ns 1.01
array/reductions/mapreduce/Int64/dims=1L 87934 ns 87450 ns 1.01
array/reductions/mapreduce/Int64/dims=2L 85266 ns 85108.5 ns 1.00
array/reductions/mapreduce/Float32/1d 34973 ns 33989 ns 1.03
array/reductions/mapreduce/Float32/dims=1 40619 ns 48728 ns 0.83
array/reductions/mapreduce/Float32/dims=2 56809 ns 56178 ns 1.01
array/reductions/mapreduce/Float32/dims=1L 51922 ns 51465 ns 1.01
array/reductions/mapreduce/Float32/dims=2L 69624 ns 68877 ns 1.01
array/broadcast 20842 ns 20339 ns 1.02
array/copyto!/gpu_to_gpu 11380 ns 10634.166666666668 ns 1.07
array/copyto!/cpu_to_gpu 212669.5 ns 215032 ns 0.99
array/copyto!/gpu_to_cpu 282559 ns 283345.5 ns 1.00
array/accumulate/Int64/1d 119133 ns 118179 ns 1.01
array/accumulate/Int64/dims=1 79746 ns 79470 ns 1.00
array/accumulate/Int64/dims=2 155805 ns 155613 ns 1.00
array/accumulate/Int64/dims=1L 1694276 ns 1694158 ns 1.00
array/accumulate/Int64/dims=2L 960988.5 ns 960865 ns 1.00
array/accumulate/Float32/1d 101525 ns 100641.5 ns 1.01
array/accumulate/Float32/dims=1 76648 ns 76173 ns 1.01
array/accumulate/Float32/dims=2 143517 ns 144217.5 ns 1.00
array/accumulate/Float32/dims=1L 1585237.5 ns 1584446 ns 1.00
array/accumulate/Float32/dims=2L 656946 ns 656699 ns 1.00
array/construct 1311 ns 1306.3 ns 1.00
array/random/randn/Float32 44211 ns 41156 ns 1.07
array/random/randn!/Float32 27925 ns 30029 ns 0.93
array/random/rand!/Int64 34428 ns 34605 ns 0.99
array/random/rand!/Float32 8674.666666666666 ns 8182.25 ns 1.06
array/random/rand/Int64 37256 ns 32316 ns 1.15
array/random/rand/Float32 13370 ns 12466 ns 1.07
array/permutedims/4d 52804.5 ns 50943 ns 1.04
array/permutedims/2d 52528 ns 52373 ns 1.00
array/permutedims/3d 52984 ns 52694 ns 1.01
array/sorting/1d 2735008 ns 2734863 ns 1.00
array/sorting/by 3303991.5 ns 3304190.5 ns 1.00
array/sorting/2d 1068161 ns 1068173 ns 1.00
cuda/synchronization/stream/auto 1044.0666666666666 ns 1039.7692307692307 ns 1.00
cuda/synchronization/stream/nonblocking 7484.4 ns 7691.4 ns 0.97
cuda/synchronization/stream/blocking 847.3533333333334 ns 818.6910112359551 ns 1.04
cuda/synchronization/context/auto 1174.7 ns 1142.4 ns 1.03
cuda/synchronization/context/nonblocking 7147.2 ns 7606.8 ns 0.94
cuda/synchronization/context/blocking 936.7777777777778 ns 888.9473684210526 ns 1.05

This comment was automatically generated by a workflow using github-action-benchmark.

@eschnett
Contributor Author

@maleadt I've addressed the nits.

@eschnett
Contributor Author

Could you merge this PR?

@kshyatt kshyatt requested a review from maleadt March 17, 2026 12:45
@maleadt maleadt merged commit f7b7929 into JuliaGPU:master Mar 17, 2026
3 checks passed