Add support for thread block clusters #3017
Merged
maleadt merged 61 commits into JuliaGPU:master from Mar 17, 2026
Merged
Conversation
Codecov Report ❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## master #3017 +/- ##
=======================================
Coverage 89.48% 89.48%
=======================================
Files 148 148
Lines 13043 13066 +23
=======================================
+ Hits 11671 11692 +21
- Misses 1372 1374 +2 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
maleadt
requested changes
Jan 14, 2026
Contributor
Author
|
For the record, this PR addresses #1989. |
Contributor
Author
|
I'll have to clean up the patch, but apart from that it's finished.
Missing cleanups:
|
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/lib/cudadrv/execution.jl b/lib/cudadrv/execution.jl
index 693dddc72..37f07001b 100644
--- a/lib/cudadrv/execution.jl
+++ b/lib/cudadrv/execution.jl
@@ -59,7 +59,7 @@ internal kernel parameter buffer, or a pointer to device memory.
This is a low-level call, prefer to use [`cudacall`](@ref) instead.
"""
function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::CuDim=1,
- clustersize::CuDim=1, cooperative::Bool=false, shmem::Integer=0,
+ clustersize::CuDim = 1, cooperative::Bool = false, shmem::Integer = 0,
stream::CuStream=stream()) where {N}
blockdim = CuDim3(blocks)
threaddim = CuDim3(threads)
@@ -67,13 +67,13 @@ function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::Cu
attrs = CUDA.CUlaunchAttribute[]
if cooperative
- resize!(attrs, length(attrs)+1)
+ resize!(attrs, length(attrs) + 1)
attr = pointer(attrs, length(attrs))
attr.id = CUDA.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
- attr.value.cooperative = 1;
+ attr.value.cooperative = 1
end
if clusterdim.x != 1 || clusterdim.y != 1 || clusterdim.z != 1
- resize!(attrs, length(attrs)+1)
+ resize!(attrs, length(attrs) + 1)
attr = pointer(attrs, length(attrs))
attr.id = CUDA.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
attr.value.clusterDim.x = clusterdim.x
@@ -81,10 +81,14 @@ function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::Cu
attr.value.clusterDim.z = clusterdim.z
end
- GC.@preserve attrs stream begin
- config = Ref(CUlaunchConfig(blockdim.x, blockdim.y, blockdim.z,
- threaddim.x, threaddim.y, threaddim.z,
- shmem, stream.handle, pointer(attrs), length(attrs)))
+ return GC.@preserve attrs stream begin
+ config = Ref(
+ CUlaunchConfig(
+ blockdim.x, blockdim.y, blockdim.z,
+ threaddim.x, threaddim.y, threaddim.z,
+ shmem, stream.handle, pointer(attrs), length(attrs)
+ )
+ )
try
pack_arguments(args...) do kernelParams
cuLaunchKernelEx(config, f, kernelParams, C_NULL)
@@ -106,7 +110,7 @@ end
error("Grid dimensions $blockdim are not positive")
(threaddim.x>0 && threaddim.y>0 && threaddim.z>0) ||
error("Block dimensions $threaddim are not positive")
- (clusterdim.x>0 && clusterdim.y>0 && clusterdim.z>0) ||
+ (clusterdim.x > 0 && clusterdim.y > 0 && clusterdim.z > 0) ||
error("Cluster dimensions $clusterdim are not positive")
(blockdim.x % clusterdim.x == 0 && blockdim.y % clusterdim.y == 0 && blockdim.z % clusterdim.z == 0) ||
error("Block dimensions $blockdim are not multiples of the cluster dimensions $clusterdim")
@@ -147,9 +151,9 @@ end
# which reports a value that depends on the function's attributes.
else
# Thread block clusters are not supported
- if active_clusters > 1
- error("Thread block cluster dimensions exceed device limit ($(clusterdim.x) * $(clusterdim.y) * $(clusterdim.z) > 1). (The device does not support thread block clusters.)")
- end
+ if active_clusters > 1
+ error("Thread block cluster dimensions exceed device limit ($(clusterdim.x) * $(clusterdim.y) * $(clusterdim.z) > 1). (The device does not support thread block clusters.)")
+ end
end
## shared memory limit
shmem_lim = attribute(dev, DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
index 7fb9ad60f..458423d02 100644
--- a/src/device/intrinsics/indexing.jl
+++ b/src/device/intrinsics/indexing.jl
@@ -24,8 +24,12 @@ export
idx = call!(builder, intr_typ, intr)
# attach range metadata
- range_metadata = MDNode([ConstantInt(range.start % Int32),
- ConstantInt((range.stop + 1) % Int32)])
+ range_metadata = MDNode(
+ [
+ ConstantInt(range.start % Int32),
+ ConstantInt((range.stop + 1) % Int32),
+ ]
+ )
metadata(idx)[LLVM.MD_range] = range_metadata
ret!(builder, idx)
@@ -42,7 +46,7 @@ const max_block_length = 1024
const max_grid_size = (x=2^31-1, y=65535, z=65535)
# maximum guaranteed linear dimension is 8, but 16 is possible on Hopper
# https://forums.developer.nvidia.com/t/cluster-size-limitation/279795
-const max_cluster_size = (x=16, y=16, z=16)
+const max_cluster_size = (x = 16, y = 16, z = 16)
const max_cluster_length = 16
for dim in (:x, :y, :z)
@@ -69,7 +73,7 @@ for dim in (:x, :y, :z)
# Block index in cluster
fn = Symbol("blockIdxInCluster_$dim")
intr = Symbol("cluster.ctaid.$dim")
- @eval @inline $fn() = _index($(Val(intr)), $(Val(0:max_cluster_size[dim]-1))) + 1i32
+ @eval @inline $fn() = _index($(Val(intr)), $(Val(0:(max_cluster_size[dim] - 1)))) + 1i32
# Cluster size (#blocks per cluster)
fn = Symbol("clusterDim_$dim")
@@ -79,7 +83,7 @@ for dim in (:x, :y, :z)
# Cluster index in grid
fn = Symbol("clusterIdx_$dim")
intr = Symbol("clusterid.$dim")
- @eval @inline $fn() = _index($(Val(intr)), $(Val(0:max_grid_size[dim]-1))) + 1i32
+ @eval @inline $fn() = _index($(Val(intr)), $(Val(0:(max_grid_size[dim] - 1)))) + 1i32
# Grid size in clusters (#clusters per grid)
fn = Symbol("gridClusterDim_$dim")
@@ -117,47 +121,47 @@ Returns the dimensions (in blocks) of the grid.
""" gridDim
@inline gridDim() = (x=gridDim_x(), y=gridDim_y(), z=gridDim_z())
-@doc """
- blockIdxInCluster()::NamedTuple
+ @doc """
+ blockIdxInCluster()::NamedTuple
-Returns the block index within the cluster.
-""" blockIdxInCluster
-@inline blockIdxInCluster() = (x=blockIdxInCluster_x(), y=blockIdxInCluster_y(), z=blockIdxInCluster_z())
+ Returns the block index within the cluster.
+ """ blockIdxInCluster
+ @inline blockIdxInCluster() = (x = blockIdxInCluster_x(), y = blockIdxInCluster_y(), z = blockIdxInCluster_z())
-@doc """
- clusterDim()::NamedTuple
+ @doc """
+ clusterDim()::NamedTuple
-Returns the dimensions (in blocks) of the cluster
-""" clusterDim
-@inline clusterDim() = (x=clusterDim_x(), y=clusterDim_y(), z=clusterDim_z())
+ Returns the dimensions (in blocks) of the cluster
+ """ clusterDim
+ @inline clusterDim() = (x = clusterDim_x(), y = clusterDim_y(), z = clusterDim_z())
-@doc """
- clusterIdx()::NamedTuple
+ @doc """
+ clusterIdx()::NamedTuple
-Returns the cluster index within the grid.
-""" clusterIdx
-@inline clusterIdx() = (x=clusterIdx_x(), y=clusterIdx_y(), z=clusterIdx_z())
+ Returns the cluster index within the grid.
+ """ clusterIdx
+ @inline clusterIdx() = (x = clusterIdx_x(), y = clusterIdx_y(), z = clusterIdx_z())
-@doc """
- gridClusterDim()::NamedTuple
+ @doc """
+ gridClusterDim()::NamedTuple
-Returns the dimensions (in clusters) of the grid
-""" gridClusterDim
-@inline gridClusterDim() = (x=gridClusterDim_x(), y=gridClusterDim_y(), z=gridClusterDim_z())
+ Returns the dimensions (in clusters) of the grid
+ """ gridClusterDim
+ @inline gridClusterDim() = (x = gridClusterDim_x(), y = gridClusterDim_y(), z = gridClusterDim_z())
-@doc """
- linearBlockIdxInCluster()::Int32
+ @doc """
+ linearBlockIdxInCluster()::Int32
-Returns the linear block index within the cluster.
-""" linearBlockIdxInCluster
-@eval @inline $(:linearBlockIdxInCluster)() = _index($(Val(Symbol("cluster.ctarank"))), $(Val(0:max_cluster_length-1))) + 1i32
+ Returns the linear block index within the cluster.
+ """ linearBlockIdxInCluster
+ @eval @inline $(:linearBlockIdxInCluster)() = _index($(Val(Symbol("cluster.ctarank"))), $(Val(0:(max_cluster_length - 1)))) + 1i32
-@doc """
- linearClusterSize()::Int32
+ @doc """
+ linearClusterSize()::Int32
-Returns the linear cluster size (in blocks).
-""" linearClusterSize
-@eval @inline $(:linearClusterSize)() = _index($(Val(Symbol("cluster.nctarank"))), $(Val(1:max_cluster_length)))
+ Returns the linear cluster size (in blocks).
+ """ linearClusterSize
+ @eval @inline $(:linearClusterSize)() = _index($(Val(Symbol("cluster.nctarank"))), $(Val(1:max_cluster_length)))
@doc """
warpsize()::Int32
diff --git a/src/device/intrinsics/shared_memory.jl b/src/device/intrinsics/shared_memory.jl
index e3c89d168..a5d848c8f 100644
--- a/src/device/intrinsics/shared_memory.jl
+++ b/src/device/intrinsics/shared_memory.jl
@@ -71,7 +71,7 @@ end
dynamic_smem_size() =
@asmcall("mov.u32 \$0, %dynamic_smem_size;", "=r", true, UInt32, Tuple{})
-@inline function CuDistributedSharedArray(shared_array::CuDeviceArray{T,N,AS.Shared}, blockidx::Integer) where {T,N}
+@inline function CuDistributedSharedArray(shared_array::CuDeviceArray{T, N, AS.Shared}, blockidx::Integer) where {T, N}
# Distributed shared memory has address space 7 (SharedCluster).
# This is only supported in LLVM >= 21 which we can't yet use with
# Julia. We therefore need to map it to address space 0 (Generic).
@@ -80,23 +80,23 @@ dynamic_smem_size() =
# we're using LLVM >=21.
ptr = map_shared_rank(shared_array.ptr, blockidx)
- CuDeviceArray{T,N,AS.Generic}(ptr, shared_array.dims, shared_array.maxsize)
+ return CuDeviceArray{T, N, AS.Generic}(ptr, shared_array.dims, shared_array.maxsize)
end
-@inline function map_shared_rank(ptr_shared::LLVMPtr{T,AS.Shared}, rank::Integer) where {T}
+@inline function map_shared_rank(ptr_shared::LLVMPtr{T, AS.Shared}, rank::Integer) where {T}
# This requires LLVM >=20 (i.e. Julia >= 1.13)
ptr7 = @asmcall(
"mapa.shared::cluster.u64 \$0, \$1, \$2;",
"=l,l,r",
- LLVMPtr{T,AS.SharedCluster},
- Tuple{Core.LLVMPtr{T,AS.Shared}, Int32},
+ LLVMPtr{T, AS.SharedCluster},
+ Tuple{Core.LLVMPtr{T, AS.Shared}, Int32},
ptr_shared, Int32(rank - 1i32),
)
ptr0 = @asmcall(
"cvta.shared::cluster.u64 \$0, \$1;",
"=l,l",
- LLVMPtr{T,AS.Generic},
- Tuple{Core.LLVMPtr{T,AS.SharedCluster}},
+ LLVMPtr{T, AS.Generic},
+ Tuple{Core.LLVMPtr{T, AS.SharedCluster}},
ptr7,
)
return ptr0
diff --git a/src/device/pointer.jl b/src/device/pointer.jl
index 5b6d38d9b..e75d4bf0f 100644
--- a/src/device/pointer.jl
+++ b/src/device/pointer.jl
@@ -6,12 +6,12 @@ export AS
module AS
-const Generic = 0
-const Global = 1
-const Shared = 3
-const Constant = 4
-const Local = 5
-const SharedCluster = 7
+ const Generic = 0
+ const Global = 1
+ const Shared = 3
+ const Constant = 4
+ const Local = 5
+ const SharedCluster = 7
end
diff --git a/test/core/cudadrv.jl b/test/core/cudadrv.jl
index 382a01eb2..33a432d86 100644
--- a/test/core/cudadrv.jl
+++ b/test/core/cudadrv.jl
@@ -269,21 +269,21 @@ let
cudacall(dummy, Tuple{})
cudacall(dummy, Tuple{}; threads=1)
cudacall(dummy, Tuple{}; threads=1, blocks=1)
- cudacall(dummy, Tuple{}; threads=1, blocks=1, clustersize=1)
- cudacall(dummy, Tuple{}; threads=1, blocks=1, clustersize=1, shmem=0)
- cudacall(dummy, Tuple{}; threads=1, blocks=1, clustersize=1, shmem=0, stream=stream())
- cudacall(dummy, Tuple{}; threads=1, blocks=1, clustersize=1, shmem=0, stream=stream(), cooperative=false)
+ cudacall(dummy, Tuple{}; threads = 1, blocks = 1, clustersize = 1)
+ cudacall(dummy, Tuple{}; threads = 1, blocks = 1, clustersize = 1, shmem = 0)
+ cudacall(dummy, Tuple{}; threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream())
+ cudacall(dummy, Tuple{}; threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream(), cooperative = false)
cudacall(dummy, ())
- cudacall(dummy, (); threads=1, blocks=1, clustersize=1, shmem=0, stream=stream(), cooperative=false)
+ cudacall(dummy, (); threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream(), cooperative = false)
# different launch syntaxes
CUDA.launch(dummy)
CUDA.launch(dummy; threads=1)
CUDA.launch(dummy; threads=1, blocks=1)
- CUDA.launch(dummy; threads=1, blocks=1, clustersize=1)
- CUDA.launch(dummy; threads=1, blocks=1, clustersize=1, shmem=0)
- CUDA.launch(dummy; threads=1, blocks=1, clustersize=1, shmem=0, stream=stream())
- CUDA.launch(dummy; threads=1, blocks=1, clustersize=1, shmem=0, stream=stream(), cooperative=false)
+ CUDA.launch(dummy; threads = 1, blocks = 1, clustersize = 1)
+ CUDA.launch(dummy; threads = 1, blocks = 1, clustersize = 1, shmem = 0)
+ CUDA.launch(dummy; threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream())
+ CUDA.launch(dummy; threads = 1, blocks = 1, clustersize = 1, shmem = 0, stream = stream(), cooperative = false)
end
let
diff --git a/test/core/device/intrinsics/clusters.jl b/test/core/device/intrinsics/clusters.jl
index 9b241e550..911a89c6b 100644
--- a/test/core/device/intrinsics/clusters.jl
+++ b/test/core/device/intrinsics/clusters.jl
@@ -1,78 +1,78 @@
@testset "thread block clusters" begin
-if capability(device()) >= v"9.0"
+ if capability(device()) >= v"9.0"
-###########################################################################################
+ ###########################################################################################
-@testset "indexing" begin
- function f(A::AbstractArray{Int32,9})
- ti = threadIdx().x
- tj = threadIdx().y
- tk = threadIdx().z
- bi = blockIdxInCluster().x
- bj = blockIdxInCluster().y
- bk = blockIdxInCluster().z
- ci = clusterIdx().x
- cj = clusterIdx().y
- ck = clusterIdx().z
- A[ti,tj,tk,bi,bj,bk,ci,cj,ck] = 1
- nothing
- end
+ @testset "indexing" begin
+ function f(A::AbstractArray{Int32, 9})
+ ti = threadIdx().x
+ tj = threadIdx().y
+ tk = threadIdx().z
+ bi = blockIdxInCluster().x
+ bj = blockIdxInCluster().y
+ bk = blockIdxInCluster().z
+ ci = clusterIdx().x
+ cj = clusterIdx().y
+ ck = clusterIdx().z
+ A[ti, tj, tk, bi, bj, bk, ci, cj, ck] = 1
+ nothing
+ end
- A = CUDA.zeros(Int32, threads..., clustersize..., (blocks .÷ clustersize)...)
+ A = CUDA.zeros(Int32, threads..., clustersize..., (blocks .÷ clustersize)...)
- threads = (3,5,7)
- clustersize = (2,2,2)
- blocks = (4,6,8)
- @cuda threads=threads blocks=blocks clustersize=clustersize f(A)
+ threads = (3, 5, 7)
+ clustersize = (2, 2, 2)
+ blocks = (4, 6, 8)
+ @cuda threads = threads blocks = blocks clustersize = clustersize f(A)
- @test all(==(1), Array(A))
-end
+ @test all(==(1), Array(A))
+ end
-###########################################################################################
+ ###########################################################################################
-@testset "distributed shared memory" begin
- function f(A::AbstractArray{Int32,3})
- ti = threadIdx().x
- nt = blockDim().x
- @assert 1<=ti<=nt
- bi = blockIdxInCluster().x
- nb = clusterDim().x
- @assert 1<=bi<=nb
- ci = clusterIdx().x
- nc = gridClusterDim().x
- @assert 1<=ci<=nc
+ @testset "distributed shared memory" begin
+ function f(A::AbstractArray{Int32, 3})
+ ti = threadIdx().x
+ nt = blockDim().x
+ @assert 1 <= ti <= nt
+ bi = blockIdxInCluster().x
+ nb = clusterDim().x
+ @assert 1 <= bi <= nb
+ ci = clusterIdx().x
+ nc = gridClusterDim().x
+ @assert 1 <= ci <= nc
- sm = CuStaticSharedArray(Int32, 8)
- for i in 1:nb
- sm[i] = -1
- end
- cluster_wait()
+ sm = CuStaticSharedArray(Int32, 8)
+ for i in 1:nb
+ sm[i] = -1
+ end
+ cluster_wait()
- for i in 1:nb
- dsm = CuDistributedSharedArray(sm, i)
- dsm[bi] = bi
- end
- cluster_wait()
+ for i in 1:nb
+ dsm = CuDistributedSharedArray(sm, i)
+ dsm[bi] = bi
+ end
+ cluster_wait()
- for i in 1:nb
- A[i,bi,ci] = sm[i]
- end
- return nothing
- end
+ for i in 1:nb
+ A[i, bi, ci] = sm[i]
+ end
+ return nothing
+ end
- A = CUDA.zeros(Int32, clustersize, clustersize, blocks ÷ clustersize)
+ A = CUDA.zeros(Int32, clustersize, clustersize, blocks ÷ clustersize)
- threads = 1
- clustersize = 4
- blocks = 16
- @cuda threads=threads blocks=blocks clustersize=clustersize f(A)
+ threads = 1
+ clustersize = 4
+ blocks = 16
+ @cuda threads = threads blocks = blocks clustersize = clustersize f(A)
- B = Array(A)
- goodB = [i for i in 1:clustersize, bi in 1:clustersize, ci in 1:blocks ÷ clustersize]
- @test B == goodB
-end
+ B = Array(A)
+ goodB = [i for i in 1:clustersize, bi in 1:clustersize, ci in 1:(blocks ÷ clustersize)]
+ @test B == goodB
+ end
-###########################################################################################
+ ###########################################################################################
-end
+ end
end
diff --git a/test/core/execution.jl b/test/core/execution.jl
index 3a7ef7783..8b6622534 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -21,11 +21,11 @@ dummy() = return
@cuda blocks=(1,1) dummy()
@cuda blocks=(1,1,1) dummy()
- clustersize = 1
- @cuda clustersize dummy()
- @cuda clustersize=1 dummy()
- @cuda clustersize=(1,1) dummy()
- @cuda clustersize=(1,1,1) dummy()
+ clustersize = 1
+ @cuda clustersize dummy()
+ @cuda clustersize = 1 dummy()
+ @cuda clustersize = (1, 1) dummy()
+ @cuda clustersize = (1, 1, 1) dummy()
end
@@ -165,13 +165,13 @@ end
@cuda stream=s dummy()
end
-@testset "clusters" begin
- if CUDA.capability(device()) >= v"9.0"
- @cuda threads=64 clustersize=2 dummy()
- else
- @test_throws CuError @cuda threads=64 clustersize=2 dummy()
+ @testset "clusters" begin
+ if CUDA.capability(device()) >= v"9.0"
+ @cuda threads = 64 clustersize = 2 dummy()
+ else
+ @test_throws CuError @cuda threads = 64 clustersize = 2 dummy()
+ end
end
-end
@testset "external kernels" begin
@eval module KernelModule |
maleadt
reviewed
Mar 11, 2026
Contributor
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 23d28a6 | Previous: 5472295 | Ratio |
|---|---|---|---|
latency/precompile |
44374924006.5 ns |
43893707632 ns |
1.01 |
latency/ttfp |
12775535247 ns |
12909648613 ns |
0.99 |
latency/import |
3520832716 ns |
3506340040 ns |
1.00 |
integration/volumerhs |
9437708.5 ns |
9441122 ns |
1.00 |
integration/byval/slices=1 |
146137 ns |
145704 ns |
1.00 |
integration/byval/slices=3 |
423188 ns |
422873 ns |
1.00 |
integration/byval/reference |
144284 ns |
143887 ns |
1.00 |
integration/byval/slices=2 |
284721 ns |
284218 ns |
1.00 |
integration/cudadevrt |
102779 ns |
102447 ns |
1.00 |
kernel/indexing |
13414 ns |
13397 ns |
1.00 |
kernel/indexing_checked |
14430 ns |
14106 ns |
1.02 |
kernel/occupancy |
705.7062937062936 ns |
642.1951219512196 ns |
1.10 |
kernel/launch |
2145.6666666666665 ns |
2182.9444444444443 ns |
0.98 |
kernel/rand |
14518 ns |
15509 ns |
0.94 |
array/reverse/1d |
18638 ns |
19015.5 ns |
0.98 |
array/reverse/2dL_inplace |
66048 ns |
66294 ns |
1.00 |
array/reverse/1dL |
69192 ns |
69264 ns |
1.00 |
array/reverse/2d |
20686 ns |
21406 ns |
0.97 |
array/reverse/1d_inplace |
10364 ns |
10627.666666666666 ns |
0.98 |
array/reverse/2d_inplace |
10332 ns |
10996 ns |
0.94 |
array/reverse/2dL |
72682 ns |
73336 ns |
0.99 |
array/reverse/1dL_inplace |
66112 ns |
66391 ns |
1.00 |
array/copy |
18710 ns |
18444 ns |
1.01 |
array/iteration/findall/int |
149278.5 ns |
145423 ns |
1.03 |
array/iteration/findall/bool |
131915 ns |
130731 ns |
1.01 |
array/iteration/findfirst/int |
83253 ns |
84638 ns |
0.98 |
array/iteration/findfirst/bool |
81773 ns |
81595 ns |
1.00 |
array/iteration/scalar |
68503 ns |
67259 ns |
1.02 |
array/iteration/logical |
200037.5 ns |
200755.5 ns |
1.00 |
array/iteration/findmin/1d |
87057.5 ns |
86165.5 ns |
1.01 |
array/iteration/findmin/2d |
117119 ns |
117304.5 ns |
1.00 |
array/reductions/reduce/Int64/1d |
42692 ns |
39371 ns |
1.08 |
array/reductions/reduce/Int64/dims=1 |
42332.5 ns |
44367 ns |
0.95 |
array/reductions/reduce/Int64/dims=2 |
59660 ns |
59429 ns |
1.00 |
array/reductions/reduce/Int64/dims=1L |
87812 ns |
87550 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
84836 ns |
85113.5 ns |
1.00 |
array/reductions/reduce/Float32/1d |
35298 ns |
34268 ns |
1.03 |
array/reductions/reduce/Float32/dims=1 |
40506 ns |
48865.5 ns |
0.83 |
array/reductions/reduce/Float32/dims=2 |
57009 ns |
56196 ns |
1.01 |
array/reductions/reduce/Float32/dims=1L |
51926 ns |
51777 ns |
1.00 |
array/reductions/reduce/Float32/dims=2L |
70017 ns |
69644 ns |
1.01 |
array/reductions/mapreduce/Int64/1d |
42948 ns |
39157 ns |
1.10 |
array/reductions/mapreduce/Int64/dims=1 |
42894 ns |
51282.5 ns |
0.84 |
array/reductions/mapreduce/Int64/dims=2 |
59948 ns |
59191 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=1L |
87934 ns |
87450 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=2L |
85266 ns |
85108.5 ns |
1.00 |
array/reductions/mapreduce/Float32/1d |
34973 ns |
33989 ns |
1.03 |
array/reductions/mapreduce/Float32/dims=1 |
40619 ns |
48728 ns |
0.83 |
array/reductions/mapreduce/Float32/dims=2 |
56809 ns |
56178 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=1L |
51922 ns |
51465 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=2L |
69624 ns |
68877 ns |
1.01 |
array/broadcast |
20842 ns |
20339 ns |
1.02 |
array/copyto!/gpu_to_gpu |
11380 ns |
10634.166666666668 ns |
1.07 |
array/copyto!/cpu_to_gpu |
212669.5 ns |
215032 ns |
0.99 |
array/copyto!/gpu_to_cpu |
282559 ns |
283345.5 ns |
1.00 |
array/accumulate/Int64/1d |
119133 ns |
118179 ns |
1.01 |
array/accumulate/Int64/dims=1 |
79746 ns |
79470 ns |
1.00 |
array/accumulate/Int64/dims=2 |
155805 ns |
155613 ns |
1.00 |
array/accumulate/Int64/dims=1L |
1694276 ns |
1694158 ns |
1.00 |
array/accumulate/Int64/dims=2L |
960988.5 ns |
960865 ns |
1.00 |
array/accumulate/Float32/1d |
101525 ns |
100641.5 ns |
1.01 |
array/accumulate/Float32/dims=1 |
76648 ns |
76173 ns |
1.01 |
array/accumulate/Float32/dims=2 |
143517 ns |
144217.5 ns |
1.00 |
array/accumulate/Float32/dims=1L |
1585237.5 ns |
1584446 ns |
1.00 |
array/accumulate/Float32/dims=2L |
656946 ns |
656699 ns |
1.00 |
array/construct |
1311 ns |
1306.3 ns |
1.00 |
array/random/randn/Float32 |
44211 ns |
41156 ns |
1.07 |
array/random/randn!/Float32 |
27925 ns |
30029 ns |
0.93 |
array/random/rand!/Int64 |
34428 ns |
34605 ns |
0.99 |
array/random/rand!/Float32 |
8674.666666666666 ns |
8182.25 ns |
1.06 |
array/random/rand/Int64 |
37256 ns |
32316 ns |
1.15 |
array/random/rand/Float32 |
13370 ns |
12466 ns |
1.07 |
array/permutedims/4d |
52804.5 ns |
50943 ns |
1.04 |
array/permutedims/2d |
52528 ns |
52373 ns |
1.00 |
array/permutedims/3d |
52984 ns |
52694 ns |
1.01 |
array/sorting/1d |
2735008 ns |
2734863 ns |
1.00 |
array/sorting/by |
3303991.5 ns |
3304190.5 ns |
1.00 |
array/sorting/2d |
1068161 ns |
1068173 ns |
1.00 |
cuda/synchronization/stream/auto |
1044.0666666666666 ns |
1039.7692307692307 ns |
1.00 |
cuda/synchronization/stream/nonblocking |
7484.4 ns |
7691.4 ns |
0.97 |
cuda/synchronization/stream/blocking |
847.3533333333334 ns |
818.6910112359551 ns |
1.04 |
cuda/synchronization/context/auto |
1174.7 ns |
1142.4 ns |
1.03 |
cuda/synchronization/context/nonblocking |
7147.2 ns |
7606.8 ns |
0.94 |
cuda/synchronization/context/blocking |
936.7777777777778 ns |
888.9473684210526 ns |
1.05 |
This comment was automatically generated by workflow using github-action-benchmark.
Contributor
Author
|
@maleadt I removed the nits. |
Contributor
Author
|
Could you merge this PR? |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
No description provided.