Also trial AMD support

kshyatt · kshyatt · commit 9721b1eb9baf · 2026-02-26T05:12:37.000-05:00
diff --git a/Project.toml b/Project.toml
@@ -9,14 +9,17 @@ StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
 TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 
 [weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
+StridedAMDGPUExt = "AMDGPU"
 StridedGPUArraysExt = "GPUArrays"
 StridedCUDAExt = "CUDA"
 
 [compat]
+AMDGPU = "2"
 Aqua = "0.8"
 CUDA = "5"
 GPUArrays = "11.4.1"
@@ -28,11 +31,12 @@ TupleTools = "1.6"
 julia = "1.6"
 
 [extras]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Aqua", "CUDA", "GPUArrays"]
+test = ["Test", "Random", "Aqua", "AMDGPU", "CUDA", "GPUArrays"]
diff --git a/ext/StridedAMDGPUExt.jl b/ext/StridedAMDGPUExt.jl
@@ -0,0 +1,16 @@
+module StridedAMDGPUExt  # package extension; Project.toml lists it under [extensions] keyed on AMDGPU
+
+using Strided, StridedViews, AMDGPU
+using AMDGPU: Adapt  # NOTE(review): Adapt is not referenced anywhere in this module — confirm it is still needed
+using AMDGPU: GPUArrays
+
+const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}  # function types accepted for the views' fourth type parameter below
+
+function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: ROCArray{TD}, FD <: ALL_FS, TS, NS, TAS <: ROCArray{TS}, FS <: ALL_FS}  # dispatches only when both views carry a ROCArray as third type parameter
+    bc_style = Base.Broadcast.BroadcastStyle(TAS)  # broadcast style looked up from the source's ROCArray type parameter
+    bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))  # lazy identity broadcast of src, declared over dst's axes
+    GPUArrays._copyto!(dst, bc)  # NOTE(review): underscore-prefixed GPUArrays function — presumably internal API; confirm it is stable across GPUArrays versions
+    return dst  # copy! hands back the destination view
+end
+
+end
diff --git a/ext/StridedCUDAExt.jl b/ext/StridedCUDAExt.jl
@@ -1,17 +1,13 @@
 module StridedCUDAExt
 
-using Strided, CUDA
+using Strided, StridedViews, CUDA
 using CUDA: Adapt, KernelAdaptor
 using CUDA: GPUArrays
 
 const ALL_FS = Union{typeof(adjoint), typeof(conj), typeof(identity), typeof(transpose)}
 
-function Adapt.adapt_storage(to::KernelAdaptor, xs::StridedView{T,N,TA,F}) where {T,N,TA<:CuArray{T},F <: ALL_FS}
-    return StridedView(Adapt.adapt(to, parent(xs)), xs.size, xs.strides, xs.offset, xs.op)
-end
-
 function Base.copy!(dst::StridedView{TD, ND, TAD, FD}, src::StridedView{TS, NS, TAS, FS}) where {TD, ND, TAD <: CuArray{TD}, FD <: ALL_FS, TS, NS, TAS <: CuArray{TS}, FS <: ALL_FS}
-    bc_style = Base.Broadcast.BroadcastStyle(TAS) 
+    bc_style = Base.Broadcast.BroadcastStyle(TAS)
     bc = Base.Broadcast.Broadcasted(bc_style, identity, (src,), axes(dst))
     GPUArrays._copyto!(dst, bc)
     return dst
diff --git a/test/amd.jl b/test/amd.jl
@@ -0,0 +1,14 @@
+for T in (Float32, Float64, Complex{Float32}, Complex{Float64})  # element types exercised by the copy! tests
+    @testset "Copy with ROCStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
+        for m1 in (0, 16, 32), m2 in (0, 16, 32)  # square and rectangular sizes, including zero-length dimensions
+            A1 = AMDGPU.randn(T, (m1, m2))
+            A2 = similar(A1)
+            A1c = copy(A1)  # independent buffers backing the StridedView operands
+            A2c = copy(A2)
+            B1 = f1(StridedView(A1c))
+            B2 = f2(StridedView(A2c))
+            axes(f1(A1)) == axes(f2(A2)) || continue  # skip (f1, f2, m1, m2) combinations whose axes differ
+            @test collect(ROCMatrix(copy!(f2(A2), f1(A1)))) == AMDGPU.Adapt.adapt(Vector{T}, copy!(B2, B1))  # Adapt is qualified through AMDGPU because the test suite no longer imports it by name
+        end
+    end
+end
diff --git a/test/cuda.jl b/test/cuda.jl
@@ -1,14 +1,14 @@
 for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
-    m1 = 32
-    m2 = 16
     @testset "Copy with CuStridedView: $T, $f1, $f2" for f2 in (identity, conj, adjoint, transpose), f1 in (identity, conj, transpose, adjoint)
-        A1 = CUDA.randn(T, (m1, m2))
-        A2 = similar(A1)
-        A1c = copy(A1)
-        A2c = copy(A2)
-        B1 = f1(StridedView(A1c))
-        B2 = f2(StridedView(A2c))
-        axes(f1(A1)) == axes(f2(A2)) || continue
-        @test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == Adapt.adapt(Vector{T}, copy!(B2, B1))
+        for m1 in (0, 16, 32), m2 in (0, 16, 32)  # replaces the fixed 32x16 case; now includes zero-length dimensions
+            A1 = CUDA.randn(T, (m1, m2))
+            A2 = similar(A1)
+            A1c = copy(A1)  # independent buffers backing the StridedView operands
+            A2c = copy(A2)
+            B1 = f1(StridedView(A1c))
+            B2 = f2(StridedView(A2c))
+            axes(f1(A1)) == axes(f2(A2)) || continue  # skip (f1, f2, m1, m2) combinations whose axes differ
+            @test collect(CuMatrix(copy!(f2(A2), f1(A1)))) == CUDA.Adapt.adapt(Vector{T}, copy!(B2, B1))  # compares the plain-array copy! result with the StridedView copy! result
+        end
     end
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -4,8 +4,7 @@ using Random
 using Strided
 using Strided: StridedView
 using Aqua
-using CUDA, GPUArrays
-using CUDA: Adapt
+using AMDGPU, CUDA, GPUArrays
 
 Random.seed!(1234)
 
@@ -29,9 +28,13 @@ if !is_buildkite
     include("blasmultests.jl")
     Strided.disable_threaded_mul()
 
-    Aqua.test_all(Strided; piracies=false)
+    Aqua.test_all(Strided; piracies = false)
 end
 
 if CUDA.functional()
     include("cuda.jl")
 end
+
+if AMDGPU.functional()  # same pattern as the CUDA guard above: include the AMD tests only when this returns true
+    include("amd.jl")
+end