Skip to content

Commit 45b9df6

Browse files
authored
Merge branch 'main' into scanops
2 parents 2769c85 + b419422 commit 45b9df6

10 files changed

Lines changed: 592 additions & 107 deletions

File tree

.github/workflows/CI.yml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Continuous integration: build and test the package across a matrix of
# Julia versions, operating systems, and CPU architectures.
name: CI

# Run on every push to main and on every pull request.
on:
  push:
    branches:
      - main
  pull_request:

jobs:
  test:
    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
    # The concrete runner label is supplied by the matrix `include` entries below.
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    strategy:
      # Keep the remaining matrix jobs running even if one configuration fails.
      fail-fast: false
      matrix:
        version:
          - '1.11'
          - '1.12'
        os:
          - Linux
          - Windows
          - macOS
        arch:
          - x64
          - aarch64
        # Drop (os, arch) pairs with no hosted runner available.
        exclude:
          - os: Windows
            arch: aarch64
          - os: macOS
            arch: x64
        # Map each remaining (os, arch) pair to a concrete GitHub-hosted runner.
        include:
          - os: Linux
            arch: x64
            runner: ubuntu-latest
          - os: Linux
            arch: aarch64
            runner: ubuntu-24.04-arm
          - os: Windows
            arch: x64
            runner: windows-latest
          - os: macOS
            arch: aarch64
            runner: macos-latest
    steps:
      - uses: actions/checkout@v4
      - uses: julia-actions/setup-julia@v2
        with:
          version: ${{ matrix.version }}
          arch: ${{ matrix.arch }}
      # Cache Julia depot artifacts between runs to speed up CI.
      - uses: julia-actions/cache@v2
      - uses: julia-actions/julia-buildpkg@v1
      - uses: julia-actions/julia-runtest@v1
        with:
          # Abort the test suite on the first failure.
          test_args: '--quickfail'

Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ IRStructurizer = {path = "IRStructurizer"}
1818
CUDAExt = "CUDA"
1919

2020
[compat]
21+
julia = "1.11"
2122
CUDA_Compiler_jll = "0.4"
2223
CUDA_Tile_jll = "13.1"
2324

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ Benchmarks comparing cuTile.jl against cuTile Python on an RTX 5080:
101101
| Matrix Multiplication | 48.3 TFLOPS | 48.6 TFLOPS | OK (=) |
102102
| Layer Normalization | 254 GB/s | 683 GB/s | https://github.com/JuliaGPU/cuTile.jl/issues/1 (-63%) |
103103
| Batch Matrix Multiply | 31.7 TFLOPS | 31.6 TFLOPS | OK (=) |
104+
| FFT (3-stage Cooley-Tukey) | 508 μs | 230 μs | (-55%) |
104105

105106
Compute-intensive kernels (matmul, batch matmul) perform identically to Python. Memory-bound
106107
kernels (vadd, transpose) are within ~3% of Python. The layernorm kernel is slower due to

examples/benchmarks.jl

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
using CUDA
88
using LinearAlgebra
99
using CUDA: GPUArrays
10+
using FFTW
1011
import cuTile as ct
1112

1213
#=============================================================================
@@ -375,6 +376,14 @@ const BATCHMATMUL_TM = 128
375376
const BATCHMATMUL_TN = 256
376377
const BATCHMATMUL_TK = 64
377378

379+
# FFT benchmark configuration.
# Tile size is (D, BS, N2D), limited by the tileiras compiler.
# Current kernel loads all batches per block, limiting scalability.
const FFT_BATCH = 64              # BS: number of independent FFTs per launch
const FFT_SIZE = 512              # N: FFT length; equals prod(FFT_FACTORS) = 8*8*8
const FFT_FACTORS = (8, 8, 8)     # (F0, F1, F2): Cooley-Tukey radix factors
const FFT_ATOM_PACKING_DIM = 2    # D: real/imag components packed along dim 1
378387
# SIMT naive kernel (2-pass: compute mean/var, then normalize)
379388
function layernorm_simt_kernel!(X, W, B, Y, Mean, Rstd, N, eps)
380389
m = blockIdx().x
@@ -603,6 +612,228 @@ function benchmark_batchmatmul()
603612
return results
604613
end
605614

615+
#=============================================================================
616+
FFT (3-stage Cooley-Tukey) - Column-Major Version
617+
=============================================================================#
618+
619+
# FFT kernel - 3-stage Cooley-Tukey decomposition (column-major)
# Uses swapped dimensions and right-multiply for column-major compatibility.
# Input/output layout: (D, BS, N2D) where D=2 for real/imag interleaving.
#
# Arguments:
#   x_packed_in, y_packed_out — packed input/output arrays, layout (D, BS, N2D)
#   W0, W1, W2 — per-stage DFT matrices, layout (F, F, 2): [:, :, 1] real, [:, :, 2] imag
#   T0, T1     — inter-stage twiddle tables, layout (rows, cols, 2), same split
#   *_const    — compile-time constants: N and its factor products, BS, D, N2D
#
# NOTE(review): each block loads the full (D, BS, N2D) tile at batch offset
# `bid` and stores it back at the same offset; with grid = (BS, 1, 1) in the
# benchmark this looks like every block processes all batches — confirm
# against ct.load/ct.store offset semantics.
function fft_kernel(
    x_packed_in::ct.TileArray{Float32, 3},
    y_packed_out::ct.TileArray{Float32, 3},
    W0::ct.TileArray{Float32, 3},
    W1::ct.TileArray{Float32, 3},
    W2::ct.TileArray{Float32, 3},
    T0::ct.TileArray{Float32, 3},
    T1::ct.TileArray{Float32, 3},
    n_const::ct.Constant{Int},
    f0_const::ct.Constant{Int},
    f1_const::ct.Constant{Int},
    f2_const::ct.Constant{Int},
    f0f1_const::ct.Constant{Int},
    f1f2_const::ct.Constant{Int},
    f0f2_const::ct.Constant{Int},
    bs_const::ct.Constant{Int},
    d_const::ct.Constant{Int},
    n2d_const::ct.Constant{Int}
)
    # Unpack the compile-time constants into plain integers.
    N = n_const[]
    F0 = f0_const[]
    F1 = f1_const[]
    F2 = f2_const[]
    F0F1 = f0f1_const[]
    F1F2 = f1f2_const[]
    F0F2 = f0f2_const[]
    BS = bs_const[]
    D = d_const[]
    N2D = n2d_const[]

    # Block index along grid dimension 1 (ranges over BS in the benchmark launch).
    bid = ct.bid(1)

    # Load input (D, BS, N2D) and reshape to (2, BS, N)
    X_ri = ct.reshape(ct.load(x_packed_in, (1, bid, 1), (D, BS, N2D)), (2, BS, N))
    # Split into real/imag planes and view each as (BS, F1F2, F0) for stage 0.
    X_r = ct.reshape(ct.extract(X_ri, (1, 1, 1), (1, BS, N)), (BS, F1F2, F0))
    X_i = ct.reshape(ct.extract(X_ri, (2, 1, 1), (1, BS, N)), (BS, F1F2, F0))

    # Load DFT matrices; each is stored as (F, F, 2) with real/imag in the
    # last dimension, then broadcast along the batch dimension to (BS, F, F).
    W0_ri = ct.reshape(ct.load(W0, (1, 1, 1), (F0, F0, 2)), (F0, F0, 2))
    W0_r = ct.broadcast_to(ct.reshape(ct.extract(W0_ri, (1, 1, 1), (F0, F0, 1)), (1, F0, F0)), (BS, F0, F0))
    W0_i = ct.broadcast_to(ct.reshape(ct.extract(W0_ri, (1, 1, 2), (F0, F0, 1)), (1, F0, F0)), (BS, F0, F0))

    W1_ri = ct.reshape(ct.load(W1, (1, 1, 1), (F1, F1, 2)), (F1, F1, 2))
    W1_r = ct.broadcast_to(ct.reshape(ct.extract(W1_ri, (1, 1, 1), (F1, F1, 1)), (1, F1, F1)), (BS, F1, F1))
    W1_i = ct.broadcast_to(ct.reshape(ct.extract(W1_ri, (1, 1, 2), (F1, F1, 1)), (1, F1, F1)), (BS, F1, F1))

    W2_ri = ct.reshape(ct.load(W2, (1, 1, 1), (F2, F2, 2)), (F2, F2, 2))
    W2_r = ct.broadcast_to(ct.reshape(ct.extract(W2_ri, (1, 1, 1), (F2, F2, 1)), (1, F2, F2)), (BS, F2, F2))
    W2_i = ct.broadcast_to(ct.reshape(ct.extract(W2_ri, (1, 1, 2), (F2, F2, 1)), (1, F2, F2)), (BS, F2, F2))

    # Load twiddle factors (column-major layout); flattened to row vectors
    # (1, N) so they broadcast across the batch dimension when applied.
    T0_ri = ct.reshape(ct.load(T0, (1, 1, 1), (F1F2, F0, 2)), (F1F2, F0, 2))
    T0_r = ct.reshape(ct.extract(T0_ri, (1, 1, 1), (F1F2, F0, 1)), (1, N))
    T0_i = ct.reshape(ct.extract(T0_ri, (1, 1, 2), (F1F2, F0, 1)), (1, N))

    T1_ri = ct.reshape(ct.load(T1, (1, 1, 1), (F0F2, F1, 2)), (F0F2, F1, 2))
    T1_r = ct.reshape(ct.extract(T1_ri, (1, 1, 1), (F0F2, F1, 1)), (1, F0F2 * F1))
    T1_i = ct.reshape(ct.extract(T1_ri, (1, 1, 2), (F0F2, F1, 1)), (1, F0F2 * F1))

    # Stage 0: F0-point DFT via right-multiply.
    # Complex product split into real/imag parts: (r + i·im)(Wr + Wi·im).
    X_r_ = X_r * W0_r - X_i * W0_i
    X_i_ = X_r * W0_i + X_i * W0_r

    # Twiddle & Permute 0: apply T0 element-wise on the flattened (BS, N)
    # view, then swap the F1/F0 axes so stage 1 multiplies along F1.
    X_r_flat = ct.reshape(X_r_, (BS, N))
    X_i_flat = ct.reshape(X_i_, (BS, N))
    X_r2 = T0_r .* X_r_flat .- T0_i .* X_i_flat
    X_i2 = T0_i .* X_r_flat .+ T0_r .* X_i_flat

    # (BS, F2, F1, F0) -> permute -> (BS, F2, F0, F1) -> (BS, F0F2, F1)
    X_r3 = ct.reshape(X_r2, (BS, F2, F1, F0))
    X_i3 = ct.reshape(X_i2, (BS, F2, F1, F0))
    X_r4 = ct.permute(X_r3, (1, 2, 4, 3))
    X_i4 = ct.permute(X_i3, (1, 2, 4, 3))
    X_r5 = ct.reshape(X_r4, (BS, F0F2, F1))
    X_i5 = ct.reshape(X_i4, (BS, F0F2, F1))

    # Stage 1: F1-point DFT
    X_r6 = X_r5 * W1_r - X_i5 * W1_i
    X_i6 = X_r5 * W1_i + X_i5 * W1_r

    # Twiddle & Permute 1: apply T1, then reorder so stage 2 multiplies along F2.
    X_r_flat2 = ct.reshape(X_r6, (BS, N))
    X_i_flat2 = ct.reshape(X_i6, (BS, N))
    X_r7 = T1_r .* X_r_flat2 .- T1_i .* X_i_flat2
    X_i7 = T1_i .* X_r_flat2 .+ T1_r .* X_i_flat2

    # (BS, F2, F0, F1) -> permute -> (BS, F0, F1, F2) -> (BS, F0F1, F2)
    X_r8 = ct.reshape(X_r7, (BS, F2, F0, F1))
    X_i8 = ct.reshape(X_i7, (BS, F2, F0, F1))
    X_r9 = ct.permute(X_r8, (1, 3, 4, 2))
    X_i9 = ct.permute(X_i8, (1, 3, 4, 2))
    X_r10 = ct.reshape(X_r9, (BS, F0F1, F2))
    X_i10 = ct.reshape(X_i9, (BS, F0F1, F2))

    # Stage 2: F2-point DFT
    X_r11 = X_r10 * W2_r - X_i10 * W2_i
    X_i11 = X_r10 * W2_i + X_i10 * W2_r

    # Final output: bring real and imag back to (1, BS, N) planes.
    X_r_final = ct.reshape(X_r11, (1, BS, N))
    X_i_final = ct.reshape(X_i11, (1, BS, N))

    # Concatenate along dim 1 to re-interleave real/imag, restore the packed
    # (D, BS, N2D) layout, and store at this block's batch offset.
    Y_ri = ct.reshape(ct.cat((X_r_final, X_i_final), 1), (D, BS, N2D))
    ct.store(y_packed_out, (1, bid, 1), Y_ri)

    return
end
729+
730+
"""
    fft_dft_matrix(n::Int)

Return the `n`-point DFT matrix as an `n × n × 2` `Float32` array, with the
real parts in `[:, :, 1]` and the imaginary parts in `[:, :, 2]`.
Entry `(i+1, j+1)` holds `exp(-2πi·i·j/n)` for 0-based `i`, `j`.
"""
function fft_dft_matrix(n::Int)
    # Parameter renamed from `size` to avoid shadowing `Base.size`.
    # Fill the Float32 output directly instead of building a throwaway
    # ComplexF32 matrix and converting it (one fewer allocation; the
    # per-component Float64 -> Float32 rounding is identical).
    result = zeros(Float32, n, n, 2)
    # Column-major friendly order: inner loop runs over the first index.
    for j in 0:n-1, i in 0:n-1
        w = exp(-2π * im * i * j / n)
        result[i+1, j+1, 1] = real(w)
        result[i+1, j+1, 2] = imag(w)
    end
    return result
end
741+
742+
# Twiddle factors T0 for column-major layout (F1F2, F0).
# Entry (j+1, i+1) holds exp(-2πi·i·j/N), split into real ([:, :, 1]) and
# imaginary ([:, :, 2]) planes as Float32.
function fft_make_twiddles_T0(F0::Int, F1F2::Int, N::Int)
    twiddles = zeros(Float32, F1F2, F0, 2)
    for col in 1:F0
        for row in 1:F1F2
            phase = exp(-2π * im * (col - 1) * (row - 1) / N)
            twiddles[row, col, 1] = Float32(real(phase))
            twiddles[row, col, 2] = Float32(imag(phase))
        end
    end
    return twiddles
end
752+
753+
# Twiddle factors T1 for column-major layout (F0F2, F1).
# Only the F2-component of the row index (row - 1 mod F2) enters the phase;
# entry (k+1, j+1) holds exp(-2πi·j·(k mod F2)/(F1·F2)), split into real and
# imaginary Float32 planes.
function fft_make_twiddles_T1(F0::Int, F1::Int, F2::Int)
    nrows = F0 * F2
    denom = F1 * F2
    out = zeros(Float32, nrows, F1, 2)
    for row in 1:nrows, col in 1:F1
        rem2 = (row - 1) % F2
        w = exp(-2π * im * (col - 1) * rem2 / denom)
        out[row, col, 1] = Float32(real(w))
        out[row, col, 2] = Float32(imag(w))
    end
    return out
end
766+
767+
# Build all stage tables for a 3-factor Cooley-Tukey FFT of length
# prod(factors): one DFT matrix per radix plus the two inter-stage twiddle
# tables. Returns (W0, W1, W2, T0, T1).
function fft_make_twiddles(factors::NTuple{3, Int})
    f0, f1, f2 = factors
    total = f0 * f1 * f2
    dfts = map(fft_dft_matrix, (f0, f1, f2))
    t0 = fft_make_twiddles_T0(f0, f1 * f2, total)
    t1 = fft_make_twiddles_T1(f0, f1, f2)
    return (dfts[1], dfts[2], dfts[3], t0, t1)
end
778+
779+
# Benchmark the cuTile FFT kernel against an FFTW reference result.
# Verifies correctness first (rtol 1e-3), then times the kernel alone and
# reports GFLOPS using the conventional 5·N·log2(N) FFT flop count.
# Returns the vector of BenchmarkResult records.
function benchmark_fft()
    println("\nBenchmarking FFT...")
    BS, N = FFT_BATCH, FFT_SIZE
    F0, F1, F2 = FFT_FACTORS
    D = FFT_ATOM_PACKING_DIM
    # 8 bytes per ComplexF32 element.
    println(" Size: $BS batches × $N FFT ($(BS * N * 8 / 1e6) MB)")

    # Create complex input (seeded for reproducibility).
    CUDA.seed!(42)
    input = CUDA.randn(ComplexF32, BS, N)

    # Reference result (FFTW), computed on the CPU along the FFT dimension (2).
    reference = FFTW.fft(Array(input), 2)

    results = BenchmarkResult[]

    # Pre-compute twiddles (one-time CPU cost), then upload to the GPU.
    W0, W1, W2, T0, T1 = fft_make_twiddles(FFT_FACTORS)
    W0_gpu, W1_gpu, W2_gpu = CuArray(W0), CuArray(W1), CuArray(W2)
    T0_gpu, T1_gpu = CuArray(T0), CuArray(T1)

    # Pre-pack input (zero-copy): reinterpret ComplexF32 (BS, N) as
    # Float32 with a leading real/imag dimension, matching (D, BS, N2D).
    N2D = N * 2 ÷ D
    x_packed = reinterpret(reshape, Float32, input)
    y_packed = CUDA.zeros(Float32, D, BS, N2D)

    # Kernel launch parameters.
    # NOTE(review): grid = (BS, 1, 1) while the kernel loads all BS batches
    # per block — confirm whether one block would suffice.
    F0F1, F1F2, F0F2 = F0 * F1, F1 * F2, F0 * F2
    grid = (BS, 1, 1)

    # Kernel-only timing function (closure captures all launch arguments).
    cutile_kernel_f = () -> ct.launch(fft_kernel, grid,
        x_packed, y_packed,
        W0_gpu, W1_gpu, W2_gpu, T0_gpu, T1_gpu,
        ct.Constant(N), ct.Constant(F0), ct.Constant(F1), ct.Constant(F2),
        ct.Constant(F0F1), ct.Constant(F1F2), ct.Constant(F0F2),
        ct.Constant(BS), ct.Constant(D), ct.Constant(N2D))

    # Verify correctness against the FFTW reference before timing.
    cutile_kernel_f()
    CUDA.synchronize()
    # Reinterpret the packed Float32 output back to ComplexF32.
    y_complex = reinterpret(reshape, ComplexF32, y_packed)
    output = copy(y_complex)
    @assert isapprox(Array(output), reference, rtol=1e-3) "cuTile FFT incorrect!"

    # Benchmark kernel only (no host-side packing or transfers in the loop).
    min_t, mean_t = benchmark_kernel(cutile_kernel_f)
    push!(results, BenchmarkResult("cuTile.jl", min_t, mean_t))

    # Performance metric: GFLOPS (5 * N * log2(N) per complex FFT)
    flops_per_fft = 5.0 * N * log2(N)
    total_flops = BS * flops_per_fft
    # r.min_ms is in milliseconds; convert to seconds before dividing.
    gflops = [string(round(total_flops / (r.min_ms * 1e-3) / 1e9, digits=1), " GFLOPS") for r in results]

    print_table("FFT (ComplexF32)", results; extra_col=("Performance", gflops))
    return results
end
836+
606837
#=============================================================================
607838
Main
608839
=============================================================================#
@@ -622,6 +853,7 @@ function main()
622853
matmul_results = benchmark_matmul()
623854
layernorm_results = benchmark_layernorm()
624855
batchmatmul_results = benchmark_batchmatmul()
856+
fft_results = benchmark_fft()
625857

626858
println()
627859
println("=" ^ 60)

0 commit comments

Comments
 (0)