diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 27f4f36..5c04ebf 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -52,7 +52,7 @@ jobs: - name: Install cargo-gpu run: | - cargo install --git https://github.com/Rust-GPU/cargo-gpu cargo-gpu + cargo install cargo-gpu --version 0.10.0-alpha.1 cargo gpu install --auto-install-rust-toolchain - name: Run clippy lints @@ -80,7 +80,7 @@ jobs: - name: Install cargo-gpu run: | - cargo install --git https://github.com/Rust-GPU/cargo-gpu cargo-gpu + cargo install cargo-gpu --version 0.10.0-alpha.1 cargo gpu install --auto-install-rust-toolchain - name: Check documentation @@ -106,7 +106,7 @@ jobs: sweep-cache: true - name: Install cargo-gpu run: | - cargo install --git https://github.com/Rust-GPU/cargo-gpu cargo-gpu + cargo install cargo-gpu --version 0.10.0-alpha.1 cargo gpu install --auto-install-rust-toolchain - name: Run Cargo Tests run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index e69de29..12ede24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -0,0 +1,15 @@ +# Changelog + +_Disclaimer: this changelog is updated using generative AI, but is still verified manually._ + +## v0.1.0 + +This shows the changes between the time of open-sourcing the crate and its first release to crates.io: + +### Added +- `println!` support for shaders running on the CPU backend (`khal-std`). + +### Changed +- Switch `spirv-std` and `spirv-std-macros` to the published `0.10.0-alpha.1` release (previously pinned to a git revision). +- Cache coroutines on the CPU backend for improved performance. +- Enable incremental builds in the workspace to work around a `rust-gpu` issue where the example shader entrypoint was being dropped. diff --git a/README.md b/README.md index 3a30b57..fc3a270 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,9 @@ on any platform: **WebGPU**, **CUDA**, or **CPU** -- from a single codebase. > **Warning** -> KHAL is still under heavy development. 
+> KHAL is still under heavy development. The CUDA backend is currently only supported when using the +> GitHub version of `khal-std` (because some dependencies are not available on crates.io yet). If you +> don’t intend to target CUDA, then the published version of `khal-std` is the way to go.

@@ -25,26 +27,23 @@ on any platform: **WebGPU**, **CUDA**, or **CPU** -- from a single codebase. - **Proc-macro bindings** -- `#[spirv_bindgen]` generates type-safe host-side structs from your shader function signature. - **Build pipeline** -- `khal-builder` orchestrates `cargo gpu` and `cargo cuda` to compile shaders at build time. - ## Development setup ### cargo-gpu (required for SPIR-V / WebGPU) -The crates.io version of `cargo-gpu` is outdated. Install from Git and let it set up its Rust -toolchain: +Install `cargo-gpu` from crates.io: ```bash -cargo install --git https://github.com/Rust-GPU/cargo-gpu cargo-gpu +cargo install cargo-gpu --version 0.10.0-alpha.1 cargo gpu install ``` ### cargo-cuda (required for CUDA / PTX) -`cargo-cuda` lives in this repository (`crates/cargo-cuda`). Install it from the workspace and -build the `rustc_codegen_nvvm` codegen backend: +Install `cargo-cuda` from crates.io: ```bash -cargo install --path https://github.com/dimforge/khal cargo-cuda +cargo install cargo-cuda --version 0.1.0 cargo cuda install ``` diff --git a/crates/khal-example-shaders/src/lib.rs b/crates/khal-example-shaders/src/lib.rs index a455093..167a116 100644 --- a/crates/khal-example-shaders/src/lib.rs +++ b/crates/khal-example-shaders/src/lib.rs @@ -1,6 +1,7 @@ #![cfg_attr(target_arch = "spirv", no_std)] use khal_std::glamx::UVec3; +use khal_std::index::MaybeIndexUnchecked; use khal_std::macros::{spirv, spirv_bindgen}; #[spirv_bindgen] @@ -11,7 +12,7 @@ pub fn add_assign( #[spirv(storage_buffer, descriptor_set = 0, binding = 1)] b: &[f32], ) { let thread_id = invocation_id.x as usize; - if thread_id < a.len() { - a[thread_id] += b[thread_id]; + if thread_id < a.len() && thread_id < b.len() { + *a.at_mut(thread_id) += b.read(thread_id); } } diff --git a/crates/khal-example/shaders-spirv/add_assign.spv b/crates/khal-example/shaders-spirv/add_assign.spv index d88ad1c..2d47075 100644 Binary files a/crates/khal-example/shaders-spirv/add_assign.spv and 
b/crates/khal-example/shaders-spirv/add_assign.spv differ diff --git a/crates/khal-std/Cargo.toml b/crates/khal-std/Cargo.toml index 864b617..53f6b27 100644 --- a/crates/khal-std/Cargo.toml +++ b/crates/khal-std/Cargo.toml @@ -24,14 +24,14 @@ unsafe_remove_boundchecks = [] glamx = { version = "0.2", default-features = false, features = ["nostd-libm", "bytemuck"] } rayon = { version = "1", optional = true } corosensei = { version = "0.3", optional = true } -spirv-std-macros = { git = "https://github.com/Rust-GPU/rust-gpu.git", rev = "6a67e7b5" } +spirv-std-macros = "0.10.0-alpha.1" khal-derive = { path = "../khal-derive" } [lints] workspace = true [target.'cfg(not(target_arch = "nvptx64"))'.dependencies] -spirv-std = { git = "https://github.com/Rust-GPU/rust-gpu.git", rev = "6a67e7b5" } +spirv-std = "0.10.0-alpha.1" [target.'cfg(target_arch = "nvptx64")'.dependencies] # Fixes the UVec3::element_product bug diff --git a/crates/khal-std/src/arch/cpu.rs b/crates/khal-std/src/arch/cpu.rs index 4de5d72..acca30b 100644 --- a/crates/khal-std/src/arch/cpu.rs +++ b/crates/khal-std/src/arch/cpu.rs @@ -9,10 +9,14 @@ //! we simulate GPU threads as lightweight stackful coroutines (via `corosensei`) //! that yield at each barrier. A single OS thread runs all coroutines //! cooperatively, with zero OS scheduling overhead. +//! +//! Coroutine stacks are pooled per-thread to avoid repeated mmap/munmap +//! syscalls across dispatches. 
extern crate std; use std::cell::Cell; +use std::cell::RefCell; // ============================================================================= // Barrier: yields the current coroutine back to the scheduler @@ -63,13 +67,46 @@ pub fn dispatch_workgroups(num_workgroups: usize, f: impl Fn(u32) + Sync + Send) } // ============================================================================= -// Intra-workgroup dispatch (using corosensei coroutines) +// Intra-workgroup dispatch (using corosensei coroutines with stack pooling) // ============================================================================= +/// Stack size for coroutines. Shader functions use very little stack space +/// (local variables and small arrays), so 64KB is more than sufficient. +const COROUTINE_STACK_SIZE: usize = 64 * 1024; + thread_local! { /// Pointer to the active Yielder (null when not in coroutine mode). /// Each coroutine sets this before calling the work function. static COROUTINE_YIELDER: Cell<*mut corosensei::Yielder<(), ()>> = const { Cell::new(std::ptr::null_mut()) }; + + /// Pool of reusable coroutine stacks. Stacks are allocated on first use + /// and returned to the pool after each dispatch, avoiding repeated + /// mmap/munmap syscalls. + static STACK_POOL: RefCell<Vec<corosensei::stack::DefaultStack>> = RefCell::new(Vec::new()); +} + +/// Takes `count` stacks from the thread-local pool, allocating new ones if needed. +fn take_stacks(count: usize) -> Vec<corosensei::stack::DefaultStack> { + STACK_POOL.with(|pool| { + let mut pool = pool.borrow_mut(); + let reusable = count.min(pool.len()); + let drain_start = pool.len() - reusable; + let mut stacks: Vec<corosensei::stack::DefaultStack> = pool.drain(drain_start..).collect(); + for _ in stacks.len()..count { + stacks.push( + corosensei::stack::DefaultStack::new(COROUTINE_STACK_SIZE) + .expect("failed to allocate coroutine stack"), + ); + } + stacks + }) +} + +/// Returns stacks to the thread-local pool for reuse. 
+fn return_stacks(stacks: impl IntoIterator<Item = corosensei::stack::DefaultStack>) { + STACK_POOL.with(|pool| { + pool.borrow_mut().extend(stacks); + }); } /// Dispatches `num_threads` virtual threads using cooperative coroutines. @@ -79,6 +116,7 @@ thread_local! { /// reached the barrier), the scheduler resumes them for the next phase. /// /// This runs on a single OS thread with zero OS scheduling overhead. +/// Coroutine stacks are pooled to avoid repeated allocation. pub fn dispatch_workgroup_threads(num_threads: usize, f: impl Fn(u32) + Sync) { use corosensei::{Coroutine, CoroutineResult}; @@ -88,41 +126,50 @@ pub fn dispatch_workgroup_threads(num_threads: usize, f: impl Fn(u32) + Sync) { let f_ref: &'static (dyn Fn(u32) + Sync) = unsafe { core::mem::transmute(&f as &(dyn Fn(u32) + Sync)) }; - // Create one coroutine per virtual thread. - let mut coroutines: Vec<Option<Coroutine<(), (), ()>>> = (0..num_threads) - .map(|tid| { - Some(Coroutine::new(move |yielder, ()| { - // Store the yielder pointer in TLS so barrier_wait() can find it. - COROUTINE_YIELDER.with(|cell| { - cell.set(yielder as *const _ as *mut _); - }); - f_ref(tid as u32); - // Clear the yielder pointer. - COROUTINE_YIELDER.with(|cell| cell.set(std::ptr::null_mut())); - })) - }) - .collect(); + // Take stacks from the pool (reuses existing ones, allocates only if needed). + let stacks = take_stacks(num_threads); + + // Create one coroutine per virtual thread, using pooled stacks. + let mut coroutines: Vec<Option<Coroutine<(), (), ()>>> = + stacks + .into_iter() + .enumerate() + .map(|(tid, stack)| { + Some(Coroutine::with_stack(stack, move |yielder, ()| { + // Store the yielder pointer in TLS so barrier_wait() can find it. + COROUTINE_YIELDER.with(|cell| { + cell.set(yielder as *const _ as *mut _); + }); + f_ref(tid as u32); + // Clear the yielder pointer. + COROUTINE_YIELDER.with(|cell| cell.set(std::ptr::null_mut())); + })) + }) + .collect(); // Run all coroutines in round-robin until all complete. // Each "round" corresponds to one barrier synchronization point. 
+ // Completed coroutines have their stacks recovered for pooling. + let mut recovered_stacks = Vec::with_capacity(num_threads); loop { let mut all_done = true; - for slot in coroutines.iter_mut() { - if let Some(coroutine) = slot { - match coroutine.resume(()) { - CoroutineResult::Yield(()) => { - // Coroutine yielded at a barrier — continue to next one. - all_done = false; - } - CoroutineResult::Return(()) => { - // Coroutine completed — remove it. - *slot = None; - } + for i in 0..coroutines.len() { + let result = coroutines[i].as_mut().map(|c| c.resume(())); + match result { + Some(CoroutineResult::Yield(())) => { + all_done = false; + } + Some(CoroutineResult::Return(())) => { + recovered_stacks.push(coroutines[i].take().unwrap().into_stack()); } + None => {} } } if all_done { break; } } + + // Return stacks to the pool for reuse by future dispatches. + return_stacks(recovered_stacks); } diff --git a/crates/khal-std/src/lib.rs b/crates/khal-std/src/lib.rs index 9ad4d93..c154e5f 100644 --- a/crates/khal-std/src/lib.rs +++ b/crates/khal-std/src/lib.rs @@ -28,3 +28,12 @@ pub use glamx; #[cfg(target_arch = "nvptx64")] pub use cuda_std; + +#[cfg(not(any(target_arch = "spirv", target_arch = "nvptx64")))] +pub use std::println; +#[cfg(any(target_arch = "spirv", target_arch = "nvptx64"))] +#[macro_export] +macro_rules! println { + () => {}; + ($($arg:tt)*) => {}; +}