diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 27f4f36..5c04ebf 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -52,7 +52,7 @@ jobs: - name: Install cargo-gpu run: | - cargo install --git https://github.com/Rust-GPU/cargo-gpu cargo-gpu + cargo install cargo-gpu --version 0.10.0-alpha.1 cargo gpu install --auto-install-rust-toolchain - name: Run clippy lints @@ -80,7 +80,7 @@ jobs: - name: Install cargo-gpu run: | - cargo install --git https://github.com/Rust-GPU/cargo-gpu cargo-gpu + cargo install cargo-gpu --version 0.10.0-alpha.1 cargo gpu install --auto-install-rust-toolchain - name: Check documentation @@ -106,7 +106,7 @@ jobs: sweep-cache: true - name: Install cargo-gpu run: | - cargo install --git https://github.com/Rust-GPU/cargo-gpu cargo-gpu + cargo install cargo-gpu --version 0.10.0-alpha.1 cargo gpu install --auto-install-rust-toolchain - name: Run Cargo Tests run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index e69de29..12ede24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -0,0 +1,15 @@ +# Changelog + +_Disclaimer: this changelog is updated using generative AI, but is still verified manually._ + +## v0.1.0 + +This shows the changes between the time of open-sourcing the crate and its first release to crates.io: + +### Added +- `println!` support for shaders running on the CPU backend (`khal-std`). + +### Changed +- Switch `spirv-std` and `spirv-std-macros` to the published `0.10.0-alpha.1` release (previously pinned to a git revision). +- Cache coroutines on the CPU backend for improved performance. +- Enable incremental builds in the workspace to work around a `rust-gpu` issue where the example shader entrypoint was being dropped. diff --git a/README.md b/README.md index 3a30b57..fc3a270 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,9 @@ on any platform: **WebGPU**, **CUDA**, or **CPU** -- from a single codebase. > **Warning** -> KHAL is still under heavy development. 
+> KHAL is still under heavy development. The CUDA backend is currently only supported when using the +> GitHub version of `khal-std` (because some dependencies are not available on crates.io yet). If you +> don’t intend to target CUDA, then the published version of `khal-std` is the way to go.
@@ -25,26 +27,23 @@ on any platform: **WebGPU**, **CUDA**, or **CPU** -- from a single codebase.
- **Proc-macro bindings** -- `#[spirv_bindgen]` generates type-safe host-side structs from your shader function signature.
- **Build pipeline** -- `khal-builder` orchestrates `cargo gpu` and `cargo cuda` to compile shaders at build time.
-
## Development setup
### cargo-gpu (required for SPIR-V / WebGPU)
-The crates.io version of `cargo-gpu` is outdated. Install from Git and let it set up its Rust
-toolchain:
+Install `cargo-gpu` from crates.io:
```bash
-cargo install --git https://github.com/Rust-GPU/cargo-gpu cargo-gpu
+cargo install cargo-gpu --version 0.10.0-alpha.1
cargo gpu install
```
### cargo-cuda (required for CUDA / PTX)
-`cargo-cuda` lives in this repository (`crates/cargo-cuda`). Install it from the workspace and
-build the `rustc_codegen_nvvm` codegen backend:
+Install `cargo-cuda` from crates.io:
```bash
-cargo install --path https://github.com/dimforge/khal cargo-cuda
+cargo install cargo-cuda --version 0.1.0
cargo cuda install
```
diff --git a/crates/khal-example-shaders/src/lib.rs b/crates/khal-example-shaders/src/lib.rs
index a455093..167a116 100644
--- a/crates/khal-example-shaders/src/lib.rs
+++ b/crates/khal-example-shaders/src/lib.rs
@@ -1,6 +1,7 @@
#![cfg_attr(target_arch = "spirv", no_std)]
use khal_std::glamx::UVec3;
+use khal_std::index::MaybeIndexUnchecked;
use khal_std::macros::{spirv, spirv_bindgen};
#[spirv_bindgen]
@@ -11,7 +12,7 @@ pub fn add_assign(
#[spirv(storage_buffer, descriptor_set = 0, binding = 1)] b: &[f32],
) {
let thread_id = invocation_id.x as usize;
- if thread_id < a.len() {
- a[thread_id] += b[thread_id];
+ if thread_id < a.len() && thread_id < b.len() {
+ *a.at_mut(thread_id) += b.read(thread_id);
}
}
diff --git a/crates/khal-example/shaders-spirv/add_assign.spv b/crates/khal-example/shaders-spirv/add_assign.spv
index d88ad1c..2d47075 100644
Binary files a/crates/khal-example/shaders-spirv/add_assign.spv and b/crates/khal-example/shaders-spirv/add_assign.spv differ
diff --git a/crates/khal-std/Cargo.toml b/crates/khal-std/Cargo.toml
index 864b617..53f6b27 100644
--- a/crates/khal-std/Cargo.toml
+++ b/crates/khal-std/Cargo.toml
@@ -24,14 +24,14 @@ unsafe_remove_boundchecks = []
glamx = { version = "0.2", default-features = false, features = ["nostd-libm", "bytemuck"] }
rayon = { version = "1", optional = true }
corosensei = { version = "0.3", optional = true }
-spirv-std-macros = { git = "https://github.com/Rust-GPU/rust-gpu.git", rev = "6a67e7b5" }
+spirv-std-macros = "0.10.0-alpha.1"
khal-derive = { path = "../khal-derive" }
[lints]
workspace = true
[target.'cfg(not(target_arch = "nvptx64"))'.dependencies]
-spirv-std = { git = "https://github.com/Rust-GPU/rust-gpu.git", rev = "6a67e7b5" }
+spirv-std = "0.10.0-alpha.1"
[target.'cfg(target_arch = "nvptx64")'.dependencies]
# Fixes the UVec3::element_product bug
diff --git a/crates/khal-std/src/arch/cpu.rs b/crates/khal-std/src/arch/cpu.rs
index 4de5d72..acca30b 100644
--- a/crates/khal-std/src/arch/cpu.rs
+++ b/crates/khal-std/src/arch/cpu.rs
@@ -9,10 +9,14 @@
//! we simulate GPU threads as lightweight stackful coroutines (via `corosensei`)
//! that yield at each barrier. A single OS thread runs all coroutines
//! cooperatively, with zero OS scheduling overhead.
+//!
+//! Coroutine stacks are pooled per-thread to avoid repeated mmap/munmap
+//! syscalls across dispatches.
extern crate std;
use std::cell::Cell;
+use std::cell::RefCell;
// =============================================================================
// Barrier: yields the current coroutine back to the scheduler
@@ -63,13 +67,46 @@ pub fn dispatch_workgroups(num_workgroups: usize, f: impl Fn(u32) + Sync + Send)
}
// =============================================================================
-// Intra-workgroup dispatch (using corosensei coroutines)
+// Intra-workgroup dispatch (using corosensei coroutines with stack pooling)
// =============================================================================
+/// Stack size for coroutines. Shader functions use very little stack space
+/// (local variables and small arrays), so 64 KiB is more than sufficient.
+const COROUTINE_STACK_SIZE: usize = 64 * 1024;
+
thread_local! {
/// Pointer to the active Yielder (null when not in coroutine mode).
/// Each coroutine sets this before calling the work function.
static COROUTINE_YIELDER: Cell<*mut corosensei::Yielder<(), ()>> = const { Cell::new(std::ptr::null_mut()) };
+
+ /// Pool of reusable coroutine stacks. Stacks are allocated on first use
+ /// and returned to the pool after each dispatch, avoiding repeated
+ /// mmap/munmap syscalls.
+ static STACK_POOL: RefCell