-
-
Notifications
You must be signed in to change notification settings - Fork 14.8k
Add intrinsic for launch-sized workgroup memory on GPUs #146181
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -299,10 +299,12 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M, | |
| .getCallee()); | ||
| } | ||
|
|
||
| extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, | ||
| const char *Name, | ||
| size_t NameLen, | ||
| LLVMTypeRef Ty) { | ||
| // Get the global variable with the given name if it exists or create a new | ||
| // external global. | ||
| extern "C" LLVMValueRef | ||
| LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name, | ||
|
Comment on lines
+304
to
+305
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If this function will always create a specifically-extern global, can we document that here? The name is whatever, to me.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I added a comment for that (it already did that before my change, I only added the addrspace argument). |
||
| size_t NameLen, LLVMTypeRef Ty, | ||
| unsigned int AddressSpace) { | ||
| Module *Mod = unwrap(M); | ||
| auto NameRef = StringRef(Name, NameLen); | ||
|
|
||
|
|
@@ -313,10 +315,24 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, | |
| GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true); | ||
| if (!GV) | ||
| GV = new GlobalVariable(*Mod, unwrap(Ty), false, | ||
| GlobalValue::ExternalLinkage, nullptr, NameRef); | ||
| GlobalValue::ExternalLinkage, nullptr, NameRef, | ||
| nullptr, GlobalValue::NotThreadLocal, AddressSpace); | ||
| return wrap(GV); | ||
| } | ||
|
|
||
| // Get the global variable with the given name if it exists or create a new | ||
| // external global. | ||
| extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, | ||
| const char *Name, | ||
| size_t NameLen, | ||
| LLVMTypeRef Ty) { | ||
| Module *Mod = unwrap(M); | ||
| unsigned int AddressSpace = | ||
| Mod->getDataLayout().getDefaultGlobalsAddressSpace(); | ||
| return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty, | ||
| AddressSpace); | ||
| } | ||
|
|
||
| // Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`. | ||
| enum class LLVMRustAttributeKind { | ||
| AlwaysInline = 0, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,46 @@ | |
|
|
||
| #![unstable(feature = "gpu_intrinsics", issue = "none")] | ||
|
|
||
| /// Returns the pointer to workgroup memory allocated at launch-time on GPUs. | ||
| /// | ||
| /// Workgroup memory is a memory region that is shared between all threads in | ||
| /// the same workgroup. It is faster to access than other memory but pointers do not | ||
| /// work outside the workgroup where they were obtained. | ||
| /// Workgroup memory can be allocated statically or after compilation, when | ||
| /// launching a gpu-kernel. `gpu_launch_sized_workgroup_mem` returns the pointer to | ||
| /// the memory that is allocated at launch-time. | ||
| /// The size of this memory can differ between launches of a gpu-kernel, depending on | ||
| /// what is specified at launch-time. | ||
| /// However, the alignment is fixed by the kernel itself, at compile-time. | ||
| /// | ||
| /// The returned pointer is the start of the workgroup memory region that is | ||
| /// allocated at launch-time. | ||
| /// All calls to `gpu_launch_sized_workgroup_mem` in a workgroup, independent of the | ||
| /// generic type, return the same address, so alias the same memory. | ||
| /// The returned pointer is aligned by at least the alignment of `T`. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @RalfJung I don't think there is anything in these docs that currently supports this assumption, so I wasn't particularly concerned about it: the docs only tie the alignment of "[this]" returned pointer to "[this]" T, and Rust also isn't really known for spooky actions at a distance that would support other interpretations. But if both you and Jubilee are concerned, we can also be more explicit. Do you prefer this (please feel free to suggest better wording)?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This intrinsic adds spooky action at a distance, that's why I am so concerned. ;) All invocations of the intrinsic return the same pointer, so they magically affect each other in terms of alignment. This is somewhat contradicting the statement that they all return the same address. I'd propose something like:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The two properties guaranteed by the intrinsic are
That allows a bunch of implications, but I don’t think they are important. The core goal is, you want to use launch-sized workgroup mem, you call the intrinsic with your needed alignment, you use the pointer. That’s it, nothing else needed, no other derived guarantees used.
The two properties allow inferring even wider guarantees. The two core guarantees are written down in the docs. If there is no use-case for such inferred guarantees (I cannot think of any), I fear that writing down inferred guarantees in the docs adds more confusion than it helps.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That's great. But other people will read these docs, notice the implications, and if it even remotely fits their usecase they will (ab)use everything they can find. If there are implications of our spec, or things that seem like implications, that we don't actually intend to be used or guaranteed, then we can't just hope that people will not use them. We have to make it explicit, or someone will use them.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This spec is extremely problematic. We do not allow time travel in Rust; time travel usually leads to semantic contradictions. That's why I insist on a clarification like what I described: we do absolutely not want code that might be executed in the future to affect the reasoning I am allowed to do here and now. If there truly is no usecase for such "time travel" use of the intrinsic, then my proposed clarification should be uncontroversial.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure I get that. For me, what I wrote follows logically from the two properties (alignment + all invocations return the same pointer). Let me try to explain with an example (pseudo-code):

```rust
fn main() {
    let p = gpu_launch_sized_workgroup_mem::<u32>();
    // As I understand it, you say we should declare that this assert can fail
    assert!(p is aligned to at least 8 byte);
    let p2 = gpu_launch_sized_workgroup_mem::<u64>();
    // This is guaranteed to be true
    assert!(p2 is aligned to at least 8 byte);
    assert_eq!(p, p2);
}
```

Given that `p` and `p2` are guaranteed to be equal, am I missing something here?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The problem is defining what exactly "all invocations" are. Only the invocations that are actually executed in this run of the program matter. And since it's impossible to tell whether the program will actually reach an invocation further down the code, I think we want to be very sure to exclude any reasoning "elsewhere / in the future, this invocation exists, and hence ...".
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @Flakebi I think what Ralf is trying to say is that your definition, with just the two points above, is too strong to be soundly expressed in Rust. As you point out, you can derive a lot of things, including Ralf's extension to the docs. We also know that with today's LLVM, we could never break his extension to the docs. But explaining how the alignment within a workgroup is affected by multiple calls is hard (impossible) to do without using time-travelling, which is ~prohibited in Rust. |
||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// The pointer is safe to dereference from the start (the returned pointer) up to the | ||
| /// size of workgroup memory that was specified when launching the current gpu-kernel. | ||
| /// This allocated size is not related in any way to `T`. | ||
| /// | ||
| /// The user must take care of synchronizing access to workgroup memory between | ||
| /// threads in a workgroup. The usual data race requirements apply. | ||
| /// | ||
| /// # Other APIs | ||
| /// | ||
| /// CUDA and HIP call this dynamic shared memory, shared between threads in a block. | ||
| /// OpenCL and SYCL call this local memory, shared between threads in a work-group. | ||
| /// GLSL calls this shared memory, shared between invocations in a work group. | ||
| /// DirectX calls this groupshared memory, shared between threads in a thread-group. | ||
| #[must_use = "returns a pointer that does nothing unless used"] | ||
| #[rustc_intrinsic] | ||
| #[rustc_nounwind] | ||
| #[unstable(feature = "gpu_launch_sized_workgroup_mem", issue = "135513")] | ||
| #[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))] | ||
| pub fn gpu_launch_sized_workgroup_mem<T>() -> *mut T; | ||
|
|
||
| /// Returns a pointer to the HSA kernel dispatch packet. | ||
| /// | ||
| /// A `gpu-kernel` on amdgpu is always launched through a kernel dispatch packet. | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. At first I kind of wanted there to be another test that does the cross-crate version of this. Then I remembered what was discussed elsewhere: that the targets in question are pure LLVM bitcode that gets mashed together anyways, so I am not sure it would actually benefit us, and it would probably involve a ton of tedium with run-make, having considered it in more detail. So, meh. Basically only leaving this note here to remind myself that if this turns out to go awry in the future, I can update in the direction of following this kind of instinct more often. :^) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| // Checks that the GPU intrinsic to get launch-sized workgroup memory works | ||
| // and correctly aligns the `external addrspace(...) global`s over multiple calls. | ||
|
|
||
| //@ revisions: amdgpu nvptx-pre-llvm-23 nvptx-post-llvm-23 | ||
| //@ compile-flags: --crate-type=rlib -Copt-level=1 | ||
| // | ||
| //@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 | ||
| //@ [amdgpu] needs-llvm-components: amdgpu | ||
|
|
||
| //@ [nvptx-pre-llvm-23] compile-flags: --target nvptx64-nvidia-cuda | ||
| //@ [nvptx-pre-llvm-23] needs-llvm-components: nvptx | ||
| //@ [nvptx-pre-llvm-23] max-llvm-major-version: 22 | ||
| //@ [nvptx-post-llvm-23] compile-flags: --target nvptx64-nvidia-cuda | ||
| //@ [nvptx-post-llvm-23] needs-llvm-components: nvptx | ||
| //@ [nvptx-post-llvm-23] min-llvm-version: 23 | ||
| //@ add-minicore | ||
| #![feature(intrinsics, no_core, rustc_attrs)] | ||
| #![no_core] | ||
|
|
||
| extern crate minicore; | ||
|
|
||
| #[rustc_intrinsic] | ||
| #[rustc_nounwind] | ||
| fn gpu_launch_sized_workgroup_mem<T>() -> *mut T; | ||
|
|
||
| // amdgpu-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4 | ||
| // amdgpu-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8 | ||
| // amdgpu: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) } | ||
|
|
||
| // nvptx-pre-llvm-23: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8 | ||
| // nvptx-pre-llvm-23: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) } | ||
|
|
||
| // nvptx-post-llvm-23-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4 | ||
| // nvptx-post-llvm-23-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8 | ||
| // nvptx-post-llvm-23: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) } | ||
| #[unsafe(no_mangle)] | ||
| pub fn fun() -> (*mut i32, *mut f64) { | ||
| let small = gpu_launch_sized_workgroup_mem::<i32>(); | ||
| let big = gpu_launch_sized_workgroup_mem::<f64>(); // Increase alignment to 8 | ||
| (small, big) | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.