Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions compiler/rustc_abi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1753,6 +1753,9 @@ pub struct AddressSpace(pub u32);
impl AddressSpace {
/// LLVM's `0` address space (the default address space).
pub const ZERO: Self = AddressSpace(0);
/// The address space for workgroup memory on nvptx and amdgpu.
/// Both targets use LLVM address space `3` for this region.
/// See e.g. the `gpu_launch_sized_workgroup_mem` intrinsic for details.
pub const GPU_WORKGROUP: Self = AddressSpace(3);
}

/// How many scalable vectors are in a `BackendRepr::ScalableVector`?
Expand Down
23 changes: 23 additions & 0 deletions compiler/rustc_codegen_llvm/src/declare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
use std::borrow::Borrow;

use itertools::Itertools;
use rustc_abi::AddressSpace;
use rustc_codegen_ssa::traits::{MiscCodegenMethods, TypeMembershipCodegenMethods};
use rustc_data_structures::fx::FxIndexSet;
use rustc_middle::ty::{Instance, Ty};
Expand Down Expand Up @@ -104,6 +105,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
)
}
}

/// Declare a global value in a specific address space.
///
/// If there is already a value declared with the same name, this returns
/// that existing value instead of creating a new one.
pub(crate) fn declare_global_in_addrspace(
    &self,
    name: &str,
    ty: &'ll Type,
    addr_space: AddressSpace,
) -> &'ll Value {
    // Fixed: the log message previously said `declare_global`, which made
    // traces indistinguishable from the plain `declare_global` method.
    debug!("declare_global_in_addrspace(name={name:?}, addrspace={addr_space:?})");
    unsafe {
        llvm::LLVMRustGetOrInsertGlobalInAddrspace(
            (**self).borrow().llmod,
            name.as_c_char_ptr(),
            name.len(),
            ty,
            addr_space.0,
        )
    }
}
}

impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
Expand Down
49 changes: 45 additions & 4 deletions compiler/rustc_codegen_llvm/src/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ use std::ffi::c_uint;
use std::{assert_matches, iter, ptr};

use rustc_abi::{
Align, BackendRepr, Float, HasDataLayout, Integer, NumScalableVectors, Primitive, Size,
WrappingRange,
AddressSpace, Align, BackendRepr, Float, HasDataLayout, Integer, NumScalableVectors, Primitive,
Size, WrappingRange,
};
use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh};
use rustc_codegen_ssa::common::{IntPredicate, TypeKind};
Expand Down Expand Up @@ -176,6 +176,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
span: Span,
) -> Result<(), ty::Instance<'tcx>> {
let tcx = self.tcx;
let llvm_version = crate::llvm_util::get_version();

let name = tcx.item_name(instance.def_id());
let fn_args = instance.args;
Expand All @@ -192,7 +193,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
| sym::maximum_number_nsz_f64
| sym::maximum_number_nsz_f128
// Need at least LLVM 22 for `min/maximumnum` to not crash LLVM.
if crate::llvm_util::get_version() >= (22, 0, 0) =>
if llvm_version >= (22, 0, 0) =>
{
let intrinsic_name = if name.as_str().starts_with("min") {
"llvm.minimumnum"
Expand Down Expand Up @@ -418,7 +419,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
}

// FIXME move into the branch below when LLVM 22 is the lowest version we support.
sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => {
sym::carryless_mul if llvm_version >= (22, 0, 0) => {
let ty = args[0].layout.ty;
if !ty.is_integral() {
tcx.dcx().emit_err(InvalidMonomorphization::BasicIntegerType {
Expand Down Expand Up @@ -618,6 +619,46 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
return Ok(());
}

sym::gpu_launch_sized_workgroup_mem => {
// Generate an anonymous global per call, with these properties:
// 1. The global is in the address space for workgroup memory
// 2. It is an `external` global
// 3. It is correctly aligned for the pointee `T`
// All instances of extern addrspace(gpu_workgroup) globals are merged in the LLVM backend.
// The name is irrelevant.
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared
let name = if llvm_version < (23, 0, 0) && tcx.sess.target.arch == Arch::Nvptx64 {
// The auto-assigned name for extern shared globals in the nvptx backend does
// not compile in ptxas. Work around this issue by assigning a name.
// Fixed in LLVM 23.
"gpu_launch_sized_workgroup_mem"
} else {
""
};
let global = self.declare_global_in_addrspace(
name,
self.type_array(self.type_i8(), 0),
AddressSpace::GPU_WORKGROUP,
);
let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() };
// The alignment of the global is used to specify the *minimum* alignment that
// must be obeyed by the GPU runtime.
// When multiple of these global variables are used by a kernel, the maximum alignment is taken.
// See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821
let alignment = self.align_of(*inner_ty).bytes() as u32;
unsafe {
// FIXME Work around the above issue by taking the maximum alignment if the global already existed
if tcx.sess.target.arch == Arch::Nvptx64 {
if alignment > llvm::LLVMGetAlignment(global) {
llvm::LLVMSetAlignment(global, alignment);
}
} else {
llvm::LLVMSetAlignment(global, alignment);
}
}
Comment thread
Flakebi marked this conversation as resolved.
self.cx().const_pointercast(global, self.type_ptr())
}

sym::amdgpu_dispatch_ptr => {
let val = self.call_intrinsic("llvm.amdgcn.dispatch.ptr", &[], &[]);
// Relying on `LLVMBuildPointerCast` to produce an addrspacecast
Expand Down
7 changes: 7 additions & 0 deletions compiler/rustc_codegen_llvm/src/llvm/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,13 @@ unsafe extern "C" {
NameLen: size_t,
T: &'a Type,
) -> &'a Value;
pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>(
M: &'a Module,
Name: *const c_char,
NameLen: size_t,
T: &'a Type,
AddressSpace: c_uint,
) -> &'a Value;
pub(crate) fn LLVMRustGetNamedValue(
M: &Module,
Name: *const c_char,
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
sym::abort
| sym::unreachable
| sym::cold_path
| sym::gpu_launch_sized_workgroup_mem
| sym::breakpoint
| sym::amdgpu_dispatch_ptr
| sym::assert_zero_valid
Expand Down
2 changes: 2 additions & 0 deletions compiler/rustc_hir_analysis/src/check/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
| sym::forget
| sym::frem_algebraic
| sym::fsub_algebraic
| sym::gpu_launch_sized_workgroup_mem
| sym::is_val_statically_known
| sym::log2f16
| sym::log2f32
Expand Down Expand Up @@ -297,6 +298,7 @@ pub(crate) fn check_intrinsic_type(
sym::field_offset => (1, 0, vec![], tcx.types.usize),
sym::rustc_peek => (1, 0, vec![param(0)], param(0)),
sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()),
sym::gpu_launch_sized_workgroup_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))),
sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => {
(1, 0, vec![], tcx.types.unit)
}
Expand Down
26 changes: 21 additions & 5 deletions compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,12 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M,
.getCallee());
}

extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
const char *Name,
size_t NameLen,
LLVMTypeRef Ty) {
// Get the global variable with the given name if it exists or create a new
// external global.
extern "C" LLVMValueRef
LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name,
Comment on lines +304 to +305
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this function will always create a specifically-extern global, can we document that here? The name is whatever, to me.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I added a comment for that (it already did that before my change, I only added the addrspace argument)

size_t NameLen, LLVMTypeRef Ty,
unsigned int AddressSpace) {
Module *Mod = unwrap(M);
auto NameRef = StringRef(Name, NameLen);

Expand All @@ -313,10 +315,24 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true);
if (!GV)
GV = new GlobalVariable(*Mod, unwrap(Ty), false,
GlobalValue::ExternalLinkage, nullptr, NameRef);
GlobalValue::ExternalLinkage, nullptr, NameRef,
nullptr, GlobalValue::NotThreadLocal, AddressSpace);
return wrap(GV);
}

// Look up the named global variable, creating a new external global in the
// data layout's default globals address space if it does not exist yet.
extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
                                                  const char *Name,
                                                  size_t NameLen,
                                                  LLVMTypeRef Ty) {
  const unsigned DefaultAS =
      unwrap(M)->getDataLayout().getDefaultGlobalsAddressSpace();
  return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty, DefaultAS);
}

// Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`.
enum class LLVMRustAttributeKind {
AlwaysInline = 0,
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_span/src/symbol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,7 @@ symbols! {
global_asm,
global_registration,
globs,
gpu_launch_sized_workgroup_mem,
gt,
guard,
guard_patterns,
Expand Down
40 changes: 40 additions & 0 deletions library/core/src/intrinsics/gpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,46 @@

#![unstable(feature = "gpu_intrinsics", issue = "none")]

/// Returns the pointer to workgroup memory allocated at launch-time on GPUs.
///
/// Workgroup memory is a memory region that is shared between all threads in
/// the same workgroup. It is faster to access than other memory but pointers do not
/// work outside the workgroup where they were obtained.
/// Workgroup memory can be allocated statically or after compilation, when
/// launching a gpu-kernel. `gpu_launch_sized_workgroup_mem` returns the pointer to
/// the memory that is allocated at launch-time.
/// The size of this memory can differ between launches of a gpu-kernel, depending on
/// what is specified at launch-time.
/// However, the alignment is fixed by the kernel itself, at compile-time.
///
/// The returned pointer is the start of the workgroup memory region that is
/// allocated at launch-time.
/// All calls to `gpu_launch_sized_workgroup_mem` in a workgroup, independent of the
/// generic type, return the same address, so alias the same memory.
/// The returned pointer is aligned by at least the alignment of `T`.
Copy link
Copy Markdown
Member

@ZuseZ4 ZuseZ4 Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@RalfJung I don't think there is anything in these docs that currently supports this assumption, so I wasn't particularly concerned about:
Hmm. I guess I'm worrying about someone calling it with ::<u8>, casting the pointer, and then assuming that a call elsewhere (perhaps in a library they are depending on?) will enforce the alignment they want, but that call might get subject to DCE or other "non-compilation events".

The docs only tie the alignment of "[this]" returned pointer to "[this]" T, and Rust also isn't really known for spooky actions at a distance that would support other interpretations. But if both you and Jubilee are concerned, we can also be more explicit. Do you prefer this (please feel free to suggest better wording)?

/// The returned pointer is aligned by at least the alignment of `T`.
/// No stronger alignment guarantee is provided.
/// In particular, callers may not rely on one invocation of
/// `gpu_launch_sized_workgroup_mem` to affect the alignment of a pointer
/// returned by another invocation.

View changes since the review

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docs only tie the alignment of "[this]" returned pointer to "[this]" T, and Rust also isn't really known for spooky actions at a distance that would support other interpretations.

This intrinsic adds spooky action at a distance, that's why I am so concerned. ;) All invocations of the intrinsic return the same pointer, so they magically affect each other in terms of alignment.

callers may not rely on one invocation of
/// `gpu_launch_sized_workgroup_mem` to affect the alignment of a pointer
/// returned by another invocation.

This is somewhat contradicting the statement that they all return the same address.

I'd propose something like:
If gpu_launch_sized_workgroup_mem is invoked multiple times with different types that have different alignment, then you may only rely on the resulting pointer having the alignment of T after a call to gpu_launch_sized_workgroup_mem::<T> has occurred in the current program execution.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The two properties guaranteed by the intrinsic are

  1. the returned pointer has at least the alignment of T and
  2. all invocations within a workgroup return the same pointer.

That allows a bunch of implications, but I don’t think they are important. The core goal is, you want to use launch-sized workgroup mem, you call the intrinsic with your needed alignment, you use the pointer. That’s it, nothing else needed, no other derived guarantees used.
In other words, I do not see a use-case for (ab)using this action at a distance.

If gpu_launch_sized_workgroup_mem is invoked multiple times with different types that have different alignment, then you may only rely on the resulting pointer having the alignment of T after a call to gpu_launch_sized_workgroup_mem::<T> has occurred in the current program execution.

The two properties allow inferring even wider guarantees. If gpu_launch_sized_workgroup_mem is invoked with a certain alignment, in any execution within the same workgroup, every other call to gpu_launch_sized_workgroup_mem in that workgroup at any time before or after is guaranteed to receive at least this alignment.
The calls that “observe” the action at a distance do not need to be in the same thread of execution, nor do they need to be after the “observed” call.

The two core guarantees are written down in the docs. If there is no use-case for such inferred guarantees (I cannot think of any), I fear that writing down inferred guarantees in the docs adds more confusion than it helps.
(If I read something like this in the docs, it would leave me wondering if there is an intended use-case for this and if I am supposed to hold it differently.)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In other words, I do not see a use-case for (ab)using this action at a distance.

That's great. But other people will read these docs, notice the implications, and if it even remotely fits their usecase they will (ab)use everything they can find. If there are implications of our spec, or things that seem like implications, that we don't actually intend to be used or guaranteed, then we can't just hope that people will not use them. We have to make it explicit, or someone will use them.

Copy link
Copy Markdown
Member

@RalfJung RalfJung Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If gpu_launch_sized_workgroup_mem is invoked with a certain alignment, in any execution within the same workgroup, every other call to gpu_launch_sized_workgroup_mem in that workgroup at any time before or after is guaranteed to receive at least this alignment.

This spec is extremely problematic. We do not allow time travel in Rust; time travel usually leads to semantic contradictions. That's why I insist on a clarification like what I described: we do absolutely not want code that might be executed in the future to affect the reasoning I am allowed to do here and now.

If there truly is no usecase for such "time travel" use of the intrinsic, then my proposed clarification should be uncontroversial.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I get that. For me, what I wrote follows logically from the two properties (alignment + all invocations return the same pointer).
We cannot do anything to prevent that (any implementation that would not satisfy the wide guarantee can never be correct).

Let me try to explain with an example (pseudo-code):

fn main() {
  let p = gpu_launch_sized_workgroup_mem::<u32>();
  // As I understand it, you say we should declare that this assert can fail
  assert!(p is aligned to at least 8 byte);

  let p2 = gpu_launch_sized_workgroup_mem::<u64>();
  // This is guaranteed to be true
  assert!(p2 is aligned to at least 8 byte);
  assert_eq!(p, p2);
}

Given that p2 must be aligned to at least 8 byte and p2 == p, I can’t imagine any implementation where the first assert is allowed to fail.
Or, declaring that it can fail would contradict the other guarantees we give.

Am I missing something here?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all invocations

The problem is defining what exactly "all invocations" are. Only the invocations that are actually executed in this run of the program matter. And since it's impossible to tell whether the program will actually reach an invocation further down the code, I think we want to be very sure to exclude any reasoning "elsewhere / in the future, this invocation exists, and hence ...".

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Flakebi I think what Ralf is trying to say, is that your definition, with just the two points above, is too strong to be soundly expressed in Rust. As you point out, you can derive a lot of things, including Ralf's extension to the docs. We also know that with today's LLVM, we could never break his extension to the docs.

But explaining how the alignment within a workgroup is affected by multiple calls is hard (impossible) to do without using time-travelling, which is ~prohibited in Rust.
Then again, time-travelling is totally fine in code that is UB (afaik). So, by just making it UB to argue over the alignment based on later calls, we are now allowed to use our time-travelling implementation.
Implementation-wise, nothing would change on our side. We just prohibit Rust devs who peeked into LLVM to use that internal knowledge.

///
/// # Safety
///
/// The pointer is safe to dereference from the start (the returned pointer) up to the
/// size of workgroup memory that was specified when launching the current gpu-kernel.
/// This allocated size is not related in any way to `T`.
///
/// The user must take care of synchronizing access to workgroup memory between
/// threads in a workgroup. The usual data race requirements apply.
///
/// # Other APIs
///
/// CUDA and HIP call this dynamic shared memory, shared between threads in a block.
/// OpenCL and SYCL call this local memory, shared between threads in a work-group.
/// GLSL calls this shared memory, shared between invocations in a work group.
/// DirectX calls this groupshared memory, shared between threads in a thread-group.
#[must_use = "returns a pointer that does nothing unless used"]
#[rustc_intrinsic]
#[rustc_nounwind]
#[unstable(feature = "gpu_launch_sized_workgroup_mem", issue = "135513")]
#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))]
pub fn gpu_launch_sized_workgroup_mem<T>() -> *mut T;

/// Returns a pointer to the HSA kernel dispatch packet.
///
/// A `gpu-kernel` on amdgpu is always launched through a kernel dispatch packet.
Expand Down
4 changes: 4 additions & 0 deletions src/tools/tidy/src/style.rs
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ fn should_ignore(line: &str) -> bool {
|| static_regex!(
"\\s*//@ \\!?(count|files|has|has-dir|hasraw|matches|matchesraw|snapshot)\\s.*"
).is_match(line)
// Matching for FileCheck checks
|| static_regex!(
"\\s*// [a-zA-Z0-9-_]*:\\s.*"
).is_match(line)
}

/// Returns `true` if `line` is allowed to be longer than the normal limit.
Expand Down
41 changes: 41 additions & 0 deletions tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At first I kind of wanted there to be another test that does the cross-crate version of this. Then I remembered what was discussed elsewhere: that the targets in question are pure LLVM bitcode that gets mashed together anyways, so I am not sure it would actually benefit us, and it would probably involve a ton of tedium with run-make, having considered it in more detail. So, meh.

Basically only leaving this note here to remind myself that if this turns out to go awry in the future, I can update in the direction of following this kind of instinct more often. :^)

Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Checks that the GPU intrinsic to get launch-sized workgroup memory works
// and correctly aligns the `external addrspace(...) global`s over multiple calls.

//@ revisions: amdgpu nvptx-pre-llvm-23 nvptx-post-llvm-23
//@ compile-flags: --crate-type=rlib -Copt-level=1
//
//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900
//@ [amdgpu] needs-llvm-components: amdgpu

//@ [nvptx-pre-llvm-23] compile-flags: --target nvptx64-nvidia-cuda
//@ [nvptx-pre-llvm-23] needs-llvm-components: nvptx
//@ [nvptx-pre-llvm-23] max-llvm-major-version: 22
//@ [nvptx-post-llvm-23] compile-flags: --target nvptx64-nvidia-cuda
//@ [nvptx-post-llvm-23] needs-llvm-components: nvptx
//@ [nvptx-post-llvm-23] min-llvm-version: 23
//@ add-minicore
#![feature(intrinsics, no_core, rustc_attrs)]
#![no_core]

extern crate minicore;

#[rustc_intrinsic]
#[rustc_nounwind]
fn gpu_launch_sized_workgroup_mem<T>() -> *mut T;

// amdgpu-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4
// amdgpu-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8
// amdgpu: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) }

// nvptx-pre-llvm-23: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8
// nvptx-pre-llvm-23: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) }

// nvptx-post-llvm-23-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4
// nvptx-post-llvm-23-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8
// nvptx-post-llvm-23: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) }
#[unsafe(no_mangle)]
pub fn fun() -> (*mut i32, *mut f64) {
    // Both calls return a pointer to the same launch-sized workgroup memory;
    // the f64 call raises the required minimum alignment of the shared
    // extern addrspace(3) global from 4 to 8.
    let small = gpu_launch_sized_workgroup_mem::<i32>();
    let big = gpu_launch_sized_workgroup_mem::<f64>(); // Increase alignment to 8
    (small, big)
}
Loading