@@ -3,7 +3,8 @@ use std::ffi::c_uint;
 use std::ptr;
 
 use rustc_abi::{
-    Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size, WrappingRange,
+    AddressSpace, Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size,
+    WrappingRange,
 };
 use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh};
 use rustc_codegen_ssa::codegen_attrs::autodiff_attrs;
@@ -24,7 +25,7 @@ use rustc_session::config::CrateType;
 use rustc_span::{Span, Symbol, sym};
 use rustc_symbol_mangling::{mangle_internal_symbol, symbol_name_for_instance_in_crate};
 use rustc_target::callconv::PassMode;
-use rustc_target::spec::Os;
+use rustc_target::spec::{Arch, Os};
 use tracing::debug;
 
 use crate::abi::FnAbiLlvmExt;
@@ -554,6 +555,44 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 return Ok(());
             }
 
+            sym::gpu_launch_sized_workgroup_mem => {
+                // The name of the global variable is not relevant, the important properties are:
+                // 1. The global is in the address space for workgroup memory
+                // 2. It is an extern global
+                // All instances of extern addrspace(gpu_workgroup) globals are merged in the LLVM backend.
+                // Generate an unnamed global per intrinsic call, so that different kernels can have
+                // different minimum alignments.
+                // See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared
+                // FIXME Workaround an nvptx backend issue that extern globals must have a name
+                let name = if tcx.sess.target.arch == Arch::Nvptx64 {
+                    "gpu_launch_sized_workgroup_mem"
+                } else {
+                    ""
+                };
+                let global = self.declare_global_in_addrspace(
+                    name,
+                    self.type_array(self.type_i8(), 0),
+                    AddressSpace::GPU_WORKGROUP,
+                );
+                let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() };
+                // The alignment of the global is used to specify the *minimum* alignment that
+                // must be obeyed by the GPU runtime.
+                // When multiple of these global variables are used by a kernel, the maximum alignment is taken.
+                // See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821
+                let alignment = self.align_of(*inner_ty).bytes() as u32;
+                unsafe {
+                    // FIXME Workaround the above issue by taking maximum alignment if the global existed
+                    if tcx.sess.target.arch == Arch::Nvptx64 {
+                        if alignment > llvm::LLVMGetAlignment(global) {
+                            llvm::LLVMSetAlignment(global, alignment);
+                        }
+                    } else {
+                        llvm::LLVMSetAlignment(global, alignment);
+                    }
+                }
+                self.cx().const_pointercast(global, self.type_ptr())
+            }
+
             sym::amdgpu_dispatch_ptr => {
                 let val = self.call_intrinsic("llvm.amdgcn.dispatch.ptr", &[], &[]);
                 // Relying on `LLVMBuildPointerCast` to produce an addrspacecast
0 commit comments