Skip to content

Commit b454716

Browse files
committed
Avoid alloca for fully static sizes
1 parent 6efa357 commit b454716

5 files changed

Lines changed: 63 additions & 26 deletions

File tree

compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@ use std::ffi::CString;
33
use bitflags::Flags;
44
use llvm::Linkage::*;
55
use rustc_abi::Align;
6+
use rustc_codegen_ssa::MemFlags;
67
use rustc_codegen_ssa::common::TypeKind;
78
use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
89
use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
910
use rustc_middle::bug;
10-
use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata};
11+
use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata, OffloadSize};
1112

1213
use crate::builder::Builder;
1314
use crate::common::CodegenCx;
@@ -450,7 +451,15 @@ pub(crate) fn gen_define_handling<'ll>(
450451
// FIXME(offload): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
451452
let transfer_kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer_to.len()];
452453

453-
let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &sizes);
454+
let actual_sizes = sizes
455+
.iter()
456+
.map(|s| match s {
457+
OffloadSize::Static(sz) => *sz,
458+
OffloadSize::Dynamic => 0,
459+
})
460+
.collect::<Vec<_>>();
461+
let offload_sizes =
462+
add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &actual_sizes);
454463
let memtransfer_begin =
455464
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.begin"), &transfer_to);
456465
let memtransfer_kernel =
@@ -499,9 +508,6 @@ pub(crate) fn gen_define_handling<'ll>(
499508
region_id,
500509
};
501510

502-
// FIXME(Sa4dUs): use this global for constant offload sizes
503-
cx.add_compiler_used_global(result.offload_sizes);
504-
505511
cx.offload_kernel_cache.borrow_mut().insert(symbol, result);
506512

507513
result
@@ -535,6 +541,15 @@ pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 {
535541
}
536542
}
537543

544+
fn get_runtime_size<'ll, 'tcx>(
545+
_cx: &CodegenCx<'ll, 'tcx>,
546+
_val: &'ll Value,
547+
_meta: &OffloadMetadata,
548+
) -> &'ll Value {
549+
// FIXME(Sa4dUs): handle dynamic-size data (e.g. slices)
550+
bug!("offload does not support dynamic sizes yet");
551+
}
552+
538553
// For each kernel *call*, we now use some of our previously declared globals to move data to and from
539554
// the gpu. For now, we only handle the data transfer part of it.
540555
// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
@@ -564,15 +579,17 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
564579
) {
565580
let cx = builder.cx;
566581
let OffloadKernelGlobals {
582+
offload_sizes,
567583
memtransfer_begin,
568584
memtransfer_kernel,
569585
memtransfer_end,
570586
region_id,
571-
..
572587
} = offload_data;
573588
let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
574589
offload_dims;
575590

591+
let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic));
592+
576593
let tgt_decl = offload_globals.launcher_fn;
577594
let tgt_target_kernel_ty = offload_globals.launcher_ty;
578595

@@ -596,7 +613,24 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
596613
let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
597614
// These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
598615
let ty2 = cx.type_array(cx.type_i64(), num_args);
599-
let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
616+
617+
let a4 = if has_dynamic {
618+
let alloc = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
619+
620+
builder.memcpy(
621+
alloc,
622+
Align::EIGHT,
623+
offload_sizes,
624+
Align::EIGHT,
625+
cx.get_const_i64(8 * args.len() as u64),
626+
MemFlags::empty(),
627+
None,
628+
);
629+
630+
alloc
631+
} else {
632+
offload_sizes
633+
};
600634

601635
//%kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
602636
let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");
@@ -648,9 +682,12 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
648682
builder.store(vals[i as usize], gep1, Align::EIGHT);
649683
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
650684
builder.store(geps[i as usize], gep2, Align::EIGHT);
651-
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
652-
// FIXME(offload): write an offload frontend and handle arbitrary types.
653-
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
685+
686+
if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic) {
687+
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
688+
let size_val = get_runtime_size(cx, args[i as usize], &metadata[i as usize]);
689+
builder.store(size_val, gep3, Align::EIGHT);
690+
}
654691
}
655692

656693
// For now we have a very simplistic indexing scheme into our

compiler/rustc_middle/src/ty/offload_meta.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,16 @@ use bitflags::bitflags;
33
use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
44

55
pub struct OffloadMetadata {
6-
pub payload_size: u64,
6+
pub payload_size: OffloadSize,
77
pub mode: MappingFlags,
88
}
99

10+
#[derive(Debug, Copy, Clone)]
11+
pub enum OffloadSize {
12+
Dynamic,
13+
Static(u64),
14+
}
15+
1016
bitflags! {
1117
/// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
1218
#[derive(Debug, Copy, Clone)]
@@ -59,17 +65,18 @@ impl OffloadMetadata {
5965
}
6066

6167
// FIXME(Sa4dUs): implement solid logic for determining the payload size
62-
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
68+
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize {
6369
match ty.kind() {
6470
ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
65-
_ => tcx
66-
.layout_of(PseudoCanonicalInput {
71+
_ => OffloadSize::Static(
72+
tcx.layout_of(PseudoCanonicalInput {
6773
typing_env: TypingEnv::fully_monomorphized(),
6874
value: ty,
6975
})
7076
.unwrap()
7177
.size
7278
.bytes(),
79+
),
7380
}
7481
}
7582

tests/codegen-llvm/gpu_offload/control_flow.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,13 @@
1414
// CHECK-NOT: define
1515
// CHECK: %.offload_baseptrs = alloca [1 x ptr], align 8
1616
// CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
17-
// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
1817
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
1918
// CHECK: br label %bb3
2019
// CHECK-NOT: define
2120
// CHECK: bb3
22-
// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo.begin, ptr null, ptr null)
21+
// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.foo, ptr nonnull @.offload_maptypes.foo.begin, ptr null, ptr null)
2322
// CHECK: %10 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.foo.region_id, ptr nonnull %kernel_args)
24-
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo.end, ptr null, ptr null)
23+
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.foo, ptr nonnull @.offload_maptypes.foo.end, ptr null, ptr null)
2524
#[unsafe(no_mangle)]
2625
unsafe fn main() {
2726
let A = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0];

tests/codegen-llvm/gpu_offload/gpu_host.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,18 +58,14 @@ pub fn _kernel_1(x: &mut [f32; 256], y: &[f32; 256]) {
5858
// CHECK-NEXT: %x = alloca [1024 x i8], align 16
5959
// CHECK-NEXT: %.offload_baseptrs = alloca [2 x ptr], align 8
6060
// CHECK-NEXT: %.offload_ptrs = alloca [2 x ptr], align 8
61-
// CHECK-NEXT: %.offload_sizes = alloca [2 x i64], align 8
6261
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
6362
// CHECK: store ptr %x, ptr %.offload_baseptrs, align 8
6463
// CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
65-
// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
6664
// CHECK-NEXT: [[BPTRS_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_baseptrs, i64 8
6765
// CHECK-NEXT: store ptr %y, ptr [[BPTRS_1]], align 8
6866
// CHECK-NEXT: [[PTRS_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8
6967
// CHECK-NEXT: store ptr %y, ptr [[PTRS_1]], align 8
70-
// CHECK-NEXT: [[SIZES_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8
71-
// CHECK-NEXT: store i64 1024, ptr [[SIZES_1]], align 8
72-
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
68+
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
7369
// CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
7470
// CHECK-NEXT: [[P4:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
7571
// CHECK-NEXT: store i32 2, ptr [[P4]], align 4
@@ -78,7 +74,7 @@ pub fn _kernel_1(x: &mut [f32; 256], y: &[f32; 256]) {
7874
// CHECK-NEXT: [[P16:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
7975
// CHECK-NEXT: store ptr %.offload_ptrs, ptr [[P16]], align 8
8076
// CHECK-NEXT: [[P24:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
81-
// CHECK-NEXT: store ptr %.offload_sizes, ptr [[P24]], align 8
77+
// CHECK-NEXT: store ptr @.offload_sizes.[[K]], ptr [[P24]], align 8
8278
// CHECK-NEXT: [[P32:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
8379
// CHECK-NEXT: store ptr @.offload_maptypes.[[K]].kernel, ptr [[P32]], align 8
8480
// CHECK-NEXT: [[P40:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
@@ -92,7 +88,7 @@ pub fn _kernel_1(x: &mut [f32; 256], y: &[f32; 256]) {
9288
// CHECK-NEXT: [[P96:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
9389
// CHECK-NEXT: store i32 0, ptr [[P96]], align 8
9490
// CHECK-NEXT: [[TGT_RET:%.*]] = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.[[K]].region_id, ptr nonnull %kernel_args)
95-
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)
91+
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)
9692
// CHECK: ret void
9793
// CHECK-NEXT: }
9894

tests/codegen-llvm/gpu_offload/scalar_host.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020
// CHECK-NEXT: store double %_0.i, ptr %1, align 8
2121
// CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8
2222
// CHECK-NEXT: store ptr %addr, ptr %2, align 8
23-
// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8
24-
// CHECK-NEXT: store i64 4, ptr %3, align 8
2523
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper
2624

2725
#[unsafe(no_mangle)]

0 commit comments

Comments
 (0)