diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index bd15513f199d71..7b6a1176694219 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -125,6 +125,7 @@ class CodeGen final : public CodeGenInterface //------------------------------------------------------------------------- + bool genLocallocUsed; // true if we have used localloc in the method bool genUseBlockInit; // true if we plan to block-initialize the local stack frame unsigned genInitStkLclCnt; // The count of local variables that we need to zero init diff --git a/src/coreclr/jit/codegenarm.cpp b/src/coreclr/jit/codegenarm.cpp index 635107af30ac26..603960e349ef61 100644 --- a/src/coreclr/jit/codegenarm.cpp +++ b/src/coreclr/jit/codegenarm.cpp @@ -391,6 +391,8 @@ void CodeGen::genLclHeap(GenTree* tree) GenTree* size = tree->AsOp()->gtOp1; noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL)); + bool const initMem = m_compiler->gtMustZeroLocalloc(tree); + // Result of localloc will be returned in regCnt. // Also it used as temporary register in code generation // for storing allocation size @@ -476,7 +478,7 @@ void CodeGen::genLclHeap(GenTree* tree) goto ALLOC_DONE; } - else if (!m_compiler->info.compInitMem && (amount < m_compiler->eeGetPageSize())) // must be < not <= + else if (!initMem && (amount < m_compiler->eeGetPageSize())) // must be < not <= { // Since the size is less than a page, simply adjust the SP value. // The SP might already be in the guard page, must touch it BEFORE @@ -500,7 +502,7 @@ void CodeGen::genLclHeap(GenTree* tree) } // Allocation - if (m_compiler->info.compInitMem) + if (initMem) { // At this point 'regCnt' is set to the total number of bytes to localloc. // Since we have to zero out the allocated memory AND ensure that the stack pointer is always valid diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 47ecfbea7dc7de..ad6983689f3ea3 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -3081,7 +3081,7 @@ void CodeGen::genLclHeap(GenTree* tree) noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes noway_assert(genStackLevel == 0); // Can't have anything on the stack - bool needsZeroing = m_compiler->info.compInitMem; + bool initMem = m_compiler->gtMustZeroLocalloc(tree); // compute the amount of memory to allocate to properly STACK_ALIGN. size_t amount = 0; @@ -3089,7 +3089,7 @@ void CodeGen::genLclHeap(GenTree* tree) { // The size node being a contained constant means that Lower has taken care of // zeroing the memory if compInitMem is true. - needsZeroing = false; + initMem = false; // If amount is zero then return null in targetReg amount = size->AsIntCon()->gtIconVal; @@ -3113,7 +3113,7 @@ void CodeGen::genLclHeap(GenTree* tree) // Compute the size of the block to allocate and perform alignment. // If compInitMem=true, we can reuse targetReg as regcnt, // since we don't need any internal registers. - if (needsZeroing) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -3200,10 +3200,10 @@ void CodeGen::genLclHeap(GenTree* tree) } // else, "mov regCnt, amount" - // If compInitMem=true, we can reuse targetReg as regcnt. + // If initMem=true, we can reuse targetReg as regcnt. // Since size is a constant, regCnt is not yet initialized. 
assert(regCnt == REG_NA); - if (needsZeroing) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -3215,7 +3215,7 @@ void CodeGen::genLclHeap(GenTree* tree) instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount); } - if (needsZeroing) + if (initMem) { BasicBlock* loop = genCreateTempLabel(); diff --git a/src/coreclr/jit/codegenloongarch64.cpp b/src/coreclr/jit/codegenloongarch64.cpp index 26a80d31204e52..ca9c8b1de64e16 100644 --- a/src/coreclr/jit/codegenloongarch64.cpp +++ b/src/coreclr/jit/codegenloongarch64.cpp @@ -1461,6 +1461,8 @@ void CodeGen::genLclHeap(GenTree* tree) noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes noway_assert(genStackLevel == 0); // Can't have anything on the stack + bool const initMem = m_compiler->gtMustZeroLocalloc(tree); + // compute the amount of memory to allocate to properly STACK_ALIGN. size_t amount = 0; if (size->IsCnsIntOrI()) @@ -1487,9 +1489,9 @@ void CodeGen::genLclHeap(GenTree* tree) emit->emitIns_J_cond_la(INS_beq, endLabel, targetReg, REG_R0); // Compute the size of the block to allocate and perform alignment. - // If compInitMem=true, we can reuse targetReg as regcnt, + // If initMem=true, we can reuse targetReg as regcnt, // since we don't need any internal registers. - if (m_compiler->info.compInitMem) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -1541,7 +1543,7 @@ void CodeGen::genLclHeap(GenTree* tree) static_assert(STACK_ALIGN == (REGSIZE_BYTES * 2)); assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time size_t stpCount = amount / (REGSIZE_BYTES * 2); - if (m_compiler->info.compInitMem) + if (initMem) { if (stpCount <= 4) { @@ -1588,10 +1590,10 @@ void CodeGen::genLclHeap(GenTree* tree) } // else, "mov regCnt, amount" - // If compInitMem=true, we can reuse targetReg as regcnt. + // If initMem=true, we can reuse targetReg as regcnt. // Since size is a constant, regCnt is not yet initialized. assert(regCnt == REG_NA); - if (m_compiler->info.compInitMem) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -1603,7 +1605,7 @@ void CodeGen::genLclHeap(GenTree* tree) instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount); } - if (m_compiler->info.compInitMem) + if (initMem) { // At this point 'regCnt' is set to the total number of bytes to locAlloc. // Since we have to zero out the allocated memory AND ensure that the stack pointer is always valid diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp index 540d92a906023c..b72b2b15a51bf3 100644 --- a/src/coreclr/jit/codegenriscv64.cpp +++ b/src/coreclr/jit/codegenriscv64.cpp @@ -1451,6 +1451,7 @@ void CodeGen::genLclHeap(GenTree* tree) noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes noway_assert(genStackLevel == 0); // Can't have anything on the stack + bool const initMem = m_compiler->gtMustZeroLocalloc(tree); const target_size_t pageSize = m_compiler->eeGetPageSize(); // According to RISC-V Privileged ISA page size is 4KiB @@ -1482,9 +1483,9 @@ void CodeGen::genLclHeap(GenTree* tree) emit->emitIns_J_cond_la(INS_beq, endLabel, targetReg, REG_R0); // Compute the size of the block to allocate and perform alignment. 
- // If compInitMem=true, we can reuse targetReg as regcnt, + // If initMem=true, we can reuse targetReg as regcnt, // since we don't need any internal registers. - if (m_compiler->info.compInitMem) + if (initMem) { regCnt = targetReg; } @@ -1536,7 +1537,7 @@ void CodeGen::genLclHeap(GenTree* tree) static_assert(STACK_ALIGN == (REGSIZE_BYTES * 2)); assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time size_t stpCount = amount / (REGSIZE_BYTES * 2); - if (m_compiler->info.compInitMem) + if (initMem) { if (stpCount <= 4) { @@ -1585,10 +1586,10 @@ void CodeGen::genLclHeap(GenTree* tree) } // else, "mov regCnt, amount" - // If compInitMem=true, we can reuse targetReg as regcnt. + // If initMem=true, we can reuse targetReg as regcnt. // Since size is a constant, regCnt is not yet initialized. assert(regCnt == REG_NA); - if (m_compiler->info.compInitMem) + if (initMem) { regCnt = targetReg; } @@ -1599,7 +1600,7 @@ void CodeGen::genLclHeap(GenTree* tree) instGen_Set_Reg_To_Imm(((unsigned int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount); } - if (m_compiler->info.compInitMem) + if (initMem) { // At this point 'regCnt' is set to the total number of bytes to locAlloc. // Since we have to zero out the allocated memory AND ensure that the stack pointer is always valid diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp index 0d89746cf97da6..e1f6a813e426d4 100644 --- a/src/coreclr/jit/codegenwasm.cpp +++ b/src/coreclr/jit/codegenwasm.cpp @@ -2884,7 +2884,7 @@ void CodeGen::genLclHeap(GenTree* tree) assert(m_compiler->compLocallocUsed); assert(isFramePointerUsed()); - bool const needsZeroing = m_compiler->info.compInitMem; + bool const needsZeroing = m_compiler->gtMustZeroLocalloc(tree); GenTree* const size = tree->AsOp()->gtOp1; // We reserve this amount of space below any allocation for diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index b8d5ffb8f9452c..1e5771621e7cba 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -2745,6 +2745,7 @@ void CodeGen::genLclHeap(GenTree* tree) { assert(tree->OperIs(GT_LCLHEAP)); assert(m_compiler->compLocallocUsed); + genLocallocUsed = true; GenTree* size = tree->AsOp()->gtOp1; noway_assert((genActualType(size->gtType) == TYP_INT) || (genActualType(size->gtType) == TYP_I_IMPL)); @@ -2766,6 +2767,8 @@ void CodeGen::genLclHeap(GenTree* tree) target_size_t stackAdjustment = 0; target_size_t locAllocStackOffset = 0; + bool const initMem = m_compiler->gtMustZeroLocalloc(tree); + // compute the amount of memory to allocate to properly STACK_ALIGN. size_t amount = 0; if (size->IsCnsIntOrI() && size->isContained()) @@ -2789,7 +2792,7 @@ void CodeGen::genLclHeap(GenTree* tree) // Compute the size of the block to allocate and perform alignment. // If compInitMem=true, we can reuse targetReg as regcnt, // since we don't need any internal registers. - if (m_compiler->info.compInitMem) + if (initMem) { assert(internalRegisters.Count(tree) == 0); regCnt = targetReg; @@ -2814,7 +2817,7 @@ void CodeGen::genLclHeap(GenTree* tree) inst_RV_IV(INS_add, regCnt, STACK_ALIGN - 1, emitActualTypeSize(type)); - if (m_compiler->info.compInitMem) + if (initMem) { // Convert the count from a count of bytes to a loop count. 
We will loop once per // stack alignment size, so each loop will zero 4 bytes on Windows/x86, and 16 bytes @@ -2835,7 +2838,7 @@ void CodeGen::genLclHeap(GenTree* tree) } bool initMemOrLargeAlloc; // Declaration must be separate from initialization to avoid clang compiler error. - initMemOrLargeAlloc = m_compiler->info.compInitMem || (amount >= m_compiler->eeGetPageSize()); // must be >= not > + initMemOrLargeAlloc = initMem || (amount >= m_compiler->eeGetPageSize()); // must be >= not > #if FEATURE_FIXED_OUT_ARGS // If we have an outgoing arg area then we must adjust the SP by popping off the @@ -2909,7 +2912,7 @@ void CodeGen::genLclHeap(GenTree* tree) // We should not have any temp registers at this point. assert(internalRegisters.Count(tree) == 0); - if (m_compiler->info.compInitMem) + if (initMem) { // At this point 'regCnt' is set to the number of loop iterations for this loop, if each // iteration zeros (and subtracts from the stack pointer) STACK_ALIGN bytes. diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 9568aceac47f88..024621ac21ef33 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3889,6 +3889,15 @@ class Compiler bool gtTreeHasLocalRead(GenTree* tree, unsigned lclNum); bool gtTreeHasLocalStore(GenTree* tree, unsigned lclNum); + // Returns true iff the LCLHEAP node "tree" must zero-initialize its + // allocation, either because the method requests init-mem semantics or + // because the node carries the GTF_LCLHEAP_MUSTINIT flag. + bool gtMustZeroLocalloc(GenTree* tree) + { + assert(tree->OperIs(GT_LCLHEAP)); + return info.compInitMem || ((tree->gtFlags & GTF_LCLHEAP_MUSTINIT) != 0); + } + void gtSetStmtInfo(Statement* stmt); // Returns "true" iff "node" has any of the side effects in "flags". 
@@ -6489,7 +6498,10 @@ class Compiler
 bool fgExpandStaticInitForCall(BasicBlock** pBlock, Statement* stmt, GenTreeCall* call);
 PhaseStatus fgExpandStackArrayAllocations();
- bool fgExpandStackArrayAllocation(BasicBlock* pBlock, Statement* stmt, GenTreeCall* call);
+ bool fgExpandStackArrayAllocation(BasicBlock* pBlock,
+ Statement* stmt,
+ GenTreeCall* call,
+ unsigned& frameRunningTotalLclNum);
 PhaseStatus fgVNBasedIntrinsicExpansion();
 bool fgVNBasedIntrinsicExpansionForCall(BasicBlock** pBlock, Statement* stmt, GenTreeCall* call);
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index d91d1961bc2ba4..8000a3d795f301 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -14532,6 +14532,8 @@ const char* Compiler::gtGetWellKnownArgNameForArgMsg(WellKnownArg arg)
 return "tail call";
 case WellKnownArg::StackArrayLocal:
 return "&lcl arr";
+ case WellKnownArg::StackArrayElemSize:
+ return "arr elemsz";
 case WellKnownArg::RuntimeMethodHandle:
 return "meth hnd";
 case WellKnownArg::AsyncExecutionContext:
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index f0ad4bda78899c..97e7db15e44dcb 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -538,6 +538,8 @@ enum GenTreeFlags : unsigned
 GTF_ALLOCOBJ_EMPTY_STATIC = 0x80000000, // GT_ALLOCOBJ -- allocation site is part of an empty static pattern
+ GTF_LCLHEAP_MUSTINIT = 0x80000000, // GT_LCLHEAP -- allocation must be zeroed
+
#ifdef FEATURE_HW_INTRINSICS
 GTF_HW_EM_OP = 0x10000000, // GT_HWINTRINSIC -- node is used as an operand to an embedded mask
 GTF_HW_USER_CALL = 0x20000000, // GT_HWINTRINSIC -- node is implemented via a user call
@@ -4775,6 +4777,7 @@ enum class WellKnownArg : unsigned
 SwiftSelf,
 X86TailCallSpecialArg,
 StackArrayLocal,
+ StackArrayElemSize,
 RuntimeMethodHandle,
 AsyncExecutionContext,
 AsyncSynchronizationContext,
diff --git a/src/coreclr/jit/helperexpansion.cpp b/src/coreclr/jit/helperexpansion.cpp
index f917dc1d441d2..ded490dc4cc846 100644
--- a/src/coreclr/jit/helperexpansion.cpp
+++ b/src/coreclr/jit/helperexpansion.cpp
@@ -2801,6 +2801,12 @@ PhaseStatus Compiler::fgExpandStackArrayAllocations()
 //
 bool modified = false;
+ // Lazily-allocated TYP_I_IMPL local that accumulates the per-invocation
+ // total bytes of conditional (localloc) stack allocations. Initialized
+ // on first use by fgExpandStackArrayAllocation.
+ //
+ unsigned frameRunningTotalLclNum = BAD_VAR_NUM;
+
 for (BasicBlock* const block : Blocks())
 {
 for (Statement* const stmt : block->Statements())
@@ -2817,7 +2823,7 @@ PhaseStatus Compiler::fgExpandStackArrayAllocations()
 continue;
 }
- if (fgExpandStackArrayAllocation(block, stmt, tree->AsCall()))
+ if (fgExpandStackArrayAllocation(block, stmt, tree->AsCall(), frameRunningTotalLclNum))
 {
 // If we expand, we split the statement's tree
 // so we will be done with this statement.
@@ -2846,7 +2852,18 @@
 // Returns:
 // true if the allocation was expanded.
 //
-bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt, GenTreeCall* call)
+// Remarks:
+// For arrays whose size was large or not known during stack allocation analysis,
+// the allocation expands into a runtime size check followed by a localloc (if
+// small) or a heap allocation (if big).
+//
+// For known-size arrays, we assume upstream analysis has limited the size to
+// something reasonable, and the allocation goes into fixed local storage.
+//
+bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block,
+ Statement* stmt,
+ GenTreeCall* call,
+ unsigned& frameRunningTotalLclNum)
 {
 if (!call->IsHelperCall())
 {
 return false;
 }
@@ -2871,17 +2888,44 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt,
 return false;
 }
- // If this is a local array, the new helper will have an arg for the array's address
+ // If this is a local array, the new helper will have an arg for the array's address, or an arg
+ // for the array element size
 //
 CallArg* const stackLocalAddressArg = call->gtArgs.FindWellKnownArg(WellKnownArg::StackArrayLocal);
+ CallArg* const elemSizeArg = call->gtArgs.FindWellKnownArg(WellKnownArg::StackArrayElemSize);
- if (stackLocalAddressArg == nullptr)
+ if ((stackLocalAddressArg == nullptr) && (elemSizeArg == nullptr))
 {
 return false;
 }
- JITDUMP("Expanding new array helper for stack allocated array at [%06d] in " FMT_BB ":\n", dspTreeID(call),
- block->bbNum);
+ // If we have an elem size arg, this is intended to be a localloc/heapalloc dispatch
+ //
+ // Note we may have figured out the array length after we did the
+ // escape analysis (that is, lengthArg might be a constant), so we
+ // could possibly change this from a localloc to a fixed alloc,
+ // if we could show that was sound.
+ //
+ bool const isLocAlloc = (elemSizeArg != nullptr);
+ bool const isAlign8 = isLocAlloc && (helper == CORINFO_HELP_NEWARR_1_ALIGN8);
+
+ // The localloc/heapalloc dispatch path needs to store the heap-fallback
+ // call result into the same local that consumes the original call's
+ // result. If the result is unused (e.g. DCE removed the consumer),
+ // skip the expansion and let later phases drop the dead call.
+ //
+ if (isLocAlloc)
+ {
+ GenTree* const stmtRoot = stmt->GetRootNode();
+ if (!(stmtRoot->OperIs(GT_STORE_LCL_VAR) && (stmtRoot->AsLclVarCommon()->Data() == call)))
+ {
+ JITDUMP("Skipping localloc dispatch for [%06d]: call result is unused\n", dspTreeID(call));
+ return false;
+ }
+ }
+
+ JITDUMP("Expanding new array helper for stack allocated array at [%06d] %sin " FMT_BB ":\n", dspTreeID(call),
+ isLocAlloc ? "into localloc " : "", block->bbNum);
 DISPTREE(call);
 JITDUMP("\n");
@@ -2898,24 +2942,321 @@ bool Compiler::fgExpandStackArrayAllocation(BasicBlock* block, Statement* stmt,
 }
 }
- GenTree* const stackLocalAddress = stackLocalAddressArg->GetNode();
+ GenTree* lengthArg = call->gtArgs.GetArgByIndex(lengthArgIndex)->GetNode();
+ GenTree* stackLocalAddress = nullptr;
+
+ // Temps holding the once-evaluated length and method-table args for the
+ // localloc path. Used by both the dispatch path and the header init,
+ // so declared at function scope.
+ //
+ unsigned lengthTemp = BAD_VAR_NUM;
+ unsigned typeTemp = BAD_VAR_NUM;
+
+ // If we have a localloc, compute (at runtime) the overall size, and check the length
+ // against a threshold. If over, heap allocate.
+ //
+ if (isLocAlloc)
+ {
+ assert(elemSizeArg != nullptr);
+ assert(stackLocalAddressArg == nullptr);
+ GenTree* const elemSize = elemSizeArg->GetNode();
+ assert(elemSize->IsCnsIntOrI());
+
+ // Spill the length and method-table args to fresh temps so all
+ // downstream consumers (size compute, runtime check, header
+ // init, heap-fallback call) reference a temp use instead of
+ // cloning the original (possibly side-effecting / non-clonable)
+ // expressions. Replace the call's arg slots with a temp use so
+ // the original expressions live in exactly one place.
+ //
+ {
+ GenTree*& lengthArgRef = call->gtArgs.GetArgByIndex(lengthArgIndex)->NodeRef();
+ GenTree* const origLength = lengthArgRef;
+ lengthTemp = lvaGrabTemp(true DEBUGARG("stack array length"));
+ lvaTable[lengthTemp].lvType = genActualType(origLength);
+
+ GenTree* const lengthSpill = gtNewStoreLclVarNode(lengthTemp, origLength);
+ Statement* const lengthSpillStmt = fgNewStmtFromTree(lengthSpill);
+ gtUpdateStmtSideEffects(lengthSpillStmt);
+ fgInsertStmtBefore(block, stmt, lengthSpillStmt);
+
+ lengthArgRef = gtNewLclVarNode(lengthTemp);
+ lengthArg = lengthArgRef;
+ }
+ {
+ GenTree*& typeArgRef = call->gtArgs.GetArgByIndex(typeArgIndex)->NodeRef();
+ GenTree* const origType = typeArgRef;
+ typeTemp = lvaGrabTemp(true DEBUGARG("stack array method table"));
+ lvaTable[typeTemp].lvType = genActualType(origType);
+
+ GenTree* const typeSpill = gtNewStoreLclVarNode(typeTemp, origType);
+ Statement* const typeSpillStmt = fgNewStmtFromTree(typeSpill);
+ gtUpdateStmtSideEffects(typeSpillStmt);
+ fgInsertStmtBefore(block, stmt, typeSpillStmt);
+
+ typeArgRef = gtNewLclVarNode(typeTemp);
+ }
+
+ unsigned const locallocTemp = lvaGrabTemp(true DEBUGARG("localloc stack address"));
+ lvaTable[locallocTemp].lvType = TYP_I_IMPL;
+
+ GenTree* const arrayLength = gtNewLclVarNode(lengthTemp);
+ GenTree* const baseSize = gtNewIconNode(OFFSETOF__CORINFO_Array__data, TYP_I_IMPL);
+ GenTree* const payloadSize = gtNewOperNode(GT_MUL, TYP_I_IMPL, elemSize, arrayLength);
+ GenTree* totalSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, baseSize, payloadSize);
+
+ unsigned const elemSizeValue = (unsigned)elemSize->AsIntCon()->IconValue();
+
+ if ((elemSizeValue % TARGET_POINTER_SIZE) != 0)
+ {
+ // Round size up to TARGET_POINTER_SIZE.
+ // size = (size + TPS - 1) & ~(TPS - 1)
+ //
+ GenTree* const roundSize = gtNewIconNode(TARGET_POINTER_SIZE - 1, TYP_I_IMPL);
+ GenTree* const biasedSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, totalSize, roundSize);
+ GenTree* const mask = gtNewIconNode(TARGET_POINTER_SIZE - 1, TYP_I_IMPL);
+ GenTree* const invMask = gtNewOperNode(GT_NOT, TYP_I_IMPL, mask);
+ GenTree* const paddedSize = gtNewOperNode(GT_AND, TYP_I_IMPL, biasedSize, invMask);
+
+ totalSize = paddedSize;
+ }
+
+#ifndef TARGET_64BIT
+ if (isAlign8)
+ {
+ // For Align8, allocate an extra TARGET_POINTER_SIZE (4) bytes so
+ // we can fix alignment below.
+ //
+ GenTree* const alignSize = gtNewIconNode(4, TYP_I_IMPL);
+ totalSize = gtNewOperNode(GT_ADD, TYP_I_IMPL, totalSize, alignSize);
+ }
+#endif
+
+ // We will need total size twice, so spill it to a local
+ //
+ unsigned const totalSizeTemp = lvaGrabTemp(false DEBUGARG("lcl/heap alloc size"));
+ lvaTable[totalSizeTemp].lvType = TYP_I_IMPL;
+ GenTree* const totalSizeStore = gtNewStoreLclVarNode(totalSizeTemp, totalSize);
+
+ Statement* const totalSizeStmt = fgNewStmtFromTree(totalSizeStore);
+ gtUpdateStmtSideEffects(totalSizeStmt);
+ fgInsertStmtBefore(block, stmt, totalSizeStmt);
+
+ // Check the length against a JIT-time-precomputed safe upper bound,
+ // using an unsigned compare so that negative lengths (which a signed
+ // compare would treat as "small") are routed to the heap-fallback helper. The
+ // helper validates length and raises OverflowException for negatives
+ // or when (length * elemSize) overflows.
+ //
+ // maxSafeLength is the largest length for which:
+ // base + payload (+ optional align8 pad) <= stackLimit
+ // and for which no intermediate I_IMPL multiply/add can wrap.
+ //
+ size_t const stackLimit = (size_t)(unsigned)JitConfig.JitObjectStackAllocationSize();
+ size_t const baseBytes = (size_t)OFFSETOF__CORINFO_Array__data;
+#ifndef TARGET_64BIT
+ size_t const align8Pad = isAlign8 ? 4 : 0;
+#else
+ size_t const align8Pad = 0;
+#endif
+ size_t maxSafeLength = 0;
+ if (stackLimit > baseBytes + align8Pad)
+ {
+ assert(elemSizeValue > 0);
+ maxSafeLength = (stackLimit - baseBytes - align8Pad) / elemSizeValue;
+ // The pointer-size round-up below can add up to (TPS - 1) bytes;
+ // trim one element to absorb that slack.
+ if (((elemSizeValue % TARGET_POINTER_SIZE) != 0) && (maxSafeLength > 0))
+ {
+ maxSafeLength--;
+ }
+ }
+
+ GenTree* const lengthForCheck = gtNewLclVarNode(lengthTemp);
+ var_types const lengthType = genActualType(lengthForCheck);
+ GenTree* const lengthLimit = gtNewIconNode((ssize_t)maxSafeLength, lengthType);
+ GenTree* const lengthCompare = gtNewOperNode(GT_GT, TYP_INT, lengthForCheck, lengthLimit);
+ lengthCompare->gtFlags |= GTF_UNSIGNED;
+
+ // Lazily allocate the per-frame running-total local, and insert an
+ // explicit zero-init store at the top of fgFirstBB. This init is
+ // independent of compInitMem and the prolog zero-init policy.
+ //
+ if (frameRunningTotalLclNum == BAD_VAR_NUM)
+ {
+ frameRunningTotalLclNum = lvaGrabTemp(false DEBUGARG("stack alloc frame running total"));
+ lvaTable[frameRunningTotalLclNum].lvType = TYP_I_IMPL;
+
+ GenTree* const zeroInit = gtNewStoreLclVarNode(frameRunningTotalLclNum, gtNewIconNode(0, TYP_I_IMPL));
+ Statement* const zeroInitStmt = fgNewStmtFromTree(zeroInit);
+ fgInsertStmtAtBeg(fgFirstBB, zeroInitStmt);
+
+ JITDUMP("Created stack alloc frame running total V%02u, zero-init at " FMT_BB "\n", frameRunningTotalLclNum,
+ fgFirstBB->bbNum);
+ }
+
+ // Build the second check: running + totalSize > frameLimit (unsigned).
+ // Note: when the length check fails the totalSize value computed in
+ // the temp is irrelevant; OR'ing the two compares preserves correct
+ // dispatch (the length check forces heap regardless of the second).
+ //
+ size_t const frameLimit = (size_t)(unsigned)JitConfig.JitObjectStackAllocationFrameSize();
+ GenTree* const runningForCheck = gtNewLclVarNode(frameRunningTotalLclNum);
+ GenTree* const totalSizeForSum = gtNewLclVarNode(totalSizeTemp);
+ GenTree* const newRunningTotal = gtNewOperNode(GT_ADD, TYP_I_IMPL, runningForCheck, totalSizeForSum);
+ GenTree* const frameLimitNode = gtNewIconNode((ssize_t)frameLimit, TYP_I_IMPL);
+ GenTree* const frameCompare = gtNewOperNode(GT_GT, TYP_INT, newRunningTotal, frameLimitNode);
+ frameCompare->gtFlags |= GTF_UNSIGNED;
+
+ // Combine the two compares. JTRUE requires a relop child, so wrap
+ // the OR with NE 0.
+ //
+ GenTree* const combinedOr = gtNewOperNode(GT_OR, TYP_INT, lengthCompare, frameCompare);
+ GenTree* const combinedCond = gtNewOperNode(GT_NE, TYP_INT, combinedOr, gtNewIconNode(0, TYP_INT));
+ GenTree* const runtimeSizeCheck = gtNewOperNode(GT_JTRUE, TYP_VOID, combinedCond);
+
+ Statement* const runtimeSizeCheckStmt = fgNewStmtFromTree(runtimeSizeCheck);
+ gtUpdateStmtSideEffects(runtimeSizeCheckStmt);
+ fgInsertStmtBefore(block, stmt, runtimeSizeCheckStmt);
+
+ // Split block after the call, and insert blocks for the localloc and the heap alloc
+ //
+ BasicBlock* const remainderBlock = fgSplitBlockAfterStatement(block, stmt);
+ BasicBlock* const locallocBlock = fgNewBBafter(BBJ_ALWAYS, block, /* extendRegion */ true);
+ BasicBlock* const heapallocBlock = fgNewBBafter(BBJ_ALWAYS, locallocBlock, /* extendRegion */ true);
+
+ // Wire up the new flow. Assume (for now) that localloc is more likely.
+ //
+ FlowEdge* const blockRemainderEdge = fgGetPredForBlock(remainderBlock, block);
+ fgRemoveRefPred(blockRemainderEdge);
+
+ FlowEdge* const locallocInEdge = fgAddRefPred(locallocBlock, block);
+ FlowEdge* const locallocOutEdge = fgAddRefPred(remainderBlock, locallocBlock);
+
+ locallocInEdge->setLikelihood(0.8);
+ locallocBlock->inheritWeightPercentage(block, 80);
+ locallocOutEdge->setLikelihood(1.0);
+ locallocBlock->SetTargetEdge(locallocOutEdge);
+
+ FlowEdge* const heapallocInEdge = fgAddRefPred(heapallocBlock, block);
+ FlowEdge* const heapallocOutEdge = fgAddRefPred(remainderBlock, heapallocBlock);
+
+ heapallocInEdge->setLikelihood(0.2);
+ heapallocBlock->inheritWeightPercentage(block, 20);
+ heapallocOutEdge->setLikelihood(1.0);
+ heapallocBlock->SetTargetEdge(heapallocOutEdge);
+
+ block->SetCond(heapallocInEdge, locallocInEdge);
+
+ // Now fill in the heapalloc block.
+ //
+ // Create a helper call just like call, but without the extra arguments
+ //
+ GenTreeCall* newCall = gtNewCallNode(CT_HELPER, call->gtCallMethHnd, call->TypeGet());
+
+ newCall->gtArgs.PushBack(this, NewCallArg::Primitive(gtNewLclVarNode(typeTemp)));
+ newCall->gtArgs.PushBack(this, NewCallArg::Primitive(gtNewLclVarNode(lengthTemp)));
+ newCall->gtFlags = call->gtFlags;
+#if defined(FEATURE_READYTORUN)
+ newCall->setEntryPoint(call->gtEntryPoint);
+#endif // FEATURE_READYTORUN
+ newCall = fgMorphArgs(newCall);
+
+ // We expect *callUse's user to be a local store.
+ //
+ assert((*callUse)->gtNext->OperIs(GT_STORE_LCL_VAR));
+ unsigned const useLclNum = (*callUse)->gtNext->AsLclVarCommon()->GetLclNum();
+ GenTree* const heapAllocStore = gtNewStoreLclVarNode(useLclNum, newCall);
+ Statement* const heapAllocStmt = fgNewStmtFromTree(heapAllocStore);
+
+ gtUpdateStmtSideEffects(heapAllocStmt);
+ fgInsertStmtAtBeg(heapallocBlock, heapAllocStmt);
+
+ // Fill in the first part of the localloc block
+ //
+ fgUnlinkStmt(block, stmt);
+ fgInsertStmtAtBeg(locallocBlock, stmt);
+
+ GenTree* const totalSizeForAlloc = gtNewLclVarNode(totalSizeTemp);
+ GenTree* const locallocNode = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, totalSizeForAlloc);
+
+ // Allocation might fail. Codegen must zero the allocation.
+ //
+ locallocNode->gtFlags |= (GTF_EXCEPT | GTF_LCLHEAP_MUSTINIT);
+
+ GenTree* const locallocStore = gtNewStoreLclVarNode(locallocTemp, locallocNode);
+ Statement* const locallocStmt = fgNewStmtFromTree(locallocStore);
+
+ gtUpdateStmtSideEffects(locallocStmt);
+ fgInsertStmtBefore(locallocBlock, stmt, locallocStmt);
+
+ // Update the per-frame running total.
Only the localloc path + // consumes frame space, so do it here and not on the heap path. + // + GenTree* const runningOld = gtNewLclVarNode(frameRunningTotalLclNum); + GenTree* const totalSizeAdd = gtNewLclVarNode(totalSizeTemp); + GenTree* const runningSum = gtNewOperNode(GT_ADD, TYP_I_IMPL, runningOld, totalSizeAdd); + GenTree* const runningStore = gtNewStoreLclVarNode(frameRunningTotalLclNum, runningSum); + Statement* const runningStmt = fgNewStmtFromTree(runningStore); + gtUpdateStmtSideEffects(runningStmt); + fgInsertStmtBefore(locallocBlock, locallocStmt, runningStmt); + + // Array address is the result of the localloc + // + stackLocalAddress = gtNewLclVarNode(locallocTemp); + compLocallocUsed = true; + +#ifndef TARGET_64BIT + if (isAlign8) + { + // For Align8, adjust address to be suitably aligned. + // Addr = (Localloc + 4) & ~7; + // + GenTree* const alignSize = gtNewIconNode(4, TYP_I_IMPL); + GenTree* const biasedAddress = gtNewOperNode(GT_ADD, TYP_I_IMPL, stackLocalAddress, alignSize); + GenTree* const alignMaskInv = gtNewIconNode(-8, TYP_I_IMPL); + GenTree* const alignedAddress = gtNewOperNode(GT_AND, TYP_I_IMPL, biasedAddress, alignMaskInv); + + stackLocalAddress = alignedAddress; + } +#endif + + // We now require a frame pointer + // + codeGen->setFramePointerRequired(true); + + // Update block so code below finishes initializing the localloc array + // in the localloc block. + // + block = locallocBlock; + } + else + { + assert(elemSizeArg == nullptr); + assert(stackLocalAddressArg != nullptr); + + // Array address is the block local we created earlier + // + stackLocalAddress = stackLocalAddressArg->GetNode(); + } // Initialize the array method table pointer. // - GenTree* const mt = call->gtArgs.GetArgByIndex(typeArgIndex)->GetNode(); - GenTree* const mtStore = gtNewStoreValueNode(TYP_I_IMPL, stackLocalAddress, mt); - Statement* const mtStmt = fgNewStmtFromTree(mtStore); + GenTree* const mt = call->gtArgs.GetArgByIndex(typeArgIndex)->GetNode(); + GenTree* const mtToStore = isLocAlloc ? gtNewLclVarNode(typeTemp) : mt; + GenTree* const mtStore = gtNewStoreValueNode(TYP_I_IMPL, stackLocalAddress, mtToStore); + Statement* const mtStmt = fgNewStmtFromTree(mtStore); fgInsertStmtBefore(block, stmt, mtStmt); // Initialize the array length. // - GenTree* const lengthArg = call->gtArgs.GetArgByIndex(lengthArgIndex)->GetNode(); - GenTree* const lengthArgInt = fgOptimizeCast(gtNewCastNode(TYP_INT, lengthArg, false, TYP_INT)); - GenTree* const lengthAddress = gtNewOperNode(GT_ADD, TYP_I_IMPL, gtCloneExpr(stackLocalAddress), - gtNewIconNode(OFFSETOF__CORINFO_Array__length, TYP_I_IMPL)); - GenTree* const lengthStore = gtNewStoreValueNode(TYP_INT, lengthAddress, lengthArgInt); - Statement* const lenStmt = fgNewStmtFromTree(lengthStore); + GenTree* const arrayLengthToStore = isLocAlloc ? 
gtNewLclVarNode(lengthTemp) : lengthArg; + GenTree* const lengthArgInt = fgOptimizeCast(gtNewCastNode(TYP_INT, arrayLengthToStore, false, TYP_INT)); + GenTree* const lengthAddress = gtNewOperNode(GT_ADD, TYP_I_IMPL, gtCloneExpr(stackLocalAddress), + gtNewIconNode(OFFSETOF__CORINFO_Array__length, TYP_I_IMPL)); + GenTree* const lengthStore = gtNewStoreValueNode(TYP_INT, lengthAddress, lengthArgInt); + Statement* const lenStmt = fgNewStmtFromTree(lengthStore); fgInsertStmtBefore(block, stmt, lenStmt); diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index a87b20f8491c78..a6996c7d2549fd 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -702,6 +702,12 @@ RELEASE_CONFIG_INTEGER(JitObjectStackAllocationConditionalEscape, "JitObjectStac CONFIG_STRING(JitObjectStackAllocationConditionalEscapeRange, "JitObjectStackAllocationConditionalEscapeRange") RELEASE_CONFIG_INTEGER(JitObjectStackAllocationArray, "JitObjectStackAllocationArray", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationSize, "JitObjectStackAllocationSize", 528) +RELEASE_CONFIG_INTEGER(JitObjectStackAllocationLocalloc, "JitObjectStackAllocationLocalloc", 1) +// Maximum cumulative bytes of conditional (localloc) stack allocations per method invocation. +// Once the running total would exceed this, further conditional allocations fall back to heap. +// Default is 8x JitObjectStackAllocationSize. +RELEASE_CONFIG_INTEGER(JitObjectStackAllocationFrameSize, "JitObjectStackAllocationFrameSize", 8 * 528) +RELEASE_CONFIG_INTEGER(JitObjectStackAllocationInLoop, "JitObjectStackAllocationInLoop", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationTrackFields, "JitObjectStackAllocationTrackFields", 1) CONFIG_STRING(JitObjectStackAllocationTrackFieldsRange, "JitObjectStackAllocationTrackFieldsRange") CONFIG_INTEGER(JitObjectStackAllocationDumpConnGraph, "JitObjectStackAllocationDumpConnGraph", 0) diff --git a/src/coreclr/jit/jitmetadatalist.h b/src/coreclr/jit/jitmetadatalist.h index 215ea82ead5606..6c03726f968529 100644 --- a/src/coreclr/jit/jitmetadatalist.h +++ b/src/coreclr/jit/jitmetadatalist.h @@ -89,6 +89,7 @@ JITMETADATAMETRIC(NewBoxedValueClassHelperCalls, int, 0) JITMETADATAMETRIC(StackAllocatedBoxedValueClasses, int, 0) JITMETADATAMETRIC(NewArrayHelperCalls, int, 0) JITMETADATAMETRIC(StackAllocatedArrays, int, 0) +JITMETADATAMETRIC(LocallocAllocatedArrays, int, 0) JITMETADATAMETRIC(LocalAssertionCount, int, 0) JITMETADATAMETRIC(LocalAssertionOverflow, int, 0) JITMETADATAMETRIC(MorphTrackedLocals, int, 0) diff --git a/src/coreclr/jit/lsraarm.cpp b/src/coreclr/jit/lsraarm.cpp index 63f634d92c59f7..958c1a9ef7e3bf 100644 --- a/src/coreclr/jit/lsraarm.cpp +++ b/src/coreclr/jit/lsraarm.cpp @@ -68,7 +68,7 @@ int LinearScan::BuildLclHeap(GenTree* tree) { internalIntCount = 0; } - else if (!m_compiler->info.compInitMem) + else if (!m_compiler->gtMustZeroLocalloc(tree)) { // No need to initialize allocated stack space. 
if (sizeVal < m_compiler->eeGetPageSize()) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 0d74688745910d..309f214077de42 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1186,7 +1186,7 @@ int LinearScan::BuildNode(GenTree* tree) else { srcCount = 1; - if (!m_compiler->info.compInitMem) + if (!m_compiler->gtMustZeroLocalloc(tree)) { buildInternalIntRegisterDefForNode(tree); buildInternalIntRegisterDefForNode(tree); diff --git a/src/coreclr/jit/lsraloongarch64.cpp b/src/coreclr/jit/lsraloongarch64.cpp index 6d76450142a0b1..394a768e20d889 100644 --- a/src/coreclr/jit/lsraloongarch64.cpp +++ b/src/coreclr/jit/lsraloongarch64.cpp @@ -452,7 +452,7 @@ int LinearScan::BuildNode(GenTree* tree) { // Need no internal registers } - else if (!m_compiler->info.compInitMem) + else if (!m_compiler->gtMustZeroLocalloc(tree)) { // No need to initialize allocated stack space. if (sizeVal < m_compiler->eeGetPageSize()) @@ -471,7 +471,7 @@ int LinearScan::BuildNode(GenTree* tree) else { srcCount = 1; - if (!m_compiler->info.compInitMem) + if (!m_compiler->gtMustZeroLocalloc(tree)) { buildInternalIntRegisterDefForNode(tree); buildInternalIntRegisterDefForNode(tree); diff --git a/src/coreclr/jit/lsrariscv64.cpp b/src/coreclr/jit/lsrariscv64.cpp index 54d2b2f399f788..7b94b12d101ef7 100644 --- a/src/coreclr/jit/lsrariscv64.cpp +++ b/src/coreclr/jit/lsrariscv64.cpp @@ -619,7 +619,7 @@ int LinearScan::BuildNode(GenTree* tree) { // Need no internal registers } - else if (!m_compiler->info.compInitMem) + else if (!m_compiler->gtMustZeroLocalloc(tree)) { // No need to initialize allocated stack space. if (sizeVal < m_compiler->eeGetPageSize()) @@ -640,7 +640,7 @@ int LinearScan::BuildNode(GenTree* tree) else { srcCount = 1; - if (!m_compiler->info.compInitMem) + if (!m_compiler->gtMustZeroLocalloc(tree)) { buildInternalIntRegisterDefForNode(tree); buildInternalIntRegisterDefForNode(tree); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 02aee985e6ef71..c289c71c8319d8 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1859,14 +1859,15 @@ int LinearScan::BuildLclHeap(GenTree* tree) size_t sizeVal = AlignUp((size_t)size->AsIntCon()->gtIconVal, STACK_ALIGN); // Explicitly zeroed LCLHEAP also needs a regCnt in case of x86 or large page - if ((TARGET_POINTER_SIZE == 4) || (sizeVal >= m_compiler->eeGetPageSize())) + if ((TARGET_POINTER_SIZE == 4) || (sizeVal >= m_compiler->eeGetPageSize()) || + (tree->gtFlags & GTF_LCLHEAP_MUSTINIT)) { buildInternalIntRegisterDefForNode(tree); } } else { - if (!m_compiler->info.compInitMem) + if (!m_compiler->gtMustZeroLocalloc(tree)) { // For regCnt buildInternalIntRegisterDefForNode(tree); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 0a8073a84b3d23..e67ddf26973555 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -627,6 +627,8 @@ const char* getWellKnownArgName(WellKnownArg arg) return "X86TailCallSpecialArg"; case WellKnownArg::StackArrayLocal: return "StackArrayLocal"; + case WellKnownArg::StackArrayElemSize: + return "StackArrayElemSize"; case WellKnownArg::RuntimeMethodHandle: return "RuntimeMethodHandle"; case WellKnownArg::AsyncExecutionContext: diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 2fab92081bb9ce..6b99cdfa03e3b9 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -47,6 +47,8 @@ ObjectAllocator::ObjectAllocator(Compiler* 
comp)
 , m_ConnGraphAdjacencyMatrix(nullptr)
 , m_StackAllocMaxSize(0)
 , m_stackAllocationCount(0)
+ , m_UseLocalloc(false)
+ , m_UseLocallocInLoop(false)
 , m_EnumeratorLocalToPseudoIndexMap(comp->getAllocator(CMK_ObjectAllocator))
 , m_CloneMap(comp->getAllocator(CMK_ObjectAllocator))
 , m_nextLocalIndex(0)
@@ -64,6 +66,11 @@ ObjectAllocator::ObjectAllocator(Compiler* comp)
 m_ConnGraphAdjacencyMatrix = nullptr;
 m_StackAllocMaxSize = (unsigned)JitConfig.JitObjectStackAllocationSize();
 m_trackFields = JitConfig.JitObjectStackAllocationTrackFields() > 0;
+
+ // OSR does not support localloc (though it seems late-introduced localloc might be OK)
+ //
+ m_UseLocalloc = JitConfig.JitObjectStackAllocationLocalloc() && !comp->opts.IsOSR();
+ m_UseLocallocInLoop = m_UseLocalloc && JitConfig.JitObjectStackAllocationInLoop();
 }
//------------------------------------------------------------------------
@@ -1057,11 +1064,12 @@ void ObjectAllocator::ComputeStackObjectPointers(BitVecTraits* bitVecTraits)
// lclNum - Local variable number
// clsHnd - Class/struct handle of the variable class
// allocType - Type of allocation (newobj or newarr)
-// length - Length of the array (for newarr)
+// length - Length of the array (for newarr); 1 for runtime-determined size
// blockSize - [out, optional] exact size of the object
// reason - [out, required] if result is false, reason why
// preliminaryCheck - if true, allow checking before analysis is done
// (for things that inherently disqualify the local)
+// lengthKnown - true if length is known at compile time (default true)
//
// Return Value:
// Returns true iff local variable can be allocated on the stack.
@@ -1072,7 +1080,8 @@ bool ObjectAllocator::CanAllocateLclVarOnStack(unsigned int lclNum,
 ssize_t length,
 unsigned int* blockSize,
 const char** reason,
- bool preliminaryCheck)
+ bool preliminaryCheck,
+ bool lengthKnown)
 {
 assert(preliminaryCheck || m_AnalysisDone);
@@ -1117,6 +1126,15 @@ bool ObjectAllocator::CanAllocateLclVarOnStack(unsigned int lclNum,
 ClassLayout* const layout = m_compiler->typGetArrayLayout(clsHnd, (unsigned)length);
 classSize = layout->GetSize();
+
+ if (!lengthKnown && layout->HasGCPtr())
+ {
+ // We can't represent GC info for runtime-sized stack arrays yet.
+ // + assert(length == 1); + *reason = "[unknown length, gc elements]"; + return false; + } } else if (allocType == OAT_NEWOBJ) { @@ -1206,9 +1224,13 @@ ObjectAllocator::ObjectAllocationType ObjectAllocator::AllocationKind(GenTree* t case CORINFO_HELP_NEWARR_1_DIRECT: case CORINFO_HELP_NEWARR_1_ALIGN8: { - if ((call->gtArgs.CountUserArgs() == 2) && call->gtArgs.GetUserArgByIndex(1)->GetNode()->IsCnsIntOrI()) + if (call->gtArgs.CountUserArgs() == 2) { - allocType = OAT_NEWARR; + GenTree* const lenArg = call->gtArgs.GetUserArgByIndex(1)->GetNode(); + if (lenArg->IsCnsIntOrI() || m_UseLocalloc) + { + allocType = OAT_NEWARR; + } } break; } @@ -1241,9 +1263,8 @@ bool ObjectAllocator::MorphAllocObjNodes() for (BasicBlock* const block : m_compiler->Blocks()) { - const bool basicBlockHasNewObj = block->HasFlag(BBF_HAS_NEWOBJ); - const bool basicBlockHasNewArr = block->HasFlag(BBF_HAS_NEWARR); - const bool basicBlockHasBackwardJump = block->HasFlag(BBF_BACKWARD_JUMP); + const bool basicBlockHasNewObj = block->HasFlag(BBF_HAS_NEWOBJ); + const bool basicBlockHasNewArr = block->HasFlag(BBF_HAS_NEWARR); if (!basicBlockHasNewObj && !basicBlockHasNewArr) { @@ -1298,8 +1319,11 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) // We keep the set of possibly-stack-pointing pointers as a superset of the set of // definitely-stack-pointing pointers. All definitely-stack-pointing pointers are in both // sets. - MarkLclVarAsDefinitelyStackPointing(lclNum); MarkLclVarAsPossiblyStackPointing(lclNum); + if (candidate.m_definitelyStackPointing) + { + MarkLclVarAsDefinitelyStackPointing(lclNum); + } // If this was conditionally escaping enumerator, establish a connection between this local // and the enumeratorLocal we already allocated. This is needed because we do early rewriting @@ -1370,8 +1394,11 @@ bool ObjectAllocator::MorphAllocObjNodeHelper(AllocationCandidate& candidate) } // Don't attempt to do stack allocations inside basic blocks that may be in a loop. + // Exception: runtime-sized newarrs may go via localloc when m_UseLocallocInLoop is set; + // those are filtered later in MorphAllocObjNodeHelperArr. // - if (candidate.m_block->HasFlag(BBF_BACKWARD_JUMP)) + if (candidate.m_block->HasFlag(BBF_BACKWARD_JUMP) && + !((candidate.m_allocType == OAT_NEWARR) && m_UseLocallocInLoop)) { candidate.m_onHeapReason = "[alloc in loop]"; return false; @@ -1512,8 +1539,36 @@ bool ObjectAllocator::MorphAllocObjNodeHelperArr(AllocationCandidate& candidate) if (!len->IsCnsIntOrI()) { - candidate.m_onHeapReason = "[non-constant array size]"; - return false; + if (!m_UseLocalloc) + { + candidate.m_onHeapReason = "[non-constant array size]"; + return false; + } + + if (candidate.m_block->hasHndIndex()) + { + candidate.m_onHeapReason = "[non-constant array size, in handler]"; + return false; + } + + // Runtime-sized array: try to allocate via localloc. + // Pass length=1 with lengthKnown=false so layout-based checks (e.g. GC pointer guard) run. 
+ //
+ if (!CanAllocateLclVarOnStack(candidate.m_lclNum, clsHnd, candidate.m_allocType, /* length */ 1, &blockSize,
+ &candidate.m_onHeapReason, /* preliminaryCheck */ false,
+ /* lengthKnown */ false))
+ {
+ // reason set by the call
+ return false;
+ }
+
+ JITDUMP("Allocating V%02u on the stack [via localloc]\n", candidate.m_lclNum);
+ MorphNewArrNodeIntoLocAlloc(data->AsCall(), clsHnd, len, candidate.m_block, candidate.m_statement);
+ m_compiler->Metrics.LocallocAllocatedArrays++;
+ // helperexpansion may take the heap fallback at runtime, so the local is only
+ // possibly (not definitely) stack-pointing and must remain GC-reportable.
+ candidate.m_definitelyStackPointing = false;
+ return true;
 }
 if (!CanAllocateLclVarOnStack(candidate.m_lclNum, clsHnd, candidate.m_allocType, len->AsIntCon()->IconValue(),
@@ -1523,6 +1578,28 @@ bool ObjectAllocator::MorphAllocObjNodeHelperArr(AllocationCandidate& candidate)
 return false;
 }
+ // If a constant-sized newarr ended up here despite being in a loop, we must dispatch
+ // through localloc (the loop-check exemption only applies to OAT_NEWARR with m_UseLocallocInLoop).
+ //
+ if (candidate.m_block->HasFlag(BBF_BACKWARD_JUMP))
+ {
+ assert(m_UseLocallocInLoop);
+
+ if (candidate.m_block->hasHndIndex())
+ {
+ candidate.m_onHeapReason = "[alloc in loop, in handler]";
+ return false;
+ }
+
+ JITDUMP("Allocating V%02u on the stack [via localloc, in loop]\n", candidate.m_lclNum);
+ MorphNewArrNodeIntoLocAlloc(data->AsCall(), clsHnd, len, candidate.m_block, candidate.m_statement);
+ m_compiler->Metrics.LocallocAllocatedArrays++;
+ // helperexpansion may take the heap fallback at runtime, so the local is only
+ // possibly (not definitely) stack-pointing and must remain GC-reportable.
+ candidate.m_definitelyStackPointing = false;
+ return true;
+ }
+
 JITDUMP("Allocating V%02u on the stack\n", candidate.m_lclNum);
 const unsigned int stackLclNum =
 MorphNewArrNodeIntoStackAlloc(data->AsCall(), clsHnd, (unsigned int)len->AsIntCon()->IconValue(), blockSize,
@@ -1746,6 +1823,63 @@ unsigned int ObjectAllocator::MorphNewArrNodeIntoStackAlloc(GenTreeCall*
 return lclNum;
 }
+//------------------------------------------------------------------------
+// MorphNewArrNodeIntoLocAlloc: Morph a newarray helper call node into a local frame allocation.
+//
+// Arguments:
+// newArr - newarr helper call to morph in place
+// clsHnd - class representing the type of the array
+// length - operand for the length of the array
+// block - the basic block containing newArr
+// stmt - the statement containing newArr
+//
+void ObjectAllocator::MorphNewArrNodeIntoLocAlloc(
+ GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* length, BasicBlock* block, Statement* stmt)
+{
+ assert(newArr != nullptr);
+ assert(m_AnalysisDone);
+ assert(clsHnd != NO_CLASS_HANDLE);
+ assert(newArr->IsHelperCall());
+ assert(newArr->GetHelperNum() != CORINFO_HELP_NEWARR_1_MAYBEFROZEN);
+
+ // Get the element size
+ //
+ CORINFO_CLASS_HANDLE elemClsHnd = NO_CLASS_HANDLE;
+ CorInfoType corType = m_compiler->info.compCompHnd->getChildType(clsHnd, &elemClsHnd);
+ var_types type = JITtype2varType(corType);
+ ClassLayout* elemLayout = type == TYP_STRUCT ? m_compiler->typGetObjLayout(elemClsHnd) : nullptr;
+
+ const unsigned elemSize = elemLayout != nullptr ?
elemLayout->GetSize() : genTypeSize(type);
+
+ // Mark the newarr call as a stack-allocating call, and add the element
+ // size as an extra well-known argument
+ //
+ GenTree* const elemSizeNode = m_compiler->gtNewIconNode(elemSize, TYP_I_IMPL);
+ newArr->gtArgs.PushBack(m_compiler,
+ NewCallArg::Primitive(elemSizeNode).WellKnown(WellKnownArg::StackArrayElemSize));
+ newArr->gtCallMoreFlags |= GTF_CALL_M_STACK_ARRAY;
+
+ // Retype the call result as a byref (we may decide to heap allocate at runtime).
+ //
+ newArr->ChangeType(TYP_BYREF);
+ newArr->gtReturnType = TYP_BYREF;
+
+ // Note that we have stack-allocated arrays in this method
+ //
+ m_compiler->setMethodHasStackAllocatedArray();
+
+ // Notify the compiler; this disables fast tail calls (for now)
+ //
+ m_compiler->compLocallocUsed = true;
+
+#ifdef UNIX_AMD64_ABI
+ // Ensure we don't end up with misaligned frames,
+ // if we manage to dead-code this newarr.
+ //
+ m_compiler->opts.compNeedToAlignFrame = true;
+#endif
+}
+
 //------------------------------------------------------------------------
 // MorphAllocObjNodeIntoStackAlloc: Morph a GT_ALLOCOBJ node into stack
 // allocation.
@@ -2569,6 +2703,7 @@ void ObjectAllocator::UpdateAncestorTypes(
 }
 case GT_CALL:
+ // TODO: do we need to watch for helper calls that have retyped operands?
 break;
 default:
diff --git a/src/coreclr/jit/objectalloc.h b/src/coreclr/jit/objectalloc.h
index 4f8e6fc20c1748..cedc50d6ed2fbd 100644
--- a/src/coreclr/jit/objectalloc.h
+++ b/src/coreclr/jit/objectalloc.h
@@ -149,6 +149,7 @@ class ObjectAllocator final : public Phase
 , m_allocType(allocType)
 , m_onHeapReason(nullptr)
 , m_bashCall(false)
+ , m_definitelyStackPointing(true)
 {
 }
@@ -159,6 +160,12 @@ class ObjectAllocator final : public Phase
 ObjectAllocationType const m_allocType;
 const char* m_onHeapReason;
 bool m_bashCall;
+ // True if a successful stack-allocation of this candidate yields a local that
+ // definitely points at stack memory. False when the morph leaves a runtime
+ // heap fallback in place (e.g. the localloc/heapalloc split for runtime-sized
+ // arrays); in that case the local is only possibly stack-pointing and must
+ // remain GC-reportable.
+ bool m_definitelyStackPointing;
 };
 typedef SmallHashTable<unsigned int, unsigned int, 8U> LocalToLocalMap;
@@ -181,6 +188,8 @@ class ObjectAllocator final : public Phase
 BitSetShortLongRep* m_ConnGraphAdjacencyMatrix;
 unsigned int m_StackAllocMaxSize;
 unsigned m_stackAllocationCount;
+ bool m_UseLocalloc;
+ bool m_UseLocallocInLoop;
 // Info for conditionally-escaping locals
 LocalToLocalMap m_EnumeratorLocalToPseudoIndexMap;
@@ -210,7 +219,8 @@ class ObjectAllocator final : public Phase
 ssize_t length,
 unsigned int* blockSize,
 const char** reason,
- bool preliminaryCheck = false);
+ bool preliminaryCheck = false,
+ bool lengthKnown = true);
 static GenTree* IsGuard(BasicBlock* block, GuardInfo* info);
@@ -258,6 +268,8 @@
 unsigned int blockSize,
 BasicBlock* block,
 Statement* stmt);
+ void MorphNewArrNodeIntoLocAlloc(
+ GenTreeCall* newArr, CORINFO_CLASS_HANDLE clsHnd, GenTree* length, BasicBlock* block, Statement* stmt);
 struct BuildConnGraphVisitorCallbackData;
 void AnalyzeParentStack(ArrayStack<GenTree*>* parentStack, unsigned int lclNum, BasicBlock* block);
 void UpdateAncestorTypes(
diff --git a/src/tests/JIT/opt/ObjectStackAllocation/LocallocStackAlloc.cs b/src/tests/JIT/opt/ObjectStackAllocation/LocallocStackAlloc.cs
new file mode 100644
index 00000000000000..f1a9a86973f1b1
--- /dev/null
+++ b/src/tests/JIT/opt/ObjectStackAllocation/LocallocStackAlloc.cs
@@ -0,0 +1,226 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Runtime.CompilerServices;
+using TestLibrary;
+using Xunit;
+
+enum AllocationKind
+{
+ Heap,
+ Stack,
+ Undefined
+}
+
+delegate int Test();
+
+public class LocallocStackAlloc
+{
+ static bool GCStressEnabled()
+ {
+ return Environment.GetEnvironmentVariable("DOTNET_GCStress") != null;
+ }
+
+ static AllocationKind StackAllocation()
+ {
+ AllocationKind expectedAllocationKind = AllocationKind.Stack;
+ if (GCStressEnabled())
+ {
+ Console.WriteLine("GCStress is enabled");
+ expectedAllocationKind = AllocationKind.Undefined;
+ }
+ return expectedAllocationKind;
+ }
+
+ static AllocationKind HeapAllocation()
+ {
+ AllocationKind expectedAllocationKind = AllocationKind.Heap;
+ if (GCStressEnabled())
+ {
+ Console.WriteLine("GCStress is enabled");
+ expectedAllocationKind = AllocationKind.Undefined;
+ }
+ return expectedAllocationKind;
+ }
+
+ static int CallTestAndVerifyAllocation(Test test, int expectedResult, AllocationKind expectedAllocationsKind, bool throws = false)
+ {
+ string methodName = test.Method.Name;
+ try
+ {
+ long allocatedBytesBefore = GC.GetAllocatedBytesForCurrentThread();
+ int testResult = test();
+ long allocatedBytesAfter = GC.GetAllocatedBytesForCurrentThread();
+
+ if (throws)
+ {
+ Console.WriteLine($"FAILURE ({methodName}): expected exception, got {testResult}");
+ return -1;
+ }
+
+ if (testResult != expectedResult)
+ {
+ Console.WriteLine($"FAILURE ({methodName}): expected {expectedResult}, got {testResult}");
+ return -1;
+ }
+
+ if ((expectedAllocationsKind == AllocationKind.Stack) && (allocatedBytesBefore != allocatedBytesAfter))
+ {
+ Console.WriteLine($"FAILURE ({methodName}): unexpected allocation of {allocatedBytesAfter - allocatedBytesBefore} bytes");
+ return -1;
+ }
+
+ if ((expectedAllocationsKind == AllocationKind.Heap) && (allocatedBytesBefore == allocatedBytesAfter))
+ {
+ Console.WriteLine($"FAILURE ({methodName}): unexpected stack allocation");
+ return -1;
+ }
+
+ Console.WriteLine($"SUCCESS
({methodName})"); + return 100; + } + catch (Exception e) + { + if (throws) + { + Console.WriteLine($"SUCCESS ({methodName}) caught {e.GetType().Name}"); + return 100; + } + Console.WriteLine($"FAILURE ({methodName}): unexpected {e.GetType().Name}: {e.Message}"); + return -1; + } + } + + // Keep JIT from constant-folding the length. + [MethodImpl(MethodImplOptions.NoInlining)] + static int OpaqueLength(int n) => n; + + // Variable-length stack-allocated int[] within the localloc threshold. + // Sums the elements after writing them. + [MethodImpl(MethodImplOptions.NoInlining)] + static int VariableLengthSmall() + { + int n = OpaqueLength(8); + int[] array = new int[n]; + int sum = 0; + for (int i = 0; i < array.Length; i++) + { + array[i] = i + 1; + } + for (int i = 0; i < array.Length; i++) + { + sum += array[i]; + } + return sum + array.Length; + } + + // Variable-length newarr that exceeds the stack-alloc threshold; should be + // routed to the heap helper at runtime instead of corrupting the stack. + [MethodImpl(MethodImplOptions.NoInlining)] + static int VariableLengthLarge() + { + int n = OpaqueLength(10_000); + int[] array = new int[n]; + int sum = 0; + for (int i = 0; i < array.Length; i++) + { + array[i] = 1; + } + for (int i = 0; i < array.Length; i++) + { + sum += array[i]; + } + return sum; + } + + // Negative length must throw OverflowException via the heap helper + // even when the localloc dispatch path is selected. + [MethodImpl(MethodImplOptions.NoInlining)] + static int VariableLengthNegative() + { + int n = OpaqueLength(-1); + int[] array = new int[n]; + return array.Length; + } + + // int.MinValue length must also throw OverflowException; this is the case + // where signed totalSize wraps to a small value if not guarded properly. + [MethodImpl(MethodImplOptions.NoInlining)] + static int VariableLengthIntMin() + { + int n = OpaqueLength(int.MinValue); + int[] array = new int[n]; + return array.Length; + } + + // Length near INT32_MAX with large element size: elemSize * length overflows. + // Helper should raise OutOfMemoryException; no stack corruption. + [MethodImpl(MethodImplOptions.NoInlining)] + static int VariableLengthHuge() + { + int n = OpaqueLength(int.MaxValue); + long[] array = new long[n]; + return array.Length; + } + + // Repeatedly allocate a small variable-length array within a single + // method invocation. The per-frame budget caps total localloc bytes, so + // after enough iterations the remaining allocations must fall back to + // the heap rather than growing the frame without bound. 
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static int VariableLengthFrameBudget()
+ {
+ int sum = 0;
+ for (int iter = 0; iter < 200; iter++)
+ {
+ int n = OpaqueLength(64);
+ int[] array = new int[n];
+ for (int i = 0; i < array.Length; i++)
+ {
+ array[i] = i + 1;
+ }
+ for (int i = 0; i < array.Length; i++)
+ {
+ sum += array[i];
+ }
+ }
+ return sum;
+ }
+
+ [ActiveIssue("needs triage", TestRuntimes.Mono)]
+ [Fact]
+ public static int TestSmall()
+ {
+ VariableLengthSmall();
+ return CallTestAndVerifyAllocation(VariableLengthSmall, 8 + (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8), StackAllocation());
+ }
+
+ [ActiveIssue("needs triage", TestRuntimes.Mono)]
+ [Fact]
+ public static int TestLarge()
+ {
+ VariableLengthLarge();
+ return CallTestAndVerifyAllocation(VariableLengthLarge, 10_000, HeapAllocation());
+ }
+
+ [ActiveIssue("needs triage", TestRuntimes.Mono)]
+ [Fact]
+ public static int TestNegative() => CallTestAndVerifyAllocation(VariableLengthNegative, 0, AllocationKind.Undefined, throws: true);
+
+ [ActiveIssue("needs triage", TestRuntimes.Mono)]
+ [Fact]
+ public static int TestIntMin() => CallTestAndVerifyAllocation(VariableLengthIntMin, 0, AllocationKind.Undefined, throws: true);
+
+ [ActiveIssue("needs triage", TestRuntimes.Mono)]
+ [Fact]
+ public static int TestHuge() => CallTestAndVerifyAllocation(VariableLengthHuge, 0, AllocationKind.Undefined, throws: true);
+
+ [ActiveIssue("needs triage", TestRuntimes.Mono)]
+ [Fact]
+ public static int TestFrameBudget()
+ {
+ VariableLengthFrameBudget();
+ return CallTestAndVerifyAllocation(VariableLengthFrameBudget, 200 * ((64 * 65) / 2), HeapAllocation());
+ }
+}
diff --git a/src/tests/JIT/opt/ObjectStackAllocation/LocallocStackAlloc.csproj b/src/tests/JIT/opt/ObjectStackAllocation/LocallocStackAlloc.csproj
new file mode 100644
index 00000000000000..993c32962762b9
--- /dev/null
+++ b/src/tests/JIT/opt/ObjectStackAllocation/LocallocStackAlloc.csproj
@@ -0,0 +1,15 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <RequiresProcessIsolation>true</RequiresProcessIsolation>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+    <JitOptimizationSensitive>true</JitOptimizationSensitive>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <CLRTestEnvironmentVariable Include="DOTNET_TieredCompilation" Value="0" />
+  </ItemGroup>
+</Project>
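
Note: the dispatch that fgExpandStackArrayAllocation emits is easier to follow outside of JIT IR. The C# sketch below is behaviorally equivalent, not actual JIT output; ShouldStackAllocate and AlignUp are illustrative names, the constants assume a 64-bit target (16-byte array header, 8-byte pointers), and unlike this sketch the emitted IR computes totalSize unconditionally and ORs the two unsigned compares into a single JTRUE.

    // Runtime dispatch for "new T[length]" routed through localloc.
    // maxSafeLength and frameLimit are JIT-time constants; frameRunningTotal
    // is the zero-initialized per-frame running-total local.
    static bool ShouldStackAllocate(long length, ulong elemSize,
                                    ulong maxSafeLength, ulong frameLimit,
                                    ref ulong frameRunningTotal)
    {
        // Unsigned compare: a negative length reinterprets as a huge value,
        // so it takes the heap path, where the helper throws OverflowException.
        if ((ulong)length > maxSafeLength)
        {
            return false; // call the ordinary newarr helper
        }

        // Header plus payload, rounded up to pointer size. Safe to compute
        // here because length <= maxSafeLength rules out overflow.
        ulong totalSize = AlignUp(16 + (ulong)length * elemSize, 8);

        // Per-frame budget: once the running total would pass frameLimit,
        // fall back to the heap rather than growing the frame without bound.
        if (frameRunningTotal + totalSize > frameLimit)
        {
            return false; // call the ordinary newarr helper
        }

        frameRunningTotal += totalSize; // only the localloc path consumes budget
        return true;                    // localloc, zeroed via GTF_LCLHEAP_MUSTINIT
    }

    static ulong AlignUp(ulong size, ulong align) => (size + align - 1) & ~(align - 1);

With the default knobs (JitObjectStackAllocationSize = 528, JitObjectStackAllocationFrameSize = 8 * 528 = 4224), each 64-element int[] in VariableLengthFrameBudget needs 16 + 64 * 4 = 272 bytes, so about 15 iterations fit the budget and the remaining iterations take the heap helper, which is what TestFrameBudget's HeapAllocation() expectation relies on.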
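The JIT-time maxSafeLength precomputation can be sketched the same way (again illustrative C# mirroring the logic in fgExpandStackArrayAllocation; the pointerSize and align8Pad parameters stand in for TARGET_POINTER_SIZE and the 32-bit ALIGN8 pad):

    static ulong MaxSafeLength(ulong stackLimit, ulong baseBytes, ulong elemSize,
                               ulong pointerSize, ulong align8Pad)
    {
        // Nothing fits if the header (plus any align8 pad) already exhausts
        // the stack-allocation budget.
        if (stackLimit <= baseBytes + align8Pad)
        {
            return 0;
        }

        ulong maxLen = (stackLimit - baseBytes - align8Pad) / elemSize;

        // The pointer-size round-up can add up to (pointerSize - 1) bytes;
        // trim one element to absorb that slack.
        if (((elemSize % pointerSize) != 0) && (maxLen > 0))
        {
            maxLen--;
        }

        return maxLen;
    }

For int[] with the defaults on 64-bit this gives (528 - 16) / 4 = 128, minus one element for round-up slack, so maxSafeLength = 127: VariableLengthSmall (length 8) stays on the stack, while VariableLengthLarge (length 10,000) fails the unsigned length check and takes the heap path.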