From e0b6d21bb6755372709cb47ee69da8e713ef10e8 Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Fri, 8 May 2026 21:48:36 +0200
Subject: [PATCH 01/23] JIT: PGO value-profiling for non-constant zero-init
 stackalloc

When stackalloc has a non-constant size and the method has compInitMem,
codegen falls back to a slow per-stack-alignment zero-init loop. Use PGO
value probing to record the most popular size, and in Tier1 specialize:

    size == popularSize ? LCLHEAP(popularSize) : LCLHEAP(size)

The constant-size LCLHEAP fast path is unrolled by Lowering as STORE_BLK,
producing efficient SIMD zero-init.

Generalized value-probe infrastructure:

* Introduce GenTreeOpWithILOffset (a GenTreeOp that carries an IL offset),
  used by GT_LCLHEAP so it can participate in value-histogram
  instrumentation alongside calls without a side hash table.
* Centralize value-probe candidate identification in fgprofile.cpp via a
  single IsValueHistogramProbeCandidate helper used by the visitor,
  schema builder, and probe inserter.
* Extract pickProfiledValue helper for sharing between specializations.
* Make GT_LCLHEAP report its potential stack-overflow exception via
  OperExceptions so GTF_EXCEPT survives gtUpdateNodeOperSideEffects and
  if-conversion does not collapse the QMARK arms into a CMOV.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/compiler.h | 6 ++
 src/coreclr/jit/fgprofile.cpp | 109 ++++++++++++++++++-----
 src/coreclr/jit/gentree.cpp | 21 +++++
 src/coreclr/jit/gentree.h | 44 +++++++++-
 src/coreclr/jit/gtlist.h | 2 +-
 src/coreclr/jit/gtstructs.h | 1 +
 src/coreclr/jit/importer.cpp | 9 +-
 src/coreclr/jit/importercalls.cpp | 138 ++++++++++++++++++++++++++++++
 8 files changed, 301 insertions(+), 29 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index b5f4a002ef7a6a..f2dc0f116c3de2 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3220,6 +3220,10 @@ class Compiler
 GenTreeColon* gtNewColonNode(var_types type, GenTree* thenNode, GenTree* elseNode);
 GenTreeQmark* gtNewQmarkNode(var_types type, GenTree* cond, GenTreeColon* colon);
+ GenTreeOpWithILOffset* gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOffset = 0);
+
+ bool pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssize_t* pValue);
+
 GenTree* gtNewLargeOperNode(genTreeOps oper,
 var_types type = TYP_I_IMPL,
 GenTree* op1 = nullptr,
@@ -5089,6 +5093,8 @@ class Compiler

 GenTree* impDuplicateWithProfiledArg(GenTreeCall* call, IL_OFFSET ilOffset);

+ GenTree* impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset);
+
 GenTree* impThrowIfNull(GenTreeCall* call);

 #ifdef DEBUG
diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp
index 00a3661c5f6dac..f898fc216109f5 100644
--- a/src/coreclr/jit/fgprofile.cpp
+++ b/src/coreclr/jit/fgprofile.cpp
@@ -1893,6 +1893,64 @@ class HandleHistogramProbeVisitor final : public GenTreeVisitor<HandleHistogram

//------------------------------------------------------------------------
// IsValueHistogramProbeCandidate: check whether the given node is a candidate
// for a generic value-histogram probe, optionally returning the probe's IL
// offset and a reference to the use of the profiled operand.
//
static bool IsValueHistogramProbeCandidate(Compiler* compiler,
 GenTree* node,
 IL_OFFSET* ilOffset = nullptr,
 GenTree*** operandUseRef = nullptr)
{
 if (node->OperIs(GT_LCLHEAP))
{
if (ilOffset != nullptr)
{
*ilOffset = node->AsOpWithILOffset()->GetILOffset();
}
if (operandUseRef != nullptr)
{
*operandUseRef = &node->AsOp()->gtOp1;
}
return true;
}

if (node->IsCall() && node->AsCall()->IsSpecialIntrinsic())
{
const NamedIntrinsic ni = compiler->lookupNamedIntrinsic(node->AsCall()->gtCallMethHnd);
if ((ni == NI_System_SpanHelpers_Memmove) || (ni == NI_System_SpanHelpers_SequenceEqual))
{
if (ilOffset != nullptr)
{
assert(node->AsCall()->gtHandleHistogramProfileCandidateInfo != nullptr);
*ilOffset = node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset;
}
if (operandUseRef != nullptr)
{
// Memmove(dst, src, len) and SequenceEqual(left, right, len) -- profile len.
*operandUseRef = &node->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef();
}
return true;
}
}

return false;
}

//------------------------------------------------------------------------
// ValueHistogramProbeVisitor: invoke functor on each node requiring a generic value probe
//
@@ -1918,13 +1976,9 @@ class ValueHistogramProbeVisitor final : public GenTreeVisitor<ValueHistogramPr
 GenTree* const node = *use;
- if (node->IsCall() && node->AsCall()->IsSpecialIntrinsic())
+ if (IsValueHistogramProbeCandidate(m_compiler, node))
 {
- const NamedIntrinsic ni = m_compiler->lookupNamedIntrinsic(node->AsCall()->gtCallMethHnd);
- if ((ni == NI_System_SpanHelpers_Memmove) || (ni == NI_System_SpanHelpers_SequenceEqual))
- {
- m_functor(m_compiler, node);
- }
+ m_functor(m_compiler, node);
 }
 return Compiler::WALK_CONTINUE;
 }
@@ -2009,14 +2063,18 @@ class BuildValueHistogramProbeSchemaGen
 {
 }

- void operator()(Compiler* compiler, GenTree* call)
+ void operator()(Compiler* compiler, GenTree* tree)
 {
+ IL_OFFSET ilOffset = 0;
+ bool isCandidate = IsValueHistogramProbeCandidate(compiler, tree, &ilOffset);
+ assert(isCandidate);
+
 ICorJitInfo::PgoInstrumentationSchema schemaElem = {};
 schemaElem.Count = 1;
 schemaElem.InstrumentationKind = compiler->opts.compCollect64BitCounts
 ? ICorJitInfo::PgoInstrumentationKind::ValueHistogramLongCount
 : ICorJitInfo::PgoInstrumentationKind::ValueHistogramIntCount;
- schemaElem.ILOffset = (int32_t)call->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset;
+ schemaElem.ILOffset = static_cast<int32_t>(ilOffset);
 m_schema.push_back(schemaElem);

 m_schemaCount++;
@@ -2250,12 +2308,13 @@ class ValueHistogramProbeInserter
 return;
 }

- assert(node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_Memmove) ||
- node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_SequenceEqual));
+ IL_OFFSET candidateIlOffset = 0;
+ GenTree** operandUseRef = nullptr;
+ bool isCandidate = IsValueHistogramProbeCandidate(compiler, node, &candidateIlOffset, &operandUseRef);
+ assert(isCandidate);

 const ICorJitInfo::PgoInstrumentationSchema& countEntry = m_schema[*m_currentSchemaIndex];
- if (countEntry.ILOffset !=
- static_cast<int32_t>(node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset))
+ if (countEntry.ILOffset != static_cast<int32_t>(candidateIlOffset))
 {
 return;
 }
@@ -2275,9 +2334,8 @@ class ValueHistogramProbeInserter

 *m_currentSchemaIndex += 2;

- GenTree** lenArgRef = &node->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef();
-
- // We have Memmove(dst, src, len) and we want to insert a call to CORINFO_HELP_VALUEPROFILE for the len:
+ // We have either Memmove(dst, src, len)/SequenceEqual(...,len) or LCLHEAP(len) and we want to insert a
+ // call to CORINFO_HELP_VALUEPROFILE for the profiled operand:
 //
 // \--* COMMA long
 // +--* CALL help void CORINFO_HELP_VALUEPROFILE
 // | +--* COMMA long
 // | | +--* STORE_LCL_VAR long tmp
 // | | | \--* (node to poll)
 // | | \--* LCL_VAR long tmp
 // | \--* CNS_INT long
 // \--* LCL_VAR long tmp
 //
 const unsigned lenTmpNum = compiler->lvaGrabTemp(true DEBUGARG("length histogram profile tmp"));
- GenTree* storeLenToTemp = compiler->gtNewTempStore(lenTmpNum, *lenArgRef);
- GenTree* lengthLocal = compiler->gtNewLclvNode(lenTmpNum, genActualType(*lenArgRef));
- GenTreeOp* lengthNode = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), storeLenToTemp, lengthLocal);
- GenTree* histNode = compiler->gtNewIconNode(reinterpret_cast<ssize_t>(hist), TYP_I_IMPL);
- unsigned helper = is32 ? CORINFO_HELP_VALUEPROFILE32 : CORINFO_HELP_VALUEPROFILE64;
+ GenTree* storeLenToTemp = compiler->gtNewTempStore(lenTmpNum, *operandUseRef);
+ GenTree* lengthLocal = compiler->gtNewLclvNode(lenTmpNum, genActualType(*operandUseRef));
+ GenTree* lengthNode = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), storeLenToTemp, lengthLocal);
+ GenTree* histNode = compiler->gtNewIconNode(reinterpret_cast<ssize_t>(hist), TYP_I_IMPL);
+ unsigned helper = is32 ? CORINFO_HELP_VALUEPROFILE32 : CORINFO_HELP_VALUEPROFILE64;
+
+ if (!lengthNode->TypeIs(TYP_I_IMPL))
+ {
+ lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /* isUnsigned */ false, TYP_I_IMPL);
+ }
+
 GenTreeCall* helperCallNode = compiler->gtNewHelperCallNode(helper, TYP_VOID, lengthNode, histNode);

- *lenArgRef = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), helperCallNode,
- compiler->gtCloneExpr(lengthLocal));
+ *operandUseRef = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), helperCallNode,
+ compiler->gtCloneExpr(lengthLocal));
 m_instrCount++;
 }
 };
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index d91d1961bc2ba4..20eefe9e42e7d6 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -2874,6 +2874,7 @@ bool GenTree::Compare(GenTree* op1, GenTree* op2, bool swapOK)
 break;

 // For the ones below no extra argument matters for comparison.
+ case GT_LCLHEAP:
 case GT_BOX:
 case GT_RUNTIMELOOKUP:
 case GT_ARR_ADDR:
@@ -3447,6 +3448,7 @@ unsigned Compiler::gtHashValue(GenTree* tree)
 break;

 // For the ones below no extra argument matters for comparison.
+ case GT_LCLHEAP:
 case GT_BOX:
 case GT_ARR_ADDR:
 break;
@@ -8565,6 +8567,10 @@ ExceptionSetFlags GenTree::OperExceptions(Compiler* comp)
 case GT_INDEX_ADDR:
 return ExceptionSetFlags::NullReferenceException | ExceptionSetFlags::IndexOutOfRangeException;

+ case GT_LCLHEAP:
+ // Stack overflow on a localloc with a large or unknown size.
+ return ExceptionSetFlags::UnknownException;
+
 case GT_CKFINITE:
 return ExceptionSetFlags::ArithmeticException;

@@ -9031,6 +9037,16 @@ GenTreeQmark* Compiler::gtNewQmarkNode(var_types type, GenTree* cond, GenTreeCol
 return result;
 }

+GenTreeOpWithILOffset* Compiler::gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOffset)
+{
+ assert(size != nullptr);
+ GenTreeOpWithILOffset* node =
+ new (this, GT_LCLHEAP) GenTreeOpWithILOffset(GT_LCLHEAP, TYP_I_IMPL, size, nullptr, ilOffset);
+ // May throw a stack overflow exception. Obviously, we don't want locallocs to be CSE'd.
+ node->gtFlags |= (GTF_EXCEPT | GTF_DONT_CSE);
+ return node;
+}
+
 GenTreeIntCon* Compiler::gtNewIconNode(ssize_t value, var_types type)
 {
 assert(genActualType(type) == type);
@@ -11143,6 +11159,11 @@ GenTree* Compiler::gtCloneExpr(GenTree* tree)
 }
 break;

+ case GT_LCLHEAP:
+ copy = new (this, GT_LCLHEAP) GenTreeOpWithILOffset(GT_LCLHEAP, tree->TypeGet(), tree->gtGetOp1(),
+ nullptr, tree->AsOpWithILOffset()->GetILOffset());
+ break;
+
 case GT_SWIFT_ERROR_RET:
 copy = new (this, oper) GenTreeOp(oper, tree->TypeGet(), tree->gtGetOp1(), tree->gtGetOp2());
 break;
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 85b8693ce996a3..f7e4787810b018 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -3249,7 +3249,6 @@ struct GenTreeOp : public GenTreeUnOp
 // Unary operators with optional arguments:
 assert(oper == GT_RETURN || oper == GT_RETFILT || OperIsBlk(oper));
 }
-
 // returns true if we will use the division by constant optimization for this node.
bool UsesDivideByConstOptimized(Compiler* comp); @@ -3315,6 +3314,49 @@ struct GenTreeOp : public GenTreeUnOp #endif }; +// GenTreeOpWithILOffset - a GenTreeOp that additionally carries an IL offset. +// +// Used by JIT phases that need to associate a node with a specific IL offset +// independently of any debug-info side tables. In particular, it allows +// non-call value-profile probe candidates (e.g. GT_LCLHEAP) to participate +// in the same value-histogram instrumentation pipeline as calls without a +// side hash table. +// +struct GenTreeOpWithILOffset : public GenTreeOp +{ +private: + IL_OFFSET gtILOffset; + +public: + IL_OFFSET GetILOffset() const + { + return gtILOffset; + } + + void SetILOffset(IL_OFFSET ilOffset) + { + gtILOffset = ilOffset; + } + + GenTreeOpWithILOffset(genTreeOps oper, + var_types type, + GenTree* op1, + GenTree* op2, + IL_OFFSET ilOffset DEBUGARG(bool largeNode = false)) + : GenTreeOp(oper, type, op1, op2 DEBUGARG(largeNode)) + , gtILOffset(ilOffset) + { + } + +#if DEBUGGABLE_GENTREE + GenTreeOpWithILOffset() + : GenTreeOp() + , gtILOffset(0) + { + } +#endif +}; + struct GenTreeVal : public GenTree { size_t gtVal1; diff --git a/src/coreclr/jit/gtlist.h b/src/coreclr/jit/gtlist.h index b293df525ed695..40f623a408c102 100644 --- a/src/coreclr/jit/gtlist.h +++ b/src/coreclr/jit/gtlist.h @@ -70,7 +70,7 @@ GTNODE(KEEPALIVE , GenTree ,0,0,GTK_UNOP|GTK_NOVALUE) // kee GTNODE(CAST , GenTreeCast ,0,0,GTK_UNOP|GTK_EXOP) // conversion to another type GTNODE(BITCAST , GenTreeOp ,0,1,GTK_UNOP) // reinterpretation of bits as another type GTNODE(CKFINITE , GenTreeOp ,0,1,GTK_UNOP|DBK_NOCONTAIN) // Check for NaN -GTNODE(LCLHEAP , GenTreeOp ,0,1,GTK_UNOP|DBK_NOCONTAIN) // alloca() +GTNODE(LCLHEAP , GenTreeOpWithILOffset, 0,1,GTK_UNOP|GTK_EXOP|DBK_NOCONTAIN) // alloca() GTNODE(BOUNDS_CHECK , GenTreeBoundsChk ,0,1,GTK_BINOP|GTK_EXOP|GTK_NOVALUE) // a bounds check - for arrays/spans/SIMDs/HWINTRINSICs diff --git a/src/coreclr/jit/gtstructs.h b/src/coreclr/jit/gtstructs.h index 6e7d62e496f038..d8afe98424ee48 100644 --- a/src/coreclr/jit/gtstructs.h +++ b/src/coreclr/jit/gtstructs.h @@ -74,6 +74,7 @@ GTSTRUCT_1(Colon , GT_COLON) GTSTRUCT_1(FptrVal , GT_FTN_ADDR) GTSTRUCT_1(Intrinsic , GT_INTRINSIC) GTSTRUCT_1(IndexAddr , GT_INDEX_ADDR) +GTSTRUCT_1(OpWithILOffset, GT_LCLHEAP) #if defined(FEATURE_HW_INTRINSICS) GTSTRUCT_N(MultiOp , GT_HWINTRINSIC) #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index 84749718f92d18..254f753d835d2d 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -10134,10 +10134,11 @@ void Compiler::impImportBlockCode(BasicBlock* block) return; } - op1 = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, op2); - // We do not model stack overflow from localloc as an exception side effect. - // Obviously, we don't want locallocs to be CSE'd. - op1->gtFlags |= GTF_DONT_CSE; + op1 = gtNewLclHeapNode(op2, opcodeOffs); + + // PGO value-profiling for non-constant zero-init stackalloc: + // optimize / instrument based on the most likely size. + op1 = impProfileLclHeap(op1, opcodeOffs); // Request stack security for this method. 
setNeedsGSSecurityCookie();
diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index 65243e7ca9e636..fb7728adbbedc0 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -1806,6 +1806,92 @@ GenTree* Compiler::impDuplicateWithProfiledArg(GenTreeCall* call, IL_OFFSET ilOf
 return call;
}

+//------------------------------------------------------------------------
+// impProfileLclHeap: Apply PGO value-profiling to a non-constant GT_LCLHEAP.
+//
+// Arguments:
+// lclHeap - the GT_LCLHEAP node to profile/specialize
+// ilOffset - IL offset of the LOCALLOC opcode
+//
+// Return Value:
+// Either the original `lclHeap` node, or a fresh tree that produces the
+// same value but is specialized via a profiled-size guard:
+//
+// size == popularSize ? LCLHEAP(popularSize) : LCLHEAP(size)
+//
+// Notes:
+// The constant-size LCLHEAP fast path is unrolled by Lowering (efficient
+// zero-init via STORE_BLK), so specializing for a popular size avoids the
+// slow per-stack-alignment zeroing loop in the non-constant LCLHEAP codegen.
+//
+// We don't need that optimization if SkipLocalsInit is set, as even
+// non-constant locallocs are cheap in that case; we only want to speed up
+// zeroing.
+//
+GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset)
+{
+ assert(lclHeap->OperIs(GT_LCLHEAP));
+
+ GenTree* size = lclHeap->AsOp()->gtOp1;
+ if (!info.compInitMem || size->IsIntegralConst())
+ {
+ return lclHeap;
+ }
+
+ // Consuming the existing profile (optimizing).
+ if (opts.IsOptimizedWithProfile())
+ {
+ ssize_t profiledValue = 0;
+ uint32_t likelihood = 0;
+ if (!pickProfiledValue(ilOffset, &likelihood, &profiledValue) || (likelihood < 50) ||
+ ((uint32_t)profiledValue > INT_MAX))
+ {
+ return lclHeap;
+ }
+ assert(FitsIn<int>(profiledValue));
+
+ GenTree* sizeNode = size;
+ GenTree* clonedSizeNode =
+ impCloneExpr(sizeNode, &sizeNode, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling sizeNode"));
+ GenTree* profiledValueNode = gtNewIconNode(profiledValue, size->TypeGet());
+ GenTree* fallback = gtNewLclHeapNode(clonedSizeNode, ilOffset);
+
+ GenTree* fastpath;
+ if (profiledValue == 0)
+ {
+ // Just nullptr.
+ fastpath = gtNewIconNode(0, TYP_I_IMPL);
+ }
+ else
+ {
+ // NOTE: we don't want to convert the fastpath stackalloc to a local like we
+ // normally do, because it would be additional overhead for the fallback path
+ // (a redundant local to clear).
+ fastpath = gtNewLclHeapNode(profiledValueNode, ilOffset);
+ }
+
+ // TODO: Specify weights for the branches in the Qmark node.
+ GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(TYP_I_IMPL, fastpath, fallback);
+ GenTreeOp* cond = gtNewOperNode(GT_EQ, TYP_INT, sizeNode, gtCloneExpr(profiledValueNode));
+ GenTreeQmark* qmark = gtNewQmarkNode(TYP_I_IMPL, cond, colon);
+
+ // QMARK has to be a root node.
+ unsigned tmp = lvaGrabTemp(true DEBUGARG("Grabbing temp for Qmark"));
+ impStoreToTemp(tmp, qmark, CHECK_SPILL_ALL);
+ return gtNewLclvNode(tmp, qmark->TypeGet());
+ }
+
+ // Instrumenting LCLHEAP for value profile.
+ if (opts.IsInstrumented() && !compIsForInlining())
+ {
+ JITDUMP("\n ... marking [%06u] in " FMT_BB " for value profile instrumentation\n", dspTreeID(lclHeap),
+ compCurBB->bbNum);
+ compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE);
+ }
+
+ return lclHeap;
+}
+
 #ifdef DEBUG
 //
 var_types Compiler::impImportJitTestLabelMark(int numArgs)
@@ -7151,6 +7237,58 @@ void Compiler::addFatPointerCandidate(GenTreeCall* call)
 helper.StoreRetExprResultsInArgs(call);
 }

+//------------------------------------------------------------------------
+// pickProfiledValue: Use profile information to pick a value candidate for the given IL offset.
+//
+// Arguments:
+// ilOffset - exact IL offset of the call
+// pLikelihood - [out] likelihood of the picked value
+// pValue - [out] the picked value
+//
+// Return Value:
+// true if a value was picked, false otherwise
+//
+bool Compiler::pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssize_t* pValue)
+{
+#if DEBUG
+ const unsigned MaxLikelyValues = 8;
+#else
+ const unsigned MaxLikelyValues = 1;
+#endif
+
+ LikelyValueRecord likelyValues[MaxLikelyValues];
+ UINT32 valuesCount = getLikelyValues(likelyValues, MaxLikelyValues, fgPgoSchema, fgPgoSchemaCount, fgPgoData,
+ static_cast<int32_t>(ilOffset));
+
+ LikelyValueRecord likelyValue = likelyValues[0];
+#if DEBUG
+ JITDUMP("%u likely values:\n", valuesCount);
+ for (UINT32 i = 0; i < valuesCount; i++)
+ {
+ JITDUMP(" %u) %u - %u%%\n", i, likelyValues[i].value, likelyValues[i].likelihood)
+ }
+
+ // Re-use JitRandomGuardedDevirtualization for stress-testing.
+ if (JitConfig.JitRandomGuardedDevirtualization() != 0)
+ {
+ CLRRandom* random = impInlineRoot()->m_inlineStrategy->GetRandom(JitConfig.JitRandomGuardedDevirtualization());
+
+ valuesCount = 1;
+ likelyValue.value = random->Next(256);
+ likelyValue.likelihood = 100;
+ }
+#endif
+
+ if (valuesCount < 1)
+ {
+ return false;
+ }
+
+ *pValue = likelyValue.value;
+ *pLikelihood = likelyValue.likelihood;
+ return true;
+}
+
 //------------------------------------------------------------------------
 // pickGDV: Use profile information to pick a GDV/cast type candidate for a call site.
 //

From f32bbffcb955a7d008f9a6c9354c93ef1188ea67 Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Fri, 8 May 2026 22:12:18 +0200
Subject: [PATCH 02/23] Address Copilot reviewer feedback

* pickProfiledValue: avoid copying likelyValues[0] before the empty-result
  check; read directly from likelyValues[0] after the early return.
* pickProfiledValue: fix %u format specifier for ssize_t value (use %zd).
* pickProfiledValue: doc comment no longer implies the helper is call-only.
* impProfileLclHeap: replace truncating ((uint32_t)profiledValue > INT_MAX)
  range check with !FitsIn<int>(profiledValue).
* GT_LCLHEAP OperExceptions: reword comment to reflect unconditional
  modeling of stack overflow.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/gentree.cpp | 2 +-
 src/coreclr/jit/importercalls.cpp | 21 +++++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 20eefe9e42e7d6..32b1b74ed42517 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -8568,7 +8568,7 @@ ExceptionSetFlags GenTree::OperExceptions(Compiler* comp)
 case GT_LCLHEAP:
- // Stack overflow on a localloc with a large or unknown size.
+ // Localloc may always overflow the stack; model unconditionally.
 return ExceptionSetFlags::UnknownException;

 case GT_CKFINITE:
diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index fb7728adbbedc0..671d52e2a16b8a 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -1844,11 +1844,10 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset)
 ssize_t profiledValue = 0;
 uint32_t likelihood = 0;
 if (!pickProfiledValue(ilOffset, &likelihood, &profiledValue) || (likelihood < 50) ||
- ((uint32_t)profiledValue > INT_MAX))
+ !FitsIn<int>(profiledValue))
 {
 return lclHeap;
 }
- assert(FitsIn<int>(profiledValue));

 GenTree* sizeNode = size;
 GenTree* clonedSizeNode =
@@ -7240,7 +7240,7 @@ void Compiler::addFatPointerCandidate(GenTreeCall* call)
 // pickProfiledValue: Use profile information to pick a value candidate for the given IL offset.
 //
 // Arguments:
-// ilOffset - exact IL offset of the call
+// ilOffset - exact IL offset of the value-profile probe site
 // pLikelihood - [out] likelihood of the picked value
 // pValue - [out] the picked value
 //
@@ -7260,12 +7259,11 @@ bool Compiler::pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssiz
 UINT32 valuesCount = getLikelyValues(likelyValues, MaxLikelyValues, fgPgoSchema, fgPgoSchemaCount, fgPgoData,
 static_cast<int32_t>(ilOffset));

-
 LikelyValueRecord likelyValue = likelyValues[0];
 #if DEBUG
 JITDUMP("%u likely values:\n", valuesCount);
 for (UINT32 i = 0; i < valuesCount; i++)
 {
- JITDUMP(" %u) %u - %u%%\n", i, likelyValues[i].value, likelyValues[i].likelihood)
+ JITDUMP(" %u) %zd - %u%%\n", i, likelyValues[i].value, likelyValues[i].likelihood)
 }

 // Re-use JitRandomGuardedDevirtualization for stress-testing.
@@ -7273,9 +7271,12 @@ bool Compiler::pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssiz
 {
 CLRRandom* random = impInlineRoot()->m_inlineStrategy->GetRandom(JitConfig.JitRandomGuardedDevirtualization());

- valuesCount = 1;
- likelyValue.value = random->Next(256);
- likelyValue.likelihood = 100;
+ if (valuesCount < 1)
+ {
+ valuesCount = 1;
+ }
+ likelyValues[0].value = random->Next(256);
+ likelyValues[0].likelihood = 100;
 }
 #endif

@@ -7284,8 +7285,8 @@ bool Compiler::pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssiz
 return false;
 }

- *pValue = likelyValue.value;
- *pLikelihood = likelyValue.likelihood;
+ *pValue = likelyValues[0].value;
+ *pLikelihood = likelyValues[0].likelihood;
 return true;
 }

From fe8750af4ba6187daa37c199baf86d95124b1886 Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Fri, 8 May 2026 22:39:48 +0200
Subject: [PATCH 03/23] Add JitMetrics counters and tighten profiled LCLHEAP
 guard

* Add ValueProfiledLclHeap, ValueProfiledMemmove, ValueProfiledSequenceEqual
  metrics so SPMI replay/diff can show how often value-profile
  specialization fires per shape.
* impProfileLclHeap: clamp profiledValue to [0, getUnrollThreshold(Memset)]
  so we only specialize when the constant-size LCLHEAP fast path is
  actually unrolled by Lowering. Outside that range the cmp/jne guard adds
  overhead with no codegen win. Also reject negative values that
  FitsIn<int> would otherwise allow.
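
(Illustration, not part of the change: a tiny standalone C++ sketch of why
the clamp also has to reject negatives; the threshold constant below is made
up, since getUnrollThreshold(Memset) is target-dependent.)

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const int64_t memsetUnrollThreshold = 128; // hypothetical; target-dependent
        int64_t profiledValue = -8;                // a negative profiled size
        // A FitsIn<int>-style check alone accepts -8 (it fits in int32)...
        bool fitsInInt = (profiledValue >= INT32_MIN) && (profiledValue <= INT32_MAX);
        // ...while the [0, threshold] clamp rejects it.
        bool inRange = (profiledValue >= 0) && (profiledValue <= memsetUnrollThreshold);
        assert(fitsInInt && !inRange);
        return 0;
    }
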
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/importercalls.cpp | 46 +++++++++++++++++++++----------
 src/coreclr/jit/jitmetadatalist.h | 3 ++
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index 671d52e2a16b8a..354ad3baf531b1 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -1800,6 +1800,16 @@ GenTree* Compiler::impDuplicateWithProfiledArg(GenTreeCall* call, IL_OFFSET ilOf
 JITDUMP("\n\nResulting tree:\n")
 DISPTREE(qmark)

+ if (call->IsSpecialIntrinsic(this, NI_System_SpanHelpers_Memmove))
+ {
+ Metrics.ValueProfiledMemmove++;
+ }
+ else
+ {
+ assert(call->IsSpecialIntrinsic(this, NI_System_SpanHelpers_SequenceEqual));
+ Metrics.ValueProfiledSequenceEqual++;
+ }
+
 return qmark;
 }
 }
@@ -1843,11 +1853,21 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset)
 {
 ssize_t profiledValue = 0;
 uint32_t likelihood = 0;
- if (!pickProfiledValue(ilOffset, &likelihood, &profiledValue) || (likelihood < 50) ||
- !FitsIn<int>(profiledValue))
+ if (!pickProfiledValue(ilOffset, &likelihood, &profiledValue) || (likelihood < 50))
+ {
+ return lclHeap;
+ }
+
+ // The fast path is profitable only when the constant-size LCLHEAP zero-init can
+ // actually be unrolled by Lowering. Outside that window the cmp/jne guard adds
+ // overhead with no codegen win.
+ const ssize_t maxValue = (ssize_t)getUnrollThreshold(UnrollKind::Memset);
+ if ((profiledValue < 0) || (profiledValue > maxValue))
 {
+ JITDUMP("Profiled LCLHEAP size %zd is out of range [0, %zd] - skipping\n", profiledValue, maxValue);
 return lclHeap;
 }
+ assert(FitsIn<int>(profiledValue));

 GenTree* sizeNode = size;
 GenTree* clonedSizeNode =
@@ -1877,6 +1897,7 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset)
 // QMARK has to be a root node.
 unsigned tmp = lvaGrabTemp(true DEBUGARG("Grabbing temp for Qmark"));
 impStoreToTemp(tmp, qmark, CHECK_SPILL_ALL);
+ Metrics.ValueProfiledLclHeap++;
 return gtNewLclvNode(tmp, qmark->TypeGet());
 }

@@ -7250,14 +7271,16 @@ void Compiler::addFatPointerCandidate(GenTreeCall* call)
 bool Compiler::pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssize_t* pValue)
 {
 #if DEBUG
+ // Request 8 likely values in debug to get more information in JitDump.
 const unsigned MaxLikelyValues = 8;
 #else
 const unsigned MaxLikelyValues = 1;
 #endif

- LikelyValueRecord likelyValues[MaxLikelyValues];
+ LikelyValueRecord likelyValues[MaxLikelyValues] = {};
 UINT32 valuesCount = getLikelyValues(likelyValues, MaxLikelyValues, fgPgoSchema, fgPgoSchemaCount, fgPgoData,
 static_cast<int32_t>(ilOffset));
+ assert(valuesCount <= MaxLikelyValues);

 #if DEBUG
 JITDUMP("%u likely values:\n", valuesCount);
@@ -7270,24 +7293,19 @@ bool Compiler::pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssiz
 if (JitConfig.JitRandomGuardedDevirtualization() != 0)
 {
 CLRRandom* random = impInlineRoot()->m_inlineStrategy->GetRandom(JitConfig.JitRandomGuardedDevirtualization());
-
- if (valuesCount < 1)
- {
- valuesCount = 1;
- }
+ valuesCount = max(valuesCount, 1u);
 likelyValues[0].value = random->Next(256);
 likelyValues[0].likelihood = 100;
 }
 #endif

- if (valuesCount < 1)
+ if (valuesCount >= 1)
 {
- return false;
+ *pValue = likelyValues[0].value;
+ *pLikelihood = likelyValues[0].likelihood;
+ return true;
 }
-
- *pValue = likelyValues[0].value;
- *pLikelihood = likelyValues[0].likelihood;
- return true;
+ return false;
 }

 //------------------------------------------------------------------------
diff --git a/src/coreclr/jit/jitmetadatalist.h b/src/coreclr/jit/jitmetadatalist.h
index 215ea82ead5606..73e540437e3318 100644
--- a/src/coreclr/jit/jitmetadatalist.h
+++ b/src/coreclr/jit/jitmetadatalist.h
@@ -95,6 +95,9 @@ JITMETADATAMETRIC(MorphTrackedLocals, int, 0)
 JITMETADATAMETRIC(MorphLocals, int, 0)
 JITMETADATAMETRIC(EnumeratorGDVProvisionalNoEscape, int, 0)
 JITMETADATAMETRIC(EnumeratorGDVCanCloneToEnsureNoEscape, int, 0)
+JITMETADATAMETRIC(ValueProfiledLclHeap, int, JIT_METADATA_HIGHER_IS_BETTER)
+JITMETADATAMETRIC(ValueProfiledMemmove, int, JIT_METADATA_HIGHER_IS_BETTER)
+JITMETADATAMETRIC(ValueProfiledSequenceEqual, int, JIT_METADATA_HIGHER_IS_BETTER)
 #undef JITMETADATA
 #undef JITMETADATAINFO

From 5a3d7d1444c1db64c4adea82185f1a76b5bd471f Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Fri, 8 May 2026 23:14:29 +0200
Subject: [PATCH 04/23] Refactor impDuplicateWithProfiledArg to use
 pickProfiledValue

Removes the duplicated getLikelyValues+stress-test+threshold-check block
and the unguarded likelyValues[0] read that could observe uninitialized
data when getLikelyValues returns 0. Now goes through the shared
pickProfiledValue helper which handles the empty-result case correctly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/importercalls.cpp | 32 +++----------------------------
 1 file changed, 3 insertions(+), 29 deletions(-)

diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index 354ad3baf531b1..66ca7b509940ce 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -1704,37 +1704,11 @@ GenTree* Compiler::impDuplicateWithProfiledArg(GenTreeCall* call, IL_OFFSET ilOf
 return call;
 }

- const unsigned MaxLikelyValues = 8;
- LikelyValueRecord likelyValues[MaxLikelyValues];
- UINT32 valuesCount =
- getLikelyValues(likelyValues, MaxLikelyValues, fgPgoSchema, fgPgoSchemaCount, fgPgoData, ilOffset);
-
- JITDUMP("%u likely values:\n", valuesCount)
- for (UINT32 i = 0; i < valuesCount; i++)
- {
- JITDUMP(" %u) %u - %u%%\n", i, likelyValues[i].value, likelyValues[i].likelihood)
- }
-
- // For now, we only do a single guess, but it's pretty straightforward to
- // extend it to support multiple guesses.
- LikelyValueRecord likelyValue = likelyValues[0]; -#if DEBUG - // Re-use JitRandomGuardedDevirtualization for stress-testing. - if (JitConfig.JitRandomGuardedDevirtualization() != 0) - { - CLRRandom* random = impInlineRoot()->m_inlineStrategy->GetRandom(JitConfig.JitRandomGuardedDevirtualization()); - - valuesCount = 1; - likelyValue.value = random->Next(256); - likelyValue.likelihood = 100; - } -#endif - + ssize_t profiledValue = 0; + uint32_t likelihood = 0; // TODO: Tune the likelihood threshold, for now it's 50% - if ((valuesCount > 0) && (likelyValue.likelihood >= 50)) + if (pickProfiledValue(ilOffset, &likelihood, &profiledValue) && (likelihood >= 50)) { - const ssize_t profiledValue = likelyValue.value; - unsigned argNum = 0; ssize_t minValue = 0; ssize_t maxValue = 0; From a712e10a27e345f233ccb3e3c718fe1ccf3d22c7 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Fri, 8 May 2026 23:23:13 +0200 Subject: [PATCH 05/23] Promote tiny-popular-value LCLHEAP fast paths to a local For profiled values <= DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE (32 bytes), use a zero-init must-init local for the fast path arm instead of LCLHEAP(N). On ARM64 (and to a lesser extent on x64), the contained-constant LCLHEAP codegen carries non-trivial overhead (stack probes, outgoing-arg-area adjustment, separate STORE_BLK) that dwarfs the cost of the slow variable-size loop for very small allocations. The local is zeroed at function entry by must-init regardless of which arm runs; for sizes <= 32 bytes that's a single SIMD store at most. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/importercalls.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 66ca7b509940ce..0c031e0d5df1e2 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -1855,11 +1855,26 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset) // Just nullptr. fastpath = gtNewIconNode(0, TYP_I_IMPL); } + else if (profiledValue <= DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE) + { + // For tiny popular sizes, route the fast path through a fixed-size + // zero-init local (the same trick the importer uses for constant + // stackalloc). This avoids the LCLHEAP codegen overhead (stack + // probing, outgoing-arg-area dance, etc.) that dominates for very + // small allocations. + // + // The local is zeroed at function entry by must-init even when the + // fallback path is taken; that cost is negligible at this size. + const unsigned stackallocAsLocal = lvaGrabTemp(false DEBUGARG("profiled stackallocLocal")); + lvaSetStruct(stackallocAsLocal, typGetBlkLayout((unsigned)profiledValue), false); + lvaTable[stackallocAsLocal].lvHasLdAddrOp = true; + lvaTable[stackallocAsLocal].lvIsUnsafeBuffer = true; + fastpath = gtNewLclVarAddrNode(stackallocAsLocal); + } else { - // NOTE: we don't want to convert the fastpath stackalloc to a local like we - // normally do, because it would be additional overhead for the fallback path - // (a redundant local to clear). + // For larger popular sizes, keep an actual LCLHEAP so Lowering can + // unroll the zero-init via STORE_BLK. 
fastpath = gtNewLclHeapNode(profiledValueNode, ilOffset); } From fec094d0d937b42ffde0de3ec2dcf5fc22d23dff Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Fri, 8 May 2026 23:37:49 +0200 Subject: [PATCH 06/23] Set Qmark then-branch likelihood from profiled likelihood Both impDuplicateWithProfiledArg and impProfileLclHeap had a TODO to set weights for the QMARK branches and were leaving the default 50/50 split. Use the histogram likelihood (already computed and the gating threshold for creating the QMARK in the first place) so morph's QMARK->control-flow expansion sets accurate edge probabilities for the cond/then/else blocks. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/importercalls.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 0c031e0d5df1e2..b815e8e8d39fa1 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -1766,10 +1766,10 @@ GenTree* Compiler::impDuplicateWithProfiledArg(GenTreeCall* call, IL_OFFSET ilOf GenTree* profiledValueNode = gtNewIconNode(profiledValue, argClone->TypeGet()); *argRef = profiledValueNode; - // TODO: Specify weights for the branches in the Qmark node. GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(call->TypeGet(), call, fallbackCall); GenTreeOp* cond = gtNewOperNode(GT_EQ, TYP_INT, argClone, gtCloneExpr(profiledValueNode)); GenTreeQmark* qmark = gtNewQmarkNode(call->TypeGet(), cond, colon); + qmark->SetThenNodeLikelihood(likelihood); JITDUMP("\n\nResulting tree:\n") DISPTREE(qmark) @@ -1878,10 +1878,10 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset) fastpath = gtNewLclHeapNode(profiledValueNode, ilOffset); } - // TODO: Specify weights for the branches in the Qmark node. GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(TYP_I_IMPL, fastpath, fallback); GenTreeOp* cond = gtNewOperNode(GT_EQ, TYP_INT, sizeNode, gtCloneExpr(profiledValueNode)); GenTreeQmark* qmark = gtNewQmarkNode(TYP_I_IMPL, cond, colon); + qmark->SetThenNodeLikelihood(likelihood); // QMARK has to be a root node. unsigned tmp = lvaGrabTemp(true DEBUGARG("Grabbing temp for Qmark")); From 305e88cdedc3933ec816b580820f0d3f3c6dead2 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 00:01:06 +0200 Subject: [PATCH 07/23] Skip LCLHEAP value-profile specialization for sizes <= 32 bytes The promote-to-local fast path turned out to be more complexity than it was worth: at sizes <= DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE (32) the variable-size LCLHEAP loop is already fast enough that the cmp/jne guard plus any constant-size codegen does not pay off. Just bail out in that range. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/importercalls.cpp | 43 ++++++++----------------------- 1 file changed, 11 insertions(+), 32 deletions(-) diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index b815e8e8d39fa1..11fd36a6db6bcd 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -1833,12 +1833,17 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset) } // The fast path is profitable only when the constant-size LCLHEAP zero-init can - // actually be unrolled by Lowering. Outside that window the cmp/jne guard adds - // overhead with no codegen win. + // actually be unrolled by Lowering. 
Skip very small popular values, since the
+ // cmp/jne guard plus contained-constant LCLHEAP codegen overhead (stack probe,
+ // outgoing-arg-area dance) costs more than the variable-size path saves at this
+ // size. Skip very large values for the symmetric reason: above the unroll
+ // threshold there is no constant-size codegen win to offset the guard.
+ const ssize_t minValue = (ssize_t)DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE;
 const ssize_t maxValue = (ssize_t)getUnrollThreshold(UnrollKind::Memset);
- if ((profiledValue < 0) || (profiledValue > maxValue))
+ if ((profiledValue <= minValue) || (profiledValue > maxValue))
 {
- JITDUMP("Profiled LCLHEAP size %zd is out of range [0, %zd] - skipping\n", profiledValue, maxValue);
+ JITDUMP("Profiled LCLHEAP size %zd is out of range (%zd, %zd] - skipping\n", profiledValue, minValue,
+ maxValue);
 return lclHeap;
 }
 assert(FitsIn<int>(profiledValue));
@@ -1849,34 +1854,8 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset)
 GenTree* profiledValueNode = gtNewIconNode(profiledValue, size->TypeGet());
 GenTree* fallback = gtNewLclHeapNode(clonedSizeNode, ilOffset);

- GenTree* fastpath;
- if (profiledValue == 0)
- {
- // Just nullptr.
- fastpath = gtNewIconNode(0, TYP_I_IMPL);
- }
- else if (profiledValue <= DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE)
- {
- // For tiny popular sizes, route the fast path through a fixed-size
- // zero-init local (the same trick the importer uses for constant
- // stackalloc). This avoids the LCLHEAP codegen overhead (stack
- // probing, outgoing-arg-area dance, etc.) that dominates for very
- // small allocations.
- //
- // The local is zeroed at function entry by must-init even when the
- // fallback path is taken; that cost is negligible at this size.
- const unsigned stackallocAsLocal = lvaGrabTemp(false DEBUGARG("profiled stackallocLocal"));
- lvaSetStruct(stackallocAsLocal, typGetBlkLayout((unsigned)profiledValue), false);
- lvaTable[stackallocAsLocal].lvHasLdAddrOp = true;
- lvaTable[stackallocAsLocal].lvIsUnsafeBuffer = true;
- fastpath = gtNewLclVarAddrNode(stackallocAsLocal);
- }
- else
- {
- // For larger popular sizes, keep an actual LCLHEAP so Lowering can
- // unroll the zero-init via STORE_BLK.
- fastpath = gtNewLclHeapNode(profiledValueNode, ilOffset);
- }
+ // Fast path: a constant-size LCLHEAP that Lowering will unroll into STORE_BLK.
+ GenTree* fastpath = gtNewLclHeapNode(profiledValueNode, ilOffset);

 GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(TYP_I_IMPL, fastpath, fallback);
 GenTreeOp* cond = gtNewOperNode(GT_EQ, TYP_INT, sizeNode, gtCloneExpr(profiledValueNode));

From 1fcbcb2e0a42eb49ae8af964e4259b5769d2ae7f Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Sat, 9 May 2026 00:23:02 +0200
Subject: [PATCH 08/23] clean up

---
 src/coreclr/jit/fgprofile.cpp | 2 ++
 src/coreclr/jit/gentree.h | 2 +-
 src/coreclr/jit/importercalls.cpp | 10 ++++------
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp
index f898fc216109f5..cee3dc5291196a 100644
--- a/src/coreclr/jit/fgprofile.cpp
+++ b/src/coreclr/jit/fgprofile.cpp
@@ -2355,6 +2355,8 @@ class ValueHistogramProbeInserter
 if (!lengthNode->TypeIs(TYP_I_IMPL))
 {
+ // CORINFO_HELP_VALUEPROFILE always expects nint.
+ assert(genActualType(lengthNode) == TYP_INT); lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /* isUnsigned */ false, TYP_I_IMPL); } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index f7e4787810b018..ddd8b650f02332 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -3351,7 +3351,7 @@ struct GenTreeOpWithILOffset : public GenTreeOp #if DEBUGGABLE_GENTREE GenTreeOpWithILOffset() : GenTreeOp() - , gtILOffset(0) + , gtILOffset(BAD_IL_OFFSET) { } #endif diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 11fd36a6db6bcd..5bbaa827e433c9 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -1836,14 +1836,12 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset) // actually be unrolled by Lowering. Skip very small popular values, since the // cmp/jne guard plus contained-constant LCLHEAP codegen overhead (stack probe, // outgoing-arg-area dance) costs more than the variable-size path saves at this - // size. Skip very large values for the symmetric reason: above the unroll - // threshold there is no constant-size codegen win to offset the guard. + // size. We don't need the upper bound because for constant-sized LCLHEAPs of large size we + // emit memzero call which is a lot faster than the loop we would get for variable sized LCLHEAP. const ssize_t minValue = (ssize_t)DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE; - const ssize_t maxValue = (ssize_t)getUnrollThreshold(UnrollKind::Memset); - if ((profiledValue <= minValue) || (profiledValue > maxValue)) + if (profiledValue <= minValue) { - JITDUMP("Profiled LCLHEAP size %zd is out of range (%zd, %zd] - skipping\n", profiledValue, minValue, - maxValue); + JITDUMP("Profiled LCLHEAP size %zd is smaller than %zd - skipping\n", profiledValue, minValue); return lclHeap; } assert(FitsIn(profiledValue)); From c1a08792641d450d2d607d678533a31c9881e44c Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 00:40:49 +0200 Subject: [PATCH 09/23] Address remaining Copilot reviewer feedback * fgprofile.cpp: IsValueHistogramProbeCandidate now mirrors the consumer conditions for GT_LCLHEAP (compInitMem + non-constant op), so we don't waste schema slots / instrumentation overhead on locallocs whose data impProfileLclHeap would discard. * importercalls.cpp: introduce a dedicated minProfitableSize constant so the cutoff is no longer coupled to DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE (which serves an unrelated heuristic). * importercalls.cpp: pickProfiledValue defensively initializes its output parameters on entry so future callers cannot observe garbage on the failure path. * compiler.h: drop the default ilOffset = 0 on gtNewLclHeapNode to force callers to explicitly supply the IL offset (0 is a valid offset; an accidentally-omitted argument would silently desync the PGO schema). * gentree.cpp: extend the GT_LCLHEAP comment in OperExceptions to spell out why we now model StackOverflow (required to keep if-conversion from collapsing the QMARK arms in impProfileLclHeap into a SELECT). 
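
(Illustration, not part of the change: a minimal standalone sketch of the
first bullet's "mirror the consumer gate" idea, using stand-in types rather
than the JIT's Compiler/GenTree.)

    #include <cstdio>

    struct MethodInfo { bool compInitMem; };
    struct Node { bool isLclHeap; bool sizeIsConstant; };

    // Instrumentation-side gate kept in sync with the consumer-side gate:
    // only zero-init, non-constant locallocs get a probe / schema slot.
    static bool IsCandidate(const MethodInfo& info, const Node& node)
    {
        return node.isLclHeap && info.compInitMem && !node.sizeIsConstant;
    }

    int main()
    {
        MethodInfo info{/* compInitMem */ true};
        printf("%d\n", IsCandidate(info, {true, true}));  // 0: constant size, skip
        printf("%d\n", IsCandidate(info, {true, false})); // 1: instrument
        return 0;
    }
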
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/fgprofile.cpp | 8 ++++++++ src/coreclr/jit/gentree.cpp | 5 +++++ src/coreclr/jit/importercalls.cpp | 22 +++++++++++++++++----- 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index f2dc0f116c3de2..969211ac04e87c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3220,7 +3220,7 @@ class Compiler GenTreeColon* gtNewColonNode(var_types type, GenTree* thenNode, GenTree* elseNode); GenTreeQmark* gtNewQmarkNode(var_types type, GenTree* cond, GenTreeColon* colon); - GenTreeOpWithILOffset* gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOffset = 0); + GenTreeOpWithILOffset* gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOffset); bool pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssize_t* pValue); diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index cee3dc5291196a..7ad4387b71d80b 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -1918,6 +1918,14 @@ static bool IsValueHistogramProbeCandidate(Compiler* compiler, { if (node->OperIs(GT_LCLHEAP)) { + // Mirror the gating in impProfileLclHeap: only zero-init, non-constant + // locallocs are value-profile candidates. Anything else would have its + // probe data ignored by the consumer and just waste a schema slot. + if (!compiler->info.compInitMem || node->AsOp()->gtOp1->OperIsConst()) + { + return false; + } + if (ilOffset != nullptr) { *ilOffset = node->AsOpWithILOffset()->GetILOffset(); diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 32b1b74ed42517..fb9c8bba634b98 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -8569,6 +8569,11 @@ ExceptionSetFlags GenTree::OperExceptions(Compiler* comp) case GT_LCLHEAP: // Localloc may always overflow the stack; model unconditionally. + // + // The JIT historically did not model StackOverflow on GT_LCLHEAP. We model + // it now because impProfileLclHeap places two GT_LCLHEAP nodes in QMARK + // arms; without GTF_EXCEPT, if-conversion (see ifconversion.cpp) collapses + // the QMARK into a SELECT/CMOV that executes both LCLHEAPs unconditionally. return ExceptionSetFlags::UnknownException; case GT_CKFINITE: diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 5bbaa827e433c9..fa9434b1bfef87 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -1836,12 +1836,16 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset) // actually be unrolled by Lowering. Skip very small popular values, since the // cmp/jne guard plus contained-constant LCLHEAP codegen overhead (stack probe, // outgoing-arg-area dance) costs more than the variable-size path saves at this - // size. We don't need the upper bound because for constant-sized LCLHEAPs of large size we - // emit memzero call which is a lot faster than the loop we would get for variable sized LCLHEAP. - const ssize_t minValue = (ssize_t)DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE; - if (profiledValue <= minValue) + // size. We don't need an upper bound because for large constant-sized LCLHEAPs + // we emit a memzero call which is a lot faster than the loop we would get for + // variable sized LCLHEAP. 
+ // + // The cutoff is empirical (per benchmark sweep on x64/arm64); it is unrelated to + // DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE despite the value coincidence today. + const ssize_t minProfitableSize = 32; + if (profiledValue <= minProfitableSize) { - JITDUMP("Profiled LCLHEAP size %zd is smaller than %zd - skipping\n", profiledValue, minValue); + JITDUMP("Profiled LCLHEAP size %zd is smaller than %zd - skipping\n", profiledValue, minProfitableSize); return lclHeap; } assert(FitsIn(profiledValue)); @@ -7236,6 +7240,14 @@ void Compiler::addFatPointerCandidate(GenTreeCall* call) // bool Compiler::pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssize_t* pValue) { + assert(pLikelihood != nullptr); + assert(pValue != nullptr); + + // Default the outputs to safe values so callers that ignore the return value + // (or pass uninitialized locals) can't observe garbage. + *pLikelihood = 0; + *pValue = 0; + #if DEBUG // Request 8 likely values in debug to get more information in JitDump. const unsigned MaxLikelyValues = 8; From ea55d69db96971ec2b8edc1f353a9dd42b79db51 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 00:59:17 +0200 Subject: [PATCH 10/23] Use GTF_ORDER_SIDEEFF instead of GTF_EXCEPT for GT_LCLHEAP Stack overflow from localloc is process-fatal, not a catchable C# exception, so modeling GT_LCLHEAP as GTF_EXCEPT was overstating its semantics and risked pessimizing JIT phases that special-case GTF_EXCEPT (morph spill logic, optExtractSideEffList, parent-flag propagation, etc.). GTF_ORDER_SIDEEFF is the more accurate signal: it just tells the optimizer not to reorder/fold across the node, which is exactly what we need to keep if-conversion from collapsing the QMARK arms in impProfileLclHeap into a SELECT/CMOV. * gtNewLclHeapNode: set GTF_ORDER_SIDEEFF | GTF_DONT_CSE. * OperSupportsOrderingSideEffect: add GT_LCLHEAP so OperEffects doesn't strip the flag. * OperExceptions: revert the GT_LCLHEAP entry so we are back to not reporting any throwable exception (matches the existing intent of valuenum.cpp's "It is not necessary to model the StackOverflow exception for GT_LCLHEAP" comment). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index fb9c8bba634b98..7f43b5b364fbfa 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -8567,15 +8567,6 @@ ExceptionSetFlags GenTree::OperExceptions(Compiler* comp) case GT_INDEX_ADDR: return ExceptionSetFlags::NullReferenceException | ExceptionSetFlags::IndexOutOfRangeException; - case GT_LCLHEAP: - // Localloc may always overflow the stack; model unconditionally. - // - // The JIT historically did not model StackOverflow on GT_LCLHEAP. We model - // it now because impProfileLclHeap places two GT_LCLHEAP nodes in QMARK - // arms; without GTF_EXCEPT, if-conversion (see ifconversion.cpp) collapses - // the QMARK into a SELECT/CMOV that executes both LCLHEAPs unconditionally. 
- return ExceptionSetFlags::UnknownException;
-
 case GT_CKFINITE:
 return ExceptionSetFlags::ArithmeticException;

@@ -8787,6 +8778,7 @@ bool GenTree::OperSupportsOrderingSideEffect() const
 case GT_PATCHPOINT_FORCED:
 case GT_NONLOCAL_JMP:
 case GT_SWIFT_ERROR:
+ case GT_LCLHEAP:
 return true;
 default:
 return false;
@@ -9047,8 +9039,11 @@ GenTreeOpWithILOffset* Compiler::gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOff
 assert(size != nullptr);
 GenTreeOpWithILOffset* node =
 new (this, GT_LCLHEAP) GenTreeOpWithILOffset(GT_LCLHEAP, TYP_I_IMPL, size, nullptr, ilOffset);
- // May throw a stack overflow exception. Obviously, we don't want locallocs to be CSE'd.
- node->gtFlags |= (GTF_EXCEPT | GTF_DONT_CSE);
+ // GTF_ORDER_SIDEEFF prevents reordering / if-conversion across LCLHEAP without
+ // making OperMayThrow true (stack overflow on localloc is process-fatal, not a
+ // catchable C# exception, so we don't want the broader GTF_EXCEPT pessimizations).
+ // GTF_DONT_CSE keeps CSE from sharing locallocs.
+ node->gtFlags |= (GTF_ORDER_SIDEEFF | GTF_DONT_CSE);
 return node;
 }

From a57f9bd8b55c735db2276c9538dbe885d365b588 Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Sat, 9 May 2026 01:13:23 +0200
Subject: [PATCH 11/23] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/importercalls.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index fa9434b1bfef87..c152d84136d604 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -1848,7 +1848,12 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset)
 JITDUMP("Profiled LCLHEAP size %zd is smaller than %zd - skipping\n", profiledValue, minProfitableSize);
 return lclHeap;
 }
- assert(FitsIn<int>(profiledValue));
+
+ if (!FitsIn<int>(profiledValue))
+ {
+ JITDUMP("Profiled LCLHEAP size %zd does not fit in int - skipping\n", profiledValue);
+ return lclHeap;
+ }

 GenTree* sizeNode = size;
 GenTree* clonedSizeNode =

From c7bf9787928521d4dfc8cee7cea1767011e27c39 Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Sat, 9 May 2026 01:27:01 +0200
Subject: [PATCH 12/23] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/coreclr/jit/importercalls.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index c152d84136d604..d80a3cd90cbfcb 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -1836,9 +1836,10 @@ GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset)
 // actually be unrolled by Lowering. Skip very small popular values, since the
 // cmp/jne guard plus contained-constant LCLHEAP codegen overhead (stack probe,
 // outgoing-arg-area dance) costs more than the variable-size path saves at this
- // size. We don't need an upper bound because for large constant-sized LCLHEAPs
- // we emit a memzero call which is a lot faster than the loop we would get for
- // variable sized LCLHEAP.
+ // size. We don't need a separate profitability upper bound here: for large
+ // constant-sized LCLHEAPs we emit a memzero call which is a lot faster than the
+ // loop we would get for variable sized LCLHEAP. Note that the profiled value
+ // must still satisfy the representability constraint checked below.
// // The cutoff is empirical (per benchmark sweep on x64/arm64); it is unrelated to // DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE despite the value coincidence today. From c7bf9787928521d4dfc8cee7cea1767011e27c39 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 01:39:02 +0200 Subject: [PATCH 13/23] Use unsigned widening for value-profile length cast The non-nint length / size operands fed into CORINFO_HELP_VALUEPROFILE32/64 (Memmove len, SequenceEqual len, LCLHEAP size) are non-negative. Widening them with a signed cast would sign-extend any int value with the high bit set into a huge negative ssize_t and skew the recorded histogram (and the subsequent getLikelyValues consumption). Switch to an unsigned widening. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/fgprofile.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index 7ad4387b71d80b..a27796064ee27d 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -2363,9 +2363,12 @@ class ValueHistogramProbeInserter if (!lengthNode->TypeIs(TYP_I_IMPL)) { - // CORINFO_HELP_VALUEPROFILE always expects nint. + // CORINFO_HELP_VALUEPROFILE always expects nint. Use an unsigned widening + // because the operands we profile here (Memmove/SequenceEqual length and + // LCLHEAP size) are non-negative; sign-extending values with the high bit + // set would produce huge negative ssize_t and skew the histogram. assert(genActualType(lengthNode) == TYP_INT); - lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /* isUnsigned */ false, TYP_I_IMPL); + lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /* isUnsigned */ true, TYP_I_IMPL); } GenTreeCall* helperCallNode = compiler->gtNewHelperCallNode(helper, TYP_VOID, lengthNode, histNode); From 5bad7eff97c094696fa342b1515bc5aeec8e4b37 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 11:54:25 +0200 Subject: [PATCH 14/23] Block GT_LCLHEAP if-conversion via cost rather than GTF_ORDER_SIDEEFF Marking GT_LCLHEAP with GTF_ORDER_SIDEEFF (and adding it to OperSupportsOrderingSideEffect) was overstating its semantics: the only phase that actually misbehaves on two LCLHEAPs in QMARK arms today is if-conversion, and that pass already has a CostEx > 7 cutoff designed exactly for "this would be too expensive to evaluate unconditionally". Drop the GTF_ORDER_SIDEEFF marker and the OperSupportsOrderingSideEffect entry; instead give GT_LCLHEAP a realistic cost in gtSetEvalOrder: * For constant size with compInitMem we use 8 + alignedSize/REGSIZE (Lowering will unroll the zero-init via STORE_BLK). * For non-constant size we use a fixed high cost representing the runtime zero-init loop. Both branches stay above the cutoff, so if-conversion declines on the two-LCLHEAP QMARK shape impProfileLclHeap produces. 
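
(Illustration, not part of the change: a standalone sketch of the cost
arithmetic. 16 and 8 are the x64 STACK_ALIGN/REGSIZE_BYTES, the sample size
is made up, and real dump costs also fold in the size operand's own cost, so
they come out a little higher than these raw numbers.)

    #include <cstdio>

    int main()
    {
        const long long STACK_ALIGN = 16, REGSIZE_BYTES = 8; // x64 values
        long long size = 64; // hypothetical popular stackalloc size
        long long alignedSize = (size + (STACK_ALIGN - 1)) & ~(STACK_ALIGN - 1);
        int costExConst = 8 + (int)(alignedSize / REGSIZE_BYTES); // = 16, above the >7 cutoff
        int costExVar   = 36;                                     // fixed bound for the zero-init loop
        printf("costEx: const-size=%d, variable-size=%d\n", costExConst, costExVar);
        return 0;
    }
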
Verified locally: Skipping if-conversion that will evaluate RHS unconditionally at costs 23,37 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 7f43b5b364fbfa..5d80c3df26b5e2 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -6921,6 +6921,32 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) costSz = 2 * 2; break; + case GT_LCLHEAP: + { + // GT_LCLHEAP is more expensive than a typical unary op (it adjusts + // SP, may probe the stack, and -- under compInitMem -- zeros the + // allocated range). Give it a high cost so phases like if-conversion + // (which has a CostEx > 7 cutoff) won't speculatively evaluate it + // unconditionally. + // + // For constant sizes the zero-init is unrolled by Lowering, so the + // cost scales with the size; for non-constant sizes we use a fixed + // upper bound representing the runtime loop. + if (op1->IsCnsIntOrI() && info.compInitMem) + { + const ssize_t size = op1->AsIntCon()->IconValue(); + const ssize_t alignedSize = (size + (STACK_ALIGN - 1)) & ~(ssize_t)(STACK_ALIGN - 1); + costEx = 8 + (int)(alignedSize / REGSIZE_BYTES); // > 7 to block if-conversion + costSz = 4 + (int)(alignedSize / REGSIZE_BYTES); + } + else + { + costEx = 36; + costSz = 8; + } + break; + } + case GT_ARR_ADDR: costEx = 0; costSz = 0; @@ -8778,7 +8804,6 @@ bool GenTree::OperSupportsOrderingSideEffect() const case GT_PATCHPOINT_FORCED: case GT_NONLOCAL_JMP: case GT_SWIFT_ERROR: - case GT_LCLHEAP: return true; default: return false; @@ -9039,11 +9064,8 @@ GenTreeOpWithILOffset* Compiler::gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOff assert(size != nullptr); GenTreeOpWithILOffset* node = new (this, GT_LCLHEAP) GenTreeOpWithILOffset(GT_LCLHEAP, TYP_I_IMPL, size, nullptr, ilOffset); - // GTF_ORDER_SIDEEFF prevents reordering / if-conversion across LCLHEAP without - // making OperMayThrow true (stack overflow on localloc is process-fatal, not a - // catchable C# exception, so we don't want the broader GTF_EXCEPT pessimizations). - // GTF_DONT_CSE keeps CSE from sharing locallocs. - node->gtFlags |= (GTF_ORDER_SIDEEFF | GTF_DONT_CSE); + // Don't allow CSE to share locallocs. + node->gtFlags |= GTF_DONT_CSE; return node; } From d2fbd77a20da95923d8507744c1e6622c3e64d3d Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 11:59:01 +0200 Subject: [PATCH 15/23] [DO NOT MERGE] Disable global SkipLocalsInit default to stress CI Temporary change to flip the implicit module-level [SkipLocalsInit] off for all NETCoreApp libraries, so CI exercises compInitMem=true code paths end-to-end against this PR's GT_LCLHEAP changes (cost model, value-profile specialization, QMARK arms). Must be reverted before merge. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/libraries/Directory.Build.targets | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/libraries/Directory.Build.targets b/src/libraries/Directory.Build.targets index b6fee26af35e3d..07060c51f8ecdc 100644 --- a/src/libraries/Directory.Build.targets +++ b/src/libraries/Directory.Build.targets @@ -245,7 +245,9 @@ - true + + false From 1e17a51fc9f266396f940edf8e45a8bf343aaacc Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 12:27:09 +0200 Subject: [PATCH 16/23] [DO NOT MERGE] Fix CI: keep AllowUnsafeBlocks side-effect when stressing compInitMem The previous attempt to flip the SkipLocalsInit default to false broke CI because the same block also implicitly sets AllowUnsafeBlocks=true (which many libraries silently rely on). Without it, every project that uses unsafe code without explicitly setting the flag fails with CS0227. Restructure: keep SkipLocalsInit=true (so AllowUnsafeBlocks stays set), but skip including the SkipLocalsInit.cs module-attribute file. That way [module: SkipLocalsInit] is not emitted -> compInitMem becomes true, without breaking any unsafe-code-using libraries. Also apply the JIT format patch (clang-format alignment in gentree.cpp's new GT_LCLHEAP cost case). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/gentree.cpp | 6 +++--- src/libraries/Directory.Build.targets | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 5d80c3df26b5e2..f85cc1ddf36216 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -6934,10 +6934,10 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) // upper bound representing the runtime loop. if (op1->IsCnsIntOrI() && info.compInitMem) { - const ssize_t size = op1->AsIntCon()->IconValue(); + const ssize_t size = op1->AsIntCon()->IconValue(); const ssize_t alignedSize = (size + (STACK_ALIGN - 1)) & ~(ssize_t)(STACK_ALIGN - 1); - costEx = 8 + (int)(alignedSize / REGSIZE_BYTES); // > 7 to block if-conversion - costSz = 4 + (int)(alignedSize / REGSIZE_BYTES); + costEx = 8 + (int)(alignedSize / REGSIZE_BYTES); // > 7 to block if-conversion + costSz = 4 + (int)(alignedSize / REGSIZE_BYTES); } else { diff --git a/src/libraries/Directory.Build.targets b/src/libraries/Directory.Build.targets index 07060c51f8ecdc..0affd212e630a8 100644 --- a/src/libraries/Directory.Build.targets +++ b/src/libraries/Directory.Build.targets @@ -245,9 +245,17 @@ - - false + true + + + + + true @@ -258,7 +266,7 @@ true - + From f4de6763be6dabdfb89f48bb5cd2074c36bb981c Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 16:02:46 +0200 Subject: [PATCH 17/23] clean up --- src/coreclr/jit/fgprofile.cpp | 39 +++++++++++------------------------ 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index a27796064ee27d..cde54cdff47577 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -1913,7 +1913,7 @@ class HandleHistogramProbeVisitor final : public GenTreeVisitorOperIs(GT_LCLHEAP)) @@ -1921,15 +1921,12 @@ static bool IsValueHistogramProbeCandidate(Compiler* compiler, // Mirror the gating in impProfileLclHeap: only zero-init, non-constant // locallocs are value-profile candidates. Anything else would have its // probe data ignored by the consumer and just waste a schema slot. 
- if (!compiler->info.compInitMem || node->AsOp()->gtOp1->OperIsConst()) + if (!compiler->info.compInitMem || node->gtGetOp1()->OperIsConst()) { return false; } - if (ilOffset != nullptr) - { - *ilOffset = node->AsOpWithILOffset()->GetILOffset(); - } + *ilOffset = node->AsOpWithILOffset()->GetILOffset(); if (operandUseRef != nullptr) { *operandUseRef = &node->AsOp()->gtOp1; @@ -1942,11 +1939,9 @@ static bool IsValueHistogramProbeCandidate(Compiler* compiler, const NamedIntrinsic ni = compiler->lookupNamedIntrinsic(node->AsCall()->gtCallMethHnd); if ((ni == NI_System_SpanHelpers_Memmove) || (ni == NI_System_SpanHelpers_SequenceEqual)) { - if (ilOffset != nullptr) - { - assert(node->AsCall()->gtHandleHistogramProfileCandidateInfo != nullptr); - *ilOffset = node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset; - } + assert(node->AsCall()->gtHandleHistogramProfileCandidateInfo != nullptr); + *ilOffset = node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset; + if (operandUseRef != nullptr) { // Memmove(dst, src, len) and SequenceEqual(left, right, len) -- profile len. @@ -2342,17 +2337,8 @@ class ValueHistogramProbeInserter *m_currentSchemaIndex += 2; - // We have either Memmove(dst, src, len)/SequenceEqual(...,len) or LCLHEAP(len) and we want to insert a - // call to CORINFO_HELP_VALUEPROFILE for the profiled operand: - // - // \--* COMMA long - // +--* CALL help void CORINFO_HELP_VALUEPROFILE - // | +--* COMMA long - // | | +--* STORE_LCL_VAR long tmp - // | | | \--* (node to poll) - // | | \--* LCL_VAR long tmp - // | \--* CNS_INT long - // \--* LCL_VAR long tmp + // Inject CORINFO_HELP_VALUEPROFILE helper call to record the value in the histogram. + // The helper call is injected as a comma node that stores the value to a temp. // const unsigned lenTmpNum = compiler->lvaGrabTemp(true DEBUGARG("length histogram profile tmp")); GenTree* storeLenToTemp = compiler->gtNewTempStore(lenTmpNum, *operandUseRef); @@ -2363,12 +2349,11 @@ class ValueHistogramProbeInserter if (!lengthNode->TypeIs(TYP_I_IMPL)) { - // CORINFO_HELP_VALUEPROFILE always expects nint. Use an unsigned widening - // because the operands we profile here (Memmove/SequenceEqual length and - // LCLHEAP size) are non-negative; sign-extending values with the high bit - // set would produce huge negative ssize_t and skew the histogram. + // CORINFO_HELP_VALUEPROFILE always expects nint. + // Zero-extend for now due to LCLHEAP being actually uint32_t-typed, but in the future we may want to + // support with sign-extension as well. 
assert(genActualType(lengthNode) == TYP_INT); - lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /* isUnsigned */ true, TYP_I_IMPL); + lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /*fromUnsigned*/ true, TYP_I_IMPL); } GenTreeCall* helperCallNode = compiler->gtNewHelperCallNode(helper, TYP_VOID, lengthNode, histNode); From c1f7f8613504cbadead2a6c31abf037c170328d6 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 17:38:24 +0200 Subject: [PATCH 18/23] Generalize value-profile guarded specialization into one helper Collapse the previous per-shape impDuplicateWithProfiledArg / impProfileLclHeap into a single Compiler::impProfileValueGuardedTree that callers parameterize declaratively: op1 = impProfileValueGuardedTree(node, &operandRef, ilOffset, minProfitable, maxProfitable, &successMetric DEBUGARG(tmpName)); The helper handles both the Instrumented Tier0 marking (sets BBF_HAS_VALUE_PROFILE; for calls also attaches gtHandleHistogramProfileCandidateInfo) and the Tier1+PGO specialization: * pickProfiledValue + likelihood/range filtering. * For calls, pre-spills the non-profiled args so their side effects stay in evaluation order. * Spills *operandRef into a temp. * slowTree = gtCloneExpr(node) (clone references the spilled temp). * Mutates *operandRef to a constant -> node itself becomes fastTree. * Builds (operand == popular) ? fastTree : slowTree, sets the QMARK likelihood from the histogram, bumps the supplied metric, and wraps value-typed results in a STORE_LCL_VAR/LCL_VAR pair. No oper switch, no lambdas, no IsValueHistogramProbeCandidate dependency on the importer side. Per-shape concerns (which operand is profiled, the profitable range, which metric to bump) live with the caller. Adding a new probe site is now a single declarative call. IsValueHistogramProbeCandidate stays in fgprofile.cpp as the per-oper registry needed by the value-histogram instrumentor visitor (which walks all nodes in BBF_HAS_VALUE_PROFILE blocks). Verified: * SPMI benchmarks.run.: All replays clean. * LCLHEAP repro: Tier1 still emits cmp/je guard + unrolled vmovdqu32 fast path + push-loop slow path (identical codegen to pre-refactor). * Memmove repro under JitRandomGuardedDevirtualization=1 stress: builds QMARK and bumps ValueProfiledMemmove. Also picks up an incidental fix for the IsValueHistogramProbeCandidate default-arg/null-guard regression introduced by the prior 'clean up' commit: the visitor at fgprofile.cpp:1969 calls with two args, but 'clean up' had removed both the default value and the null guard, breaking the build at HEAD. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/compiler.h | 20 +- src/coreclr/jit/fgprofile.cpp | 45 ++-- src/coreclr/jit/gentree.cpp | 4 +- src/coreclr/jit/importer.cpp | 13 +- src/coreclr/jit/importercalls.cpp | 346 ++++++++++++------------------ 5 files changed, 179 insertions(+), 249 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 969211ac04e87c..2b056cfa0f3971 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3224,6 +3224,17 @@ class Compiler bool pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssize_t* pValue); + //------------------------------------------------------------------------ + // IsValueHistogramProbeCandidate: Determine if a node is a value-histogram + // probe candidate, and report the IL offset and operand to profile. 
+ // + // Centralized so the importer (impProfile*), the value-instrumentor visitor, + // schema-builder and probe-inserter all agree on which trees are value-probe + // candidates and where their profiled operand lives. To extend value + // profiling to a new node kind, add a case here. + // + bool IsValueHistogramProbeCandidate(GenTree* node, IL_OFFSET* ilOffset = nullptr, GenTree*** operandUseRef = nullptr); + GenTree* gtNewLargeOperNode(genTreeOps oper, var_types type = TYP_I_IMPL, GenTree* op1 = nullptr, @@ -5091,9 +5102,12 @@ class Compiler GenTree* impFixupStructReturnType(GenTree* op); - GenTree* impDuplicateWithProfiledArg(GenTreeCall* call, IL_OFFSET ilOffset); - - GenTree* impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset); + GenTree* impProfileValueGuardedTree(GenTree* node, + GenTree** operandRef, + IL_OFFSET ilOffset, + ssize_t minProfitable, + ssize_t maxProfitable, + int* successMetric DEBUGARG(const char* tmpName)); GenTree* impThrowIfNull(GenTreeCall* call); diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index cde54cdff47577..fc62a51b20b848 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -1894,39 +1894,24 @@ class HandleHistogramProbeVisitor final : public GenTreeVisitorOperIs(GT_LCLHEAP)) { // Mirror the gating in impProfileLclHeap: only zero-init, non-constant // locallocs are value-profile candidates. Anything else would have its // probe data ignored by the consumer and just waste a schema slot. - if (!compiler->info.compInitMem || node->gtGetOp1()->OperIsConst()) + if (!info.compInitMem || node->gtGetOp1()->OperIsConst()) { return false; } - *ilOffset = node->AsOpWithILOffset()->GetILOffset(); + if (ilOffset != nullptr) + { + *ilOffset = node->AsOpWithILOffset()->GetILOffset(); + } if (operandUseRef != nullptr) { *operandUseRef = &node->AsOp()->gtOp1; @@ -1936,12 +1921,14 @@ static bool IsValueHistogramProbeCandidate(Compiler* compiler, if (node->IsCall() && node->AsCall()->IsSpecialIntrinsic()) { - const NamedIntrinsic ni = compiler->lookupNamedIntrinsic(node->AsCall()->gtCallMethHnd); + const NamedIntrinsic ni = lookupNamedIntrinsic(node->AsCall()->gtCallMethHnd); if ((ni == NI_System_SpanHelpers_Memmove) || (ni == NI_System_SpanHelpers_SequenceEqual)) { - assert(node->AsCall()->gtHandleHistogramProfileCandidateInfo != nullptr); - *ilOffset = node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset; - + if (ilOffset != nullptr) + { + assert(node->AsCall()->gtHandleHistogramProfileCandidateInfo != nullptr); + *ilOffset = node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset; + } if (operandUseRef != nullptr) { // Memmove(dst, src, len) and SequenceEqual(left, right, len) -- profile len. 
@@ -1979,7 +1966,7 @@ class ValueHistogramProbeVisitor final : public GenTreeVisitorIsValueHistogramProbeCandidate(node)) { m_functor(m_compiler, node); } @@ -2069,7 +2056,7 @@ class BuildValueHistogramProbeSchemaGen void operator()(Compiler* compiler, GenTree* tree) { IL_OFFSET ilOffset = 0; - bool isCandidate = IsValueHistogramProbeCandidate(compiler, tree, &ilOffset); + bool isCandidate = compiler->IsValueHistogramProbeCandidate(tree, &ilOffset); assert(isCandidate); ICorJitInfo::PgoInstrumentationSchema schemaElem = {}; @@ -2313,7 +2300,7 @@ class ValueHistogramProbeInserter IL_OFFSET candidateIlOffset = 0; GenTree** operandUseRef = nullptr; - bool isCandidate = IsValueHistogramProbeCandidate(compiler, node, &candidateIlOffset, &operandUseRef); + bool isCandidate = compiler->IsValueHistogramProbeCandidate(node, &candidateIlOffset, &operandUseRef); assert(isCandidate); const ICorJitInfo::PgoInstrumentationSchema& countEntry = m_schema[*m_currentSchemaIndex]; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index f85cc1ddf36216..4afce91ec3c39a 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -6936,8 +6936,8 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) { const ssize_t size = op1->AsIntCon()->IconValue(); const ssize_t alignedSize = (size + (STACK_ALIGN - 1)) & ~(ssize_t)(STACK_ALIGN - 1); - costEx = 8 + (int)(alignedSize / REGSIZE_BYTES); // > 7 to block if-conversion - costSz = 4 + (int)(alignedSize / REGSIZE_BYTES); + costEx = 8 + (int)(alignedSize / REGSIZE_BYTES); // > 7 to block if-conversion + costSz = 4 + (int)(alignedSize / REGSIZE_BYTES); } else { diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index 254f753d835d2d..bb6738de22b226 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -10136,9 +10136,16 @@ void Compiler::impImportBlockCode(BasicBlock* block) op1 = gtNewLclHeapNode(op2, opcodeOffs); - // PGO value-profiling for non-constant zero-init stackalloc: - // optimize / instrument based on the most likely size. - op1 = impProfileLclHeap(op1, opcodeOffs); + // PGO value-profiling: optimize / instrument based on the most likely size. + // Only profitable for zero-init (compInitMem) and when the popular size is + // bigger than the variable-size loop's break-even (~32 bytes per benchmark). + if (info.compInitMem && JitConfig.JitProfileValues() && !op2->IsIntegralConst()) + { + op1 = impProfileValueGuardedTree(op1, &op1->AsOp()->gtOp1, opcodeOffs, + /* minProfitable */ 32, /* maxProfitable */ INT_MAX, + &Metrics.ValueProfiledLclHeap DEBUGARG( + "Profiled LCLHEAP Qmark")); + } // Request stack security for this method. setNeedsGSSecurityCookie(); diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index d80a3cd90cbfcb..5a5932e15f7de4 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -1373,20 +1373,11 @@ var_types Compiler::impImportCall(OPCODE opcode, else if (JitConfig.JitProfileValues() && call->IsCall() && call->AsCall()->IsSpecialIntrinsic(this, NI_System_SpanHelpers_Memmove)) { - if (opts.IsOptimizedWithProfile()) - { - call = impDuplicateWithProfiledArg(call->AsCall(), rawILOffset); - } - else if (opts.IsInstrumented()) - { - // We might want to instrument it for optimized versions too, but we don't currently. 
- HandleHistogramProfileCandidateInfo* pInfo = - new (this, CMK_Inlining) HandleHistogramProfileCandidateInfo; - pInfo->ilOffset = rawILOffset; - pInfo->probeIndex = 0; - call->AsCall()->gtHandleHistogramProfileCandidateInfo = pInfo; - compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE); - } + call = impProfileValueGuardedTree(call, &call->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(), + rawILOffset, + /* minProfitable */ 1, + /* maxProfitable */ (ssize_t)getUnrollThreshold(ProfiledMemmove), + &Metrics.ValueProfiledMemmove DEBUGARG("Profiled Memmove Qmark")); impAppendTree(call, CHECK_SPILL_ALL, impCurStmtDI); } else @@ -1529,27 +1520,13 @@ var_types Compiler::impImportCall(OPCODE opcode, if (JitConfig.JitProfileValues() && call->IsCall() && call->AsCall()->IsSpecialIntrinsic(this, NI_System_SpanHelpers_SequenceEqual)) { - if (opts.IsOptimizedWithProfile()) - { - call = impDuplicateWithProfiledArg(call->AsCall(), rawILOffset); - if (call->OperIs(GT_QMARK)) - { - // QMARK has to be a root node - unsigned tmp = lvaGrabTemp(true DEBUGARG("Grabbing temp for Qmark")); - impStoreToTemp(tmp, call, CHECK_SPILL_ALL); - call = gtNewLclvNode(tmp, call->TypeGet()); - } - } - else if (opts.IsInstrumented()) - { - // We might want to instrument it for optimized versions too, but we don't currently. - HandleHistogramProfileCandidateInfo* pInfo = - new (this, CMK_Inlining) HandleHistogramProfileCandidateInfo; - pInfo->ilOffset = rawILOffset; - pInfo->probeIndex = 0; - call->AsCall()->gtHandleHistogramProfileCandidateInfo = pInfo; - compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE); - } + call = + impProfileValueGuardedTree(call, &call->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(), + rawILOffset, + /* minProfitable */ 1, + /* maxProfitable */ (ssize_t)getUnrollThreshold(ProfiledMemcmp), + &Metrics.ValueProfiledSequenceEqual DEBUGARG( + "Profiled SeqEqual Qmark")); } } @@ -1675,217 +1652,162 @@ GenTree* Compiler::impThrowIfNull(GenTreeCall* call) } //------------------------------------------------------------------------ -// impDuplicateWithProfiledArg: duplicates a call with a profiled argument, e.g.: -// Given `Buffer.Memmove(dst, src, len)` call, -// optimize it to: +// impProfileValueGuardedTree: PGO value-profile entry point. +// +// In Instrumented Tier0 (or OSR / Tier1Instrumented): mark `node` as a +// value-probe candidate so the value-histogram instrumentor inserts a +// CORINFO_HELP_VALUEPROFILE* call around the operand at `*operandRef`. +// +// In Tier1 with PGO: look up the popular value of the operand via +// pickProfiledValue and, if it's within [minProfitable, maxProfitable], +// replace `node` with a guard of the form: // -// if (len == popularSize) -// Buffer.Memmove(dst, src, popularSize); // can be unrolled now -// else -// Buffer.Memmove(dst, src, len); // fallback +// (operand == popularValue) ? +// : // -// if we can obtain the popular size from PGO data. +// The fast arm is `node` itself with `*operandRef` overwritten by a +// constant; the slow arm is a clone of `node` (which references the +// spilled operand temp). // // Arguments: -// call -- call to optimize with profiled argument -// ilOffset -- Raw IL offset of the call +// node - the tree containing the operand to profile. +// Either TYP_VOID (e.g. an unused-result call) or value-typed. +// operandRef - GenTree** to the operand-use within `node`. Mutated in place +// (replaced with the spilled-temp ref, then with the constant +// on the fast arm). +// ilOffset - IL offset of the originating opcode. 
+// minProfitable - inclusive lower bound on profitable popular values +// (caller-supplied, per shape). +// maxProfitable - inclusive upper bound on profitable popular values. +// successMetric - optional pointer to a JitMetrics counter; bumped on +// successful specialization. May be nullptr. +// tmpName - DEBUG-only name for the QMARK-result temp (only used when +// `node` is value-typed). // // Return Value: -// Optimized tree (or the original call tree if we can't optimize it). -// -GenTree* Compiler::impDuplicateWithProfiledArg(GenTreeCall* call, IL_OFFSET ilOffset) +// The original `node` if no specialization fires (or if we only marked it +// for instrumentation). Otherwise, a fresh tree (a QMARK for TYP_VOID, +// or an LCL_VAR fed by a STORE_LCL_VAR(QMARK) statement otherwise). +// +GenTree* Compiler::impProfileValueGuardedTree(GenTree* node, + GenTree** operandRef, + IL_OFFSET ilOffset, + ssize_t minProfitable, + ssize_t maxProfitable, + int* successMetric DEBUGARG(const char* tmpName)) { - assert(call->IsSpecialIntrinsic()); - assert(opts.IsOptimizedWithProfile()); - - if (call->IsInlineCandidate()) - { - // We decided to inline the whole thing? We won't be able to clone it then. - return call; - } + assert(node != nullptr); + assert(operandRef != nullptr); + assert(*operandRef != nullptr); - ssize_t profiledValue = 0; - uint32_t likelihood = 0; - // TODO: Tune the likelihood threshold, for now it's 50% - if (pickProfiledValue(ilOffset, &likelihood, &profiledValue) && (likelihood >= 50)) + // Instrumented Tier0 / OSR / Tier1Instrumented: mark this node as a value-profile + // candidate so the value-histogram instrumentor can later insert a probe. + if (opts.IsInstrumented() && !compIsForInlining()) { - unsigned argNum = 0; - ssize_t minValue = 0; - ssize_t maxValue = 0; - if (call->IsSpecialIntrinsic(this, NI_System_SpanHelpers_Memmove)) + if (node->IsCall()) { - // dst(0), src(1), len(2) - argNum = 2; - - minValue = 1; // TODO: enable for 0 as well. - maxValue = (ssize_t)getUnrollThreshold(ProfiledMemmove); + // Calls carry the IL offset via gtHandleHistogramProfileCandidateInfo. + // Other node kinds (e.g. GT_LCLHEAP) carry it on the node itself + // (e.g. via GenTreeOpWithILOffset); the caller is responsible for that. + HandleHistogramProfileCandidateInfo* pInfo = new (this, CMK_Inlining) HandleHistogramProfileCandidateInfo; + pInfo->ilOffset = ilOffset; + pInfo->probeIndex = 0; + node->AsCall()->gtHandleHistogramProfileCandidateInfo = pInfo; } - else if (call->IsSpecialIntrinsic(this, NI_System_SpanHelpers_SequenceEqual)) - { - // dst(0), src(1), len(2) - argNum = 2; - - minValue = 1; // TODO: enable for 0 as well. - maxValue = (ssize_t)getUnrollThreshold(ProfiledMemcmp); - } - else - { - // only Memmove is expected at the moment. 
- // Possible future extensions: Memset, Memcpy - unreached(); - } - - if ((profiledValue >= minValue) && (profiledValue <= maxValue)) - { - JITDUMP("Duplicating for popular value = %u\n", profiledValue) - DISPTREE(call) - - if (call->gtArgs.GetUserArgByIndex(argNum)->GetNode()->OperIsConst()) - { - JITDUMP("Profiled arg is already a constant - bail out.\n") - return call; - } - - // Spill all the arguments to temp locals to preserve the execution order - GenTree** argRef = nullptr; - GenTree* argClone = nullptr; - for (unsigned i = 0; i < call->gtArgs.CountUserArgs(); i++) - { - GenTree** node = &call->gtArgs.GetUserArgByIndex(i)->EarlyNodeRef(); - GenTree* cloned = impCloneExpr(*node, node, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling arg")); - - // Record the reference to the argument we're going to replace. - if (i == argNum) - { - argRef = node; - argClone = cloned; - } - } - - GenTree* fallbackCall = gtCloneExpr(call); - GenTree* profiledValueNode = gtNewIconNode(profiledValue, argClone->TypeGet()); - *argRef = profiledValueNode; - - GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(call->TypeGet(), call, fallbackCall); - GenTreeOp* cond = gtNewOperNode(GT_EQ, TYP_INT, argClone, gtCloneExpr(profiledValueNode)); - GenTreeQmark* qmark = gtNewQmarkNode(call->TypeGet(), cond, colon); - qmark->SetThenNodeLikelihood(likelihood); + compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE); + JITDUMP("\n ... marking [%06u] in " FMT_BB " for value profile instrumentation\n", dspTreeID(node), + compCurBB->bbNum); + return node; + } - JITDUMP("\n\nResulting tree:\n") - DISPTREE(qmark) + // Tier1 + PGO: try to specialize. + if (!opts.IsOptimizedWithProfile()) + { + return node; + } - if (call->IsSpecialIntrinsic(this, NI_System_SpanHelpers_Memmove)) - { - Metrics.ValueProfiledMemmove++; - } - else - { - assert(call->IsSpecialIntrinsic(this, NI_System_SpanHelpers_SequenceEqual)); - Metrics.ValueProfiledSequenceEqual++; - } + // Don't specialize calls about to be inlined: cloning then is unsafe. + if (node->IsCall() && node->AsCall()->IsInlineCandidate()) + { + return node; + } - return qmark; - } + if ((*operandRef)->OperIsConst()) + { + return node; } - return call; -} -//------------------------------------------------------------------------ -// impProfileLclHeap: Apply PGO value-profiling to a non-constant GT_LCLHEAP. -// -// Arguments: -// lclHeap - the GT_LCLHEAP node to profile/specialize -// ilOffset - IL offset of the LOCALLOC opcode -// -// Return Value: -// Either the original `lclHeap` node, or a fresh tree that produces the -// same value but is specialized via a profiled-size guard: -// -// size == popularSize ? LCLHEAP(popularSize) : LCLHEAP(size) -// -// Notes: -// The constant-size LCLHEAP fast path is unrolled by Lowering (efficient -// zero-init via STORE_BLK), so specializing for a popular size avoids the -// slow per-stack-alignment zeroing loop in the non-constant LCLHEAP codegen. -// -// We don't need that optimization if SkipInitLocals is set as even -// non-constant locallocs are cheap in that case; we only want to speed up -// zeroing. -// -GenTree* Compiler::impProfileLclHeap(GenTree* lclHeap, IL_OFFSET ilOffset) -{ - assert(lclHeap->OperIs(GT_LCLHEAP)); + ssize_t profiledValue = 0; + uint32_t likelihood = 0; + // TODO: Tune the likelihood threshold, for now it's 50%. 
+ if (!pickProfiledValue(ilOffset, &likelihood, &profiledValue) || (likelihood < 50)) + { + return node; + } - GenTree* size = lclHeap->AsOp()->gtOp1; - if (!info.compInitMem || size->IsIntegralConst()) + if (!FitsIn<int>(profiledValue) || (profiledValue < minProfitable) || (profiledValue > maxProfitable)) { - return lclHeap; + JITDUMP("Profiled value %zd outside [%zd, %zd] - skipping\n", profiledValue, minProfitable, maxProfitable); + return node; } - // Consuming the existing profile (optimizing). - if (opts.IsOptimizedWithProfile()) + JITDUMP("Building profile-guarded tree for popular value = %zd (%u%%)\n", profiledValue, likelihood); + DISPTREE(node); + + // For calls with multiple operands, pre-spill the non-profiled args so their + // side effects keep firing in evaluation order; otherwise the QMARK guard + // would delay them until one of the arms is taken. (The profiled operand is + // spilled below.) + if (node->IsCall()) { - ssize_t profiledValue = 0; - uint32_t likelihood = 0; - if (!pickProfiledValue(ilOffset, &likelihood, &profiledValue) || (likelihood < 50)) + GenTreeCall* const call = node->AsCall(); + for (unsigned i = 0; i < call->gtArgs.CountUserArgs(); i++) { - return lclHeap; + GenTree** otherRef = &call->gtArgs.GetUserArgByIndex(i)->EarlyNodeRef(); + if (otherRef == operandRef) + { + continue; + } + impCloneExpr(*otherRef, otherRef, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling other call arg")); } + } - // The fast path is profitable only when the constant-size LCLHEAP zero-init can - // actually be unrolled by Lowering. Skip very small popular values, since the - // cmp/jne guard plus contained-constant LCLHEAP codegen overhead (stack probe, - // outgoing-arg-area dance) costs more than the variable-size path saves at this - // size. We don't need a separate profitability upper bound here: for large - // constant-sized LCLHEAPs we emit a memzero call which is a lot faster than the - // loop we would get for variable sized LCLHEAP. Note that the profiled value - // must still satisfy the representability constraint checked below. - // - // The cutoff is empirical (per benchmark sweep on x64/arm64); it is unrelated to - // DEFAULT_MAX_LOCALLOC_TO_LOCAL_SIZE despite the value coincidence today. - const ssize_t minProfitableSize = 32; - if (profiledValue <= minProfitableSize) - { - JITDUMP("Profiled LCLHEAP size %zd is smaller than %zd - skipping\n", profiledValue, minProfitableSize); - return lclHeap; - } + // Spill the profiled operand so we can reference it both in the comparison + // and in the slow arm. + GenTree* const operandClone = + impCloneExpr(*operandRef, operandRef, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling profiled operand")); + const var_types operandType = genActualType(operandClone->TypeGet()); - if (!FitsIn<int>(profiledValue)) - { - JITDUMP("Profiled LCLHEAP size %zd does not fit in int - skipping\n", profiledValue); - return lclHeap; - } + // Slow tree: clone of `node` (which now references the spilled-temp at *operandRef). + GenTree* const slowTree = gtCloneExpr(node); - GenTree* sizeNode = size; - GenTree* clonedSizeNode = - impCloneExpr(sizeNode, &sizeNode, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling sizeNode")); - GenTree* profiledValueNode = gtNewIconNode(profiledValue, size->TypeGet()); - GenTree* fallback = gtNewLclHeapNode(clonedSizeNode, ilOffset); // Fast tree: `node` itself, with the profiled operand replaced by the constant.
+ *operandRef = gtNewIconNode(profiledValue, operandType); + GenTree* const fastTree = node; - // Fast path: a constant-size LCLHEAP that Lowering will unroll into STORE_BLK. - GenTree* fastpath = gtNewLclHeapNode(profiledValueNode, ilOffset); + const var_types resultType = node->TypeGet(); + GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(resultType, fastTree, slowTree); + GenTreeOp* cond = gtNewOperNode(GT_EQ, TYP_INT, operandClone, gtNewIconNode(profiledValue, operandType)); + GenTreeQmark* qmark = gtNewQmarkNode(resultType, cond, colon); + qmark->SetThenNodeLikelihood(likelihood); - GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(TYP_I_IMPL, fastpath, fallback); - GenTreeOp* cond = gtNewOperNode(GT_EQ, TYP_INT, sizeNode, gtCloneExpr(profiledValueNode)); - GenTreeQmark* qmark = gtNewQmarkNode(TYP_I_IMPL, cond, colon); - qmark->SetThenNodeLikelihood(likelihood); + JITDUMP("\nResulting tree:\n"); + DISPTREE(qmark); - // QMARK has to be a root node. - unsigned tmp = lvaGrabTemp(true DEBUGARG("Grabbing temp for Qmark")); - impStoreToTemp(tmp, qmark, CHECK_SPILL_ALL); - Metrics.ValueProfiledLclHeap++; - return gtNewLclvNode(tmp, qmark->TypeGet()); + if (successMetric != nullptr) + { + (*successMetric)++; } - // Instrumenting LCLHEAP for value profile. - if (opts.IsInstrumented() && !compIsForInlining()) + if (resultType == TYP_VOID) { - JITDUMP("\n ... marking [%06u] in " FMT_BB " for value profile instrumentation\n", dspTreeID(lclHeap), - compCurBB->bbNum); - compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE); + return qmark; } - return lclHeap; + // QMARKs producing a value cannot stand on their own; spill into a temp. + const unsigned tmp = lvaGrabTemp(true DEBUGARG(tmpName)); + impStoreToTemp(tmp, qmark, CHECK_SPILL_ALL); + return gtNewLclvNode(tmp, resultType); } #ifdef DEBUG From c7f908e3b12f049c6a2e48aa443f6cf29b3afc28 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 17:49:04 +0200 Subject: [PATCH 19/23] Spill all side-effecting siblings via UseEdges() iterator Previously the sibling-spill loop in impProfileValueGuardedTree only walked GetUserArgByIndex(i) over CountUserArgs(), which misses: * gtControlExpr (indirect / fat / expanded-virtual call targets) * Non-user CallArg slots (this pointer, ret buffer, well-known args like PInvoke cookie / stack-array-local / etc.) Replace with a generic GenTreeUseEdgeIterator walk. Documented as "each use edge of a GenTree node in the order in which they are used" (GTF_REVERSE_OPS-aware), so eval order is preserved. As a bonus the helper now works uniformly across node kinds (calls, unary nodes, HWINTRINSICs, ...) without an IsCall() special case. Also skip side-effect-free siblings (e.g. CNS_INT, fresh LCL_VARs): no reordering hazard, no need to allocate a temp. Verified: * SPMI benchmarks.run.: All replays clean. * LCLHEAP repro: ValueProfiledLclHeap = 1, Tier1 codegen unchanged. * Memmove repro under JitRandomGuardedDevirtualization=1 stress: builds QMARK and bumps ValueProfiledMemmove. 
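In source-level terms, the reordering this prevents looks roughly like the following (an analogy of the transform, not JIT IR; produce() and memmoveLike() are made-up stand-ins for a side-effecting sibling operand and the guarded call):

    #include <cstdio>

    static int  produce() { std::printf("side effect\n"); return 7; }
    static void memmoveLike(int dst, int src, int len) { (void)dst; (void)src; (void)len; }

    int main()
    {
        int dst = 0, len = 64;
        // Guarding naively would leave produce() inside both QMARK arms, so its
        // side effect fires only after the len comparison -- out of the original
        // argument-evaluation order. Pre-spilling pins it before the guard:
        const int tSrc = produce(); // fires here, exactly as in the unguarded call
        const int tLen = len;
        (tLen == 64) ? memmoveLike(dst, tSrc, 64) : memmoveLike(dst, tSrc, tLen);
        return 0;
    }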
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/importercalls.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 5a5932e15f7de4..d91e8254caa4db 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -1754,22 +1754,23 @@ GenTree* Compiler::impProfileValueGuardedTree(GenTree* node, JITDUMP("Building profile-guarded tree for popular value = %zd (%u%%)\n", profiledValue, likelihood); DISPTREE(node); - // For calls with multiple operands, pre-spill the non-profiled args so their - // side effects keep firing in evaluation order; otherwise the QMARK guard - // would delay them until one of the arms is taken. (The profiled operand is - // spilled below.) - if (node->IsCall()) + // Pre-spill any side-effecting sibling operands so their effects fire before + // the QMARK guard. This is generic across node kinds (calls, unary nodes, + // HWINTRINSICs, ...): GenTreeUseEdgeIterator walks all use-edges in evaluation + // order, including gtControlExpr and every CallArg slot for calls. + // (The profiled operand is spilled separately below.) + for (GenTree** useEdge : node->UseEdges()) { - GenTreeCall* const call = node->AsCall(); - for (unsigned i = 0; i < call->gtArgs.CountUserArgs(); i++) + if (useEdge == operandRef) { - GenTree** otherRef = &call->gtArgs.GetUserArgByIndex(i)->EarlyNodeRef(); - if (otherRef == operandRef) - { - continue; - } - impCloneExpr(*otherRef, otherRef, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling other call arg")); + continue; + } + if (((*useEdge)->gtFlags & GTF_SIDE_EFFECT) == 0) + { + // No side effects -> no reordering hazard, no need for a temp. + continue; } + impCloneExpr(*useEdge, useEdge, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling sibling operand")); } // Spill the profiled operand so we can reference it both in the comparison From 6ead091f38b230d07d140d22381f8e4c75c26130 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 9 May 2026 17:53:56 +0200 Subject: [PATCH 20/23] Address Copilot reviewer: gate call candidacy on non-null candidate info IsValueHistogramProbeCandidate matched Memmove/SequenceEqual purely on NamedIntrinsic. Since impProfileValueGuardedTree skips marking during inline import (compIsForInlining), an inlined Memmove can survive into a caller's BB without gtHandleHistogramProfileCandidateInfo. If that caller's BB is marked BBF_HAS_VALUE_PROFILE because of its own probe site, the value-instrumentor visitor walks the inlined call, matches on the intrinsic, and then derefs/asserts the null candidate info in the schema-builder / inserter. Treat presence of gtHandleHistogramProfileCandidateInfo as the explicit "importer marked this call for value profiling" signal: if it's null, return false. Inlined unmarked sites are now skipped cleanly. (LCLHEAP doesn't have an analogous explicit-mark bit -- it always carries an IL offset on GenTreeOpWithILOffset -- so an inlined LCLHEAP in a marked BB still gets a schema slot. That's wasted instrumentation, but not a crash; needs a separate mechanism to fully suppress.) 
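Reduced to its essentials, the resulting gate looks like this (a hypothetical standalone model with stand-in types, not the JIT's):

    struct HistInfo { unsigned ilOffset; unsigned probeIndex; };
    struct Call     { bool isMemmoveLike; HistInfo* histInfo; };

    // Intrinsic identity alone is not the marker: the importer attaches
    // HistInfo only at sites it chose to profile and skips inlinees, so a
    // null histInfo must mean "not a probe site", never an assert.
    static bool isValueProbeCandidate(const Call& c)
    {
        return c.isMemmoveLike && (c.histInfo != nullptr);
    }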
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/fgprofile.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index fc62a51b20b848..cf74484c7db2fc 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -1921,12 +1921,21 @@ bool Compiler::IsValueHistogramProbeCandidate(GenTree* node, IL_OFFSET* ilOffset if (node->IsCall() && node->AsCall()->IsSpecialIntrinsic()) { + // Require gtHandleHistogramProfileCandidateInfo to be set: that's the + // explicit "the importer marked this site for value profiling" signal. + // Without this check, an inlined Memmove/SequenceEqual (which the importer + // intentionally skips) would still match purely on NamedIntrinsic and the + // visitor would later deref the null candidate info. + if (node->AsCall()->gtHandleHistogramProfileCandidateInfo == nullptr) + { + return false; + } + const NamedIntrinsic ni = lookupNamedIntrinsic(node->AsCall()->gtCallMethHnd); if ((ni == NI_System_SpanHelpers_Memmove) || (ni == NI_System_SpanHelpers_SequenceEqual)) { if (ilOffset != nullptr) { - assert(node->AsCall()->gtHandleHistogramProfileCandidateInfo != nullptr); *ilOffset = node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset; } if (operandUseRef != nullptr) From f392b2c00f066b2e5b915681d8f94d09f446fe48 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 10 May 2026 00:13:28 +0200 Subject: [PATCH 21/23] Fix CI: don't overwrite gtInlineCandidateInfo via union when marking value-profile probe For PR #127970. CI was failing with Assertion failed 'inlineContext != nullptr' in fginline.cpp:33 for methods like System.IO.UnmanagedMemoryStream.ReadCore in Instrumented Tier1, e.g. ISSUE: #417766 fginline.cpp (33) - Assertion failed 'inlineContext != nullptr' in 'System.IO.UnmanagedMemoryStream:ReadCore(System.Span\1[byte]):int:this' during 'Morph - Inlining' (IL size 186; hash 0x595f5e98; Instrumented Tier1) Root cause: GenTreeCall has a union slot shared between gtCallCookie / gtInlineCandidateInfo / gtInlineCandidateInfoList / gtHandleHistogramProfileCandidateInfo / etc. When a call (e.g. Buffer.Memmove) is already an inline / GDV candidate -- which is common in Tier1Instrumented where both BBINSTR and BBOPT are set -- the new impProfileValueGuardedTree IsInstrumented branch was allocating a fresh HandleHistogramProfileCandidateInfo and assigning it through gtHandleHistogramProfileCandidateInfo, overwriting the InlineCandidateInfo* in the union. The inliner would then read past the smaller HandleHistogramProfileCandidateInfo struct when accessing inlineCandidateInfo->inlinersContext and hit the assertion (or worse, garbage pointers). The pre-existing else-if (opts.IsInstrumented()) branch in the OLD code only fired in pure Tier0Instrumented mode where calls aren't yet inline candidates, so the conflict didn't manifest. The PR's unification fires it in Tier1Instrumented too, exposing the bug. Fix: 1. impProfileValueGuardedTree (importercalls.cpp): in the IsInstrumented branch for calls, skip the value-profile marking when the call is already an inline or GDV candidate -- preserving the union contents. 2. IsValueHistogramProbeCandidate (fgprofile.cpp): for the same reason, also bail out for inline/GDV candidate calls -- the non-null pointer check is not a sufficient discriminator when the union may hold inline metadata; reading the inherited ilOffset would be reading the inline candidate struct's bytes. 
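A minimal standalone model of the clobber (struct layouts heavily simplified; the real union lives on GenTreeCall in gentree.h):

    #include <cstdio>

    struct InlineCandidateInfo                 { void* inlinersContext; };
    struct HandleHistogramProfileCandidateInfo { unsigned ilOffset; unsigned probeIndex; };

    struct Call
    {
        union // one slot; its meaning depends on the call's current role
        {
            InlineCandidateInfo*                 inlineInfo;
            HandleHistogramProfileCandidateInfo* histInfo;
        };
    };

    int main()
    {
        InlineCandidateInfo ici{reinterpret_cast<void*>(0x1234)};
        Call c{};
        c.inlineInfo = &ici; // the call is an inline candidate

        HandleHistogramProfileCandidateInfo hhi{42, 0};
        c.histInfo = &hhi;   // the buggy marking: silently clobbers the slot

        // The inliner still reads the slot as InlineCandidateInfo* and
        // interprets histogram bytes as inlinersContext (formally UB, and
        // exactly the garbage read behind the assertion):
        std::printf("inlinersContext now: %p\n", c.inlineInfo->inlinersContext);
        return 0;
    }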
Verification: SuperPMI replay across all windows.x64 collections reproduces the assertion 3x (in coreclr_tests, libraries_tests x2) without the fix and is fully clean with the fix. Also clean under JitStress=2. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/fgprofile.cpp | 21 +++++++++++++++++---- src/coreclr/jit/importercalls.cpp | 24 ++++++++++++++++++++---- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index cf74484c7db2fc..ee975801aacef8 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -1921,27 +1921,40 @@ bool Compiler::IsValueHistogramProbeCandidate(GenTree* node, IL_OFFSET* ilOffset if (node->IsCall() && node->AsCall()->IsSpecialIntrinsic()) { + GenTreeCall* const call = node->AsCall(); + + // gtHandleHistogramProfileCandidateInfo aliases gtInlineCandidateInfo / + // gtInlineCandidateInfoList / gtCallCookie / gtDirectCallAddress in a union + // (see GenTreeCall in gentree.h). If the call is an inline or GDV candidate, + // the union slot holds inline metadata, NOT a HandleHistogramProfileCandidateInfo, + // so a non-null pointer here would not be a real value-profile marker. + // Skip such calls to avoid reading inline metadata as histogram metadata. + if (call->IsInlineCandidate() || call->IsGuardedDevirtualizationCandidate()) + { + return false; + } + // Require gtHandleHistogramProfileCandidateInfo to be set: that's the // explicit "the importer marked this site for value profiling" signal. // Without this check, an inlined Memmove/SequenceEqual (which the importer // intentionally skips) would still match purely on NamedIntrinsic and the // visitor would later deref the null candidate info. - if (node->AsCall()->gtHandleHistogramProfileCandidateInfo == nullptr) + if (call->gtHandleHistogramProfileCandidateInfo == nullptr) { return false; } - const NamedIntrinsic ni = lookupNamedIntrinsic(node->AsCall()->gtCallMethHnd); + const NamedIntrinsic ni = lookupNamedIntrinsic(call->gtCallMethHnd); if ((ni == NI_System_SpanHelpers_Memmove) || (ni == NI_System_SpanHelpers_SequenceEqual)) { if (ilOffset != nullptr) { - *ilOffset = node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset; + *ilOffset = call->gtHandleHistogramProfileCandidateInfo->ilOffset; } if (operandUseRef != nullptr) { // Memmove(dst, src, len) and SequenceEqual(left, right, len) -- profile len. - *operandUseRef = &node->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(); + *operandUseRef = &call->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(); } return true; } diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index d91e8254caa4db..71934309e1ffa2 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -1706,13 +1706,29 @@ GenTree* Compiler::impProfileValueGuardedTree(GenTree* node, { if (node->IsCall()) { + GenTreeCall* const call = node->AsCall(); + + // gtHandleHistogramProfileCandidateInfo shares a union slot with + // gtInlineCandidateInfo / gtInlineCandidateInfoList / gtCallCookie / + // gtDirectCallAddress / etc. (see GenTreeCall in gentree.h). If the call + // already has inline metadata in that slot, allocating a fresh + // HandleHistogramProfileCandidateInfo here would overwrite the inline + // metadata pointer. The inliner would then read garbage for fields it + // expects on InlineCandidateInfo (e.g. inlinersContext) and assert. 
+ // Skip the marking in that case; we preserve the inline metadata, and + // if the call ultimately fails to inline a future re-jit can revisit it. + if (call->IsInlineCandidate() || call->IsGuardedDevirtualizationCandidate()) + { + return node; + } + // Calls carry the IL offset via gtHandleHistogramProfileCandidateInfo. // Other node kinds (e.g. GT_LCLHEAP) carry it on the node itself // (e.g. via GenTreeOpWithILOffset); the caller is responsible for that. - HandleHistogramProfileCandidateInfo* pInfo = new (this, CMK_Inlining) HandleHistogramProfileCandidateInfo; - pInfo->ilOffset = ilOffset; - pInfo->probeIndex = 0; - node->AsCall()->gtHandleHistogramProfileCandidateInfo = pInfo; + HandleHistogramProfileCandidateInfo* pInfo = new (this, CMK_Inlining) HandleHistogramProfileCandidateInfo; + pInfo->ilOffset = ilOffset; + pInfo->probeIndex = 0; + call->gtHandleHistogramProfileCandidateInfo = pInfo; } compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE); JITDUMP("\n ... marking [%06u] in " FMT_BB " for value profile instrumentation\n", dspTreeID(node), From c6e3250e1b6f8db1993ea97aba174d6c6d20fbdc Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 10 May 2026 19:39:53 +0200 Subject: [PATCH 22/23] Update Directory.Build.targets --- src/libraries/Directory.Build.targets | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/libraries/Directory.Build.targets b/src/libraries/Directory.Build.targets index 0affd212e630a8..b6fee26af35e3d 100644 --- a/src/libraries/Directory.Build.targets +++ b/src/libraries/Directory.Build.targets @@ -248,16 +248,6 @@ true - - - true - - @@ -266,7 +256,7 @@ true - + From b10e52d8fb9f72c6da916af6ada71428c98ae7c5 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 10 May 2026 19:54:10 +0200 Subject: [PATCH 23/23] Address remaining Copilot reviewer feedback - fgprofile.cpp: update stale comment referencing the obsolete impProfileLclHeap; reference impProfileValueGuardedTree call site instead. - fgprofile.cpp: clarify the value-profile cast comment - the operand is a 32-bit length/size value, not 'LCLHEAP being uint32_t-typed'. - gentree.cpp: clamp constant-size cost computation for GT_LCLHEAP. Pathological large/non-positive constants would overflow the int cast and underflow the cost into a small/negative value, defeating the if-conversion block; fall back to the non-constant cost in that case. - importercalls.cpp: add missing ';' on a JITDUMP for consistency. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/fgprofile.cpp | 16 ++++++++++------ src/coreclr/jit/gentree.cpp | 27 ++++++++++++++++----------- src/coreclr/jit/importercalls.cpp | 2 +- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index ee975801aacef8..7991a96da4530f 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -1900,9 +1900,11 @@ bool Compiler::IsValueHistogramProbeCandidate(GenTree* node, IL_OFFSET* ilOffset { if (node->OperIs(GT_LCLHEAP)) { - // Mirror the gating in impProfileLclHeap: only zero-init, non-constant - // locallocs are value-profile candidates. Anything else would have its - // probe data ignored by the consumer and just waste a schema slot. + // Mirror the gating at the impProfileValueGuardedTree call site for + // CEE_LOCALLOC (see Compiler::impImportBlockCode): only zero-init, + // non-constant locallocs are value-profile candidates. 
Anything else + // would have its probe data ignored by the consumer and just waste a + // schema slot. if (!info.compInitMem || node->gtGetOp1()->OperIsConst()) { return false; @@ -2358,9 +2360,11 @@ class ValueHistogramProbeInserter if (!lengthNode->TypeIs(TYP_I_IMPL)) { - // CORINFO_HELP_VALUEPROFILE always expects nint. - // Zero-extend for now due to LCLHEAP being actually uint32_t-typed, but in the future we may want to - // support with sign-extension as well. + // CORINFO_HELP_VALUEPROFILE always expects nint, but the operand here may be + // a 32-bit value (e.g. the int-typed size operand of GT_LCLHEAP, or a TYP_INT + // length on 64-bit). Zero-extend to nint so the recorded histogram values are + // unsigned and not skewed by sign extension; sign-extension can be added later + // for operands where it makes sense. assert(genActualType(lengthNode) == TYP_INT); lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /*fromUnsigned*/ true, TYP_I_IMPL); } diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 4afce91ec3c39a..5dda2c502abfa3 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -6930,19 +6930,24 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) // unconditionally. // // For constant sizes the zero-init is unrolled by Lowering, so the - // cost scales with the size; for non-constant sizes we use a fixed - // upper bound representing the runtime loop. + // cost scales with the size; for non-constant sizes (or sizes that + // would overflow our int cost) we use a fixed upper bound representing + // the runtime loop. + costEx = 36; + costSz = 8; if (op1->IsCnsIntOrI() && info.compInitMem) { - const ssize_t size = op1->AsIntCon()->IconValue(); - const ssize_t alignedSize = (size + (STACK_ALIGN - 1)) & ~(ssize_t)(STACK_ALIGN - 1); - costEx = 8 + (int)(alignedSize / REGSIZE_BYTES); // > 7 to block if-conversion - costSz = 4 + (int)(alignedSize / REGSIZE_BYTES); - } - else - { - costEx = 36; - costSz = 8; + const ssize_t size = op1->AsIntCon()->IconValue(); + // Guard against pathological constant sizes: very large or + // non-positive values would overflow the cost computation + // and could underflow into a small/negative cost, defeating + // the if-conversion block above. + if ((size > 0) && (size <= INT_MAX)) + { + const ssize_t alignedSize = (size + (STACK_ALIGN - 1)) & ~(ssize_t)(STACK_ALIGN - 1); + costEx = 8 + (int)(alignedSize / REGSIZE_BYTES); // > 7 to block if-conversion + costSz = 4 + (int)(alignedSize / REGSIZE_BYTES); + } } break; } diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 71934309e1ffa2..0f87ddd7c13eaf 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -7209,7 +7209,7 @@ bool Compiler::pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssiz JITDUMP("%u likely values:\n", valuesCount); for (UINT32 i = 0; i < valuesCount; i++) { - JITDUMP(" %u) %zd - %u%%\n", i, likelyValues[i].value, likelyValues[i].likelihood) + JITDUMP(" %u) %zd - %u%%\n", i, likelyValues[i].value, likelyValues[i].likelihood); } // Re-use JitRandomGuardedDevirtualization for stress-testing.