diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index cc2e46f8203e8c..98a7f9aa0caeda 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3220,6 +3220,21 @@ class Compiler GenTreeColon* gtNewColonNode(var_types type, GenTree* thenNode, GenTree* elseNode); GenTreeQmark* gtNewQmarkNode(var_types type, GenTree* cond, GenTreeColon* colon); + GenTreeOpWithILOffset* gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOffset); + + bool pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssize_t* pValue); + + //------------------------------------------------------------------------ + // IsValueHistogramProbeCandidate: Determine if a node is a value-histogram + // probe candidate, and report the IL offset and operand to profile. + // + // Centralized so the importer (impProfile*), the value-instrumentor visitor, + // schema-builder and probe-inserter all agree on which trees are value-probe + // candidates and where their profiled operand lives. To extend value + // profiling to a new node kind, add a case here. + // + bool IsValueHistogramProbeCandidate(GenTree* node, IL_OFFSET* ilOffset = nullptr, GenTree*** operandUseRef = nullptr); + GenTree* gtNewLargeOperNode(genTreeOps oper, var_types type = TYP_I_IMPL, GenTree* op1 = nullptr, @@ -5085,7 +5100,12 @@ class Compiler GenTree* impFixupStructReturnType(GenTree* op); - GenTree* impDuplicateWithProfiledArg(GenTreeCall* call, IL_OFFSET ilOffset); + GenTree* impProfileValueGuardedTree(GenTree* node, + GenTree** operandRef, + IL_OFFSET ilOffset, + ssize_t minProfitable, + ssize_t maxProfitable, + int* successMetric DEBUGARG(const char* tmpName)); GenTree* impThrowIfNull(GenTreeCall* call); diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index 00a3661c5f6dac..7991a96da4530f 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -1893,6 +1893,78 @@ class HandleHistogramProbeVisitor final : public GenTreeVisitorOperIs(GT_LCLHEAP)) + { + // Mirror the gating at the impProfileValueGuardedTree call site for + // CEE_LOCALLOC (see Compiler::impImportBlockCode): only zero-init, + // non-constant locallocs are value-profile candidates. Anything else + // would have its probe data ignored by the consumer and just waste a + // schema slot. + if (!info.compInitMem || node->gtGetOp1()->OperIsConst()) + { + return false; + } + + if (ilOffset != nullptr) + { + *ilOffset = node->AsOpWithILOffset()->GetILOffset(); + } + if (operandUseRef != nullptr) + { + *operandUseRef = &node->AsOp()->gtOp1; + } + return true; + } + + if (node->IsCall() && node->AsCall()->IsSpecialIntrinsic()) + { + GenTreeCall* const call = node->AsCall(); + + // gtHandleHistogramProfileCandidateInfo aliases gtInlineCandidateInfo / + // gtInlineCandidateInfoList / gtCallCookie / gtDirectCallAddress in a union + // (see GenTreeCall in gentree.h). If the call is an inline or GDV candidate, + // the union slot holds inline metadata, NOT a HandleHistogramProfileCandidateInfo, + // so a non-null pointer here would not be a real value-profile marker. + // Skip such calls to avoid reading inline metadata as histogram metadata. + if (call->IsInlineCandidate() || call->IsGuardedDevirtualizationCandidate()) + { + return false; + } + + // Require gtHandleHistogramProfileCandidateInfo to be set: that's the + // explicit "the importer marked this site for value profiling" signal. + // Without this check, an inlined Memmove/SequenceEqual (which the importer + // intentionally skips) would still match purely on NamedIntrinsic and the + // visitor would later deref the null candidate info. + if (call->gtHandleHistogramProfileCandidateInfo == nullptr) + { + return false; + } + + const NamedIntrinsic ni = lookupNamedIntrinsic(call->gtCallMethHnd); + if ((ni == NI_System_SpanHelpers_Memmove) || (ni == NI_System_SpanHelpers_SequenceEqual)) + { + if (ilOffset != nullptr) + { + *ilOffset = call->gtHandleHistogramProfileCandidateInfo->ilOffset; + } + if (operandUseRef != nullptr) + { + // Memmove(dst, src, len) and SequenceEqual(left, right, len) -- profile len. + *operandUseRef = &call->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(); + } + return true; + } + } + + return false; +} + //------------------------------------------------------------------------ // ValueHistogramProbeVisitor: invoke functor on each node requiring a generic value probe // @@ -1918,13 +1990,9 @@ class ValueHistogramProbeVisitor final : public GenTreeVisitorIsCall() && node->AsCall()->IsSpecialIntrinsic()) + if (m_compiler->IsValueHistogramProbeCandidate(node)) { - const NamedIntrinsic ni = m_compiler->lookupNamedIntrinsic(node->AsCall()->gtCallMethHnd); - if ((ni == NI_System_SpanHelpers_Memmove) || (ni == NI_System_SpanHelpers_SequenceEqual)) - { - m_functor(m_compiler, node); - } + m_functor(m_compiler, node); } return Compiler::WALK_CONTINUE; } @@ -2009,14 +2077,18 @@ class BuildValueHistogramProbeSchemaGen { } - void operator()(Compiler* compiler, GenTree* call) + void operator()(Compiler* compiler, GenTree* tree) { + IL_OFFSET ilOffset = 0; + bool isCandidate = compiler->IsValueHistogramProbeCandidate(tree, &ilOffset); + assert(isCandidate); + ICorJitInfo::PgoInstrumentationSchema schemaElem = {}; schemaElem.Count = 1; schemaElem.InstrumentationKind = compiler->opts.compCollect64BitCounts ? ICorJitInfo::PgoInstrumentationKind::ValueHistogramLongCount : ICorJitInfo::PgoInstrumentationKind::ValueHistogramIntCount; - schemaElem.ILOffset = (int32_t)call->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset; + schemaElem.ILOffset = static_cast(ilOffset); m_schema.push_back(schemaElem); m_schemaCount++; @@ -2250,12 +2322,13 @@ class ValueHistogramProbeInserter return; } - assert(node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_Memmove) || - node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_SequenceEqual)); + IL_OFFSET candidateIlOffset = 0; + GenTree** operandUseRef = nullptr; + bool isCandidate = compiler->IsValueHistogramProbeCandidate(node, &candidateIlOffset, &operandUseRef); + assert(isCandidate); const ICorJitInfo::PgoInstrumentationSchema& countEntry = m_schema[*m_currentSchemaIndex]; - if (countEntry.ILOffset != - static_cast(node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset)) + if (countEntry.ILOffset != static_cast(candidateIlOffset)) { return; } @@ -2275,30 +2348,31 @@ class ValueHistogramProbeInserter *m_currentSchemaIndex += 2; - GenTree** lenArgRef = &node->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(); - - // We have Memmove(dst, src, len) and we want to insert a call to CORINFO_HELP_VALUEPROFILE for the len: - // - // \--* COMMA long - // +--* CALL help void CORINFO_HELP_VALUEPROFILE - // | +--* COMMA long - // | | +--* STORE_LCL_VAR long tmp - // | | | \--* (node to poll) - // | | \--* LCL_VAR long tmp - // | \--* CNS_INT long - // \--* LCL_VAR long tmp + // Inject CORINFO_HELP_VALUEPROFILE helper call to record the value in the histogram. + // The helper call is injected as a comma node that stores the value to a temp. // - const unsigned lenTmpNum = compiler->lvaGrabTemp(true DEBUGARG("length histogram profile tmp")); - GenTree* storeLenToTemp = compiler->gtNewTempStore(lenTmpNum, *lenArgRef); - GenTree* lengthLocal = compiler->gtNewLclvNode(lenTmpNum, genActualType(*lenArgRef)); - GenTreeOp* lengthNode = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), storeLenToTemp, lengthLocal); - GenTree* histNode = compiler->gtNewIconNode(reinterpret_cast(hist), TYP_I_IMPL); - unsigned helper = is32 ? CORINFO_HELP_VALUEPROFILE32 : CORINFO_HELP_VALUEPROFILE64; + GenTree* storeLenToTemp = compiler->gtNewTempStore(lenTmpNum, *operandUseRef); + GenTree* lengthLocal = compiler->gtNewLclvNode(lenTmpNum, genActualType(*operandUseRef)); + GenTree* lengthNode = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), storeLenToTemp, lengthLocal); + GenTree* histNode = compiler->gtNewIconNode(reinterpret_cast(hist), TYP_I_IMPL); + unsigned helper = is32 ? CORINFO_HELP_VALUEPROFILE32 : CORINFO_HELP_VALUEPROFILE64; + + if (!lengthNode->TypeIs(TYP_I_IMPL)) + { + // CORINFO_HELP_VALUEPROFILE always expects nint, but the operand here may be + // a 32-bit value (e.g. the int-typed size operand of GT_LCLHEAP, or a TYP_INT + // length on 64-bit). Zero-extend to nint so the recorded histogram values are + // unsigned and not skewed by sign extension; sign-extension can be added later + // for operands where it makes sense. + assert(genActualType(lengthNode) == TYP_INT); + lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /*fromUnsigned*/ true, TYP_I_IMPL); + } + GenTreeCall* helperCallNode = compiler->gtNewHelperCallNode(helper, TYP_VOID, lengthNode, histNode); - *lenArgRef = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), helperCallNode, - compiler->gtCloneExpr(lengthLocal)); + *operandUseRef = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), helperCallNode, + compiler->gtCloneExpr(lengthLocal)); m_instrCount++; } }; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 9fa3d6534c50c8..6dba6e9b5067b5 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -2876,6 +2876,7 @@ bool GenTree::Compare(GenTree* op1, GenTree* op2, bool swapOK) break; // For the ones below no extra argument matters for comparison. + case GT_LCLHEAP: case GT_BOX: case GT_RUNTIMELOOKUP: case GT_ARR_ADDR: @@ -3449,6 +3450,7 @@ unsigned Compiler::gtHashValue(GenTree* tree) break; // For the ones below no extra argument matters for comparison. + case GT_LCLHEAP: case GT_BOX: case GT_ARR_ADDR: break; @@ -6921,6 +6923,37 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) costSz = 2 * 2; break; + case GT_LCLHEAP: + { + // GT_LCLHEAP is more expensive than a typical unary op (it adjusts + // SP, may probe the stack, and -- under compInitMem -- zeros the + // allocated range). Give it a high cost so phases like if-conversion + // (which has a CostEx > 7 cutoff) won't speculatively evaluate it + // unconditionally. + // + // For constant sizes the zero-init is unrolled by Lowering, so the + // cost scales with the size; for non-constant sizes (or sizes that + // would overflow our int cost) we use a fixed upper bound representing + // the runtime loop. + costEx = 36; + costSz = 8; + if (op1->IsCnsIntOrI() && info.compInitMem) + { + const ssize_t size = op1->AsIntCon()->IconValue(); + // Guard against pathological constant sizes: very large or + // non-positive values would overflow the cost computation + // and could underflow into a small/negative cost, defeating + // the if-conversion block above. + if ((size > 0) && (size <= INT_MAX)) + { + const ssize_t alignedSize = (size + (STACK_ALIGN - 1)) & ~(ssize_t)(STACK_ALIGN - 1); + costEx = 8 + (int)(alignedSize / REGSIZE_BYTES); // > 7 to block if-conversion + costSz = 4 + (int)(alignedSize / REGSIZE_BYTES); + } + } + break; + } + case GT_ARR_ADDR: costEx = 0; costSz = 0; @@ -9033,6 +9066,16 @@ GenTreeQmark* Compiler::gtNewQmarkNode(var_types type, GenTree* cond, GenTreeCol return result; } +GenTreeOpWithILOffset* Compiler::gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOffset) +{ + assert(size != nullptr); + GenTreeOpWithILOffset* node = + new (this, GT_LCLHEAP) GenTreeOpWithILOffset(GT_LCLHEAP, TYP_I_IMPL, size, nullptr, ilOffset); + // Don't allow CSE to share locallocs. + node->gtFlags |= GTF_DONT_CSE; + return node; +} + GenTreeIntCon* Compiler::gtNewIconNode(ssize_t value, var_types type) { assert(genActualType(type) == type); @@ -11128,6 +11171,11 @@ GenTree* Compiler::gtCloneExpr(GenTree* tree) } break; + case GT_LCLHEAP: + copy = new (this, GT_LCLHEAP) GenTreeOpWithILOffset(GT_LCLHEAP, tree->TypeGet(), tree->gtGetOp1(), + nullptr, tree->AsOpWithILOffset()->GetILOffset()); + break; + case GT_SWIFT_ERROR_RET: copy = new (this, oper) GenTreeOp(oper, tree->TypeGet(), tree->gtGetOp1(), tree->gtGetOp2()); break; diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 1897b43014f17d..a74b0eccf80582 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -3250,7 +3250,6 @@ struct GenTreeOp : public GenTreeUnOp // Unary operators with optional arguments: assert(oper == GT_RETURN || oper == GT_RETFILT || OperIsBlk(oper)); } - // returns true if we will use the division by constant optimization for this node. bool UsesDivideByConstOptimized(Compiler* comp); @@ -3316,6 +3315,49 @@ struct GenTreeOp : public GenTreeUnOp #endif }; +// GenTreeOpWithILOffset - a GenTreeOp that additionally carries an IL offset. +// +// Used by JIT phases that need to associate a node with a specific IL offset +// independently of any debug-info side tables. In particular, it allows +// non-call value-profile probe candidates (e.g. GT_LCLHEAP) to participate +// in the same value-histogram instrumentation pipeline as calls without a +// side hash table. +// +struct GenTreeOpWithILOffset : public GenTreeOp +{ +private: + IL_OFFSET gtILOffset; + +public: + IL_OFFSET GetILOffset() const + { + return gtILOffset; + } + + void SetILOffset(IL_OFFSET ilOffset) + { + gtILOffset = ilOffset; + } + + GenTreeOpWithILOffset(genTreeOps oper, + var_types type, + GenTree* op1, + GenTree* op2, + IL_OFFSET ilOffset DEBUGARG(bool largeNode = false)) + : GenTreeOp(oper, type, op1, op2 DEBUGARG(largeNode)) + , gtILOffset(ilOffset) + { + } + +#if DEBUGGABLE_GENTREE + GenTreeOpWithILOffset() + : GenTreeOp() + , gtILOffset(BAD_IL_OFFSET) + { + } +#endif +}; + struct GenTreeVal : public GenTree { size_t gtVal1; diff --git a/src/coreclr/jit/gtlist.h b/src/coreclr/jit/gtlist.h index b293df525ed695..40f623a408c102 100644 --- a/src/coreclr/jit/gtlist.h +++ b/src/coreclr/jit/gtlist.h @@ -70,7 +70,7 @@ GTNODE(KEEPALIVE , GenTree ,0,0,GTK_UNOP|GTK_NOVALUE) // kee GTNODE(CAST , GenTreeCast ,0,0,GTK_UNOP|GTK_EXOP) // conversion to another type GTNODE(BITCAST , GenTreeOp ,0,1,GTK_UNOP) // reinterpretation of bits as another type GTNODE(CKFINITE , GenTreeOp ,0,1,GTK_UNOP|DBK_NOCONTAIN) // Check for NaN -GTNODE(LCLHEAP , GenTreeOp ,0,1,GTK_UNOP|DBK_NOCONTAIN) // alloca() +GTNODE(LCLHEAP , GenTreeOpWithILOffset, 0,1,GTK_UNOP|GTK_EXOP|DBK_NOCONTAIN) // alloca() GTNODE(BOUNDS_CHECK , GenTreeBoundsChk ,0,1,GTK_BINOP|GTK_EXOP|GTK_NOVALUE) // a bounds check - for arrays/spans/SIMDs/HWINTRINSICs diff --git a/src/coreclr/jit/gtstructs.h b/src/coreclr/jit/gtstructs.h index 6e7d62e496f038..d8afe98424ee48 100644 --- a/src/coreclr/jit/gtstructs.h +++ b/src/coreclr/jit/gtstructs.h @@ -74,6 +74,7 @@ GTSTRUCT_1(Colon , GT_COLON) GTSTRUCT_1(FptrVal , GT_FTN_ADDR) GTSTRUCT_1(Intrinsic , GT_INTRINSIC) GTSTRUCT_1(IndexAddr , GT_INDEX_ADDR) +GTSTRUCT_1(OpWithILOffset, GT_LCLHEAP) #if defined(FEATURE_HW_INTRINSICS) GTSTRUCT_N(MultiOp , GT_HWINTRINSIC) #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index a83b16e1591cb9..d867d86c965da2 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -10119,10 +10119,18 @@ void Compiler::impImportBlockCode(BasicBlock* block) return; } - op1 = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, op2); - // We do not model stack overflow from localloc as an exception side effect. - // Obviously, we don't want locallocs to be CSE'd. - op1->gtFlags |= GTF_DONT_CSE; + op1 = gtNewLclHeapNode(op2, opcodeOffs); + + // PGO value-profiling: optimize / instrument based on the most likely size. + // Only profitable for zero-init (compInitMem) and when the popular size is + // bigger than the variable-size loop's break-even (~32 bytes per benchmark). + if (info.compInitMem && JitConfig.JitProfileValues() && !op2->IsIntegralConst()) + { + op1 = impProfileValueGuardedTree(op1, &op1->AsOp()->gtOp1, opcodeOffs, + /* minProfitable */ 32, /* maxProfitable */ INT_MAX, + &Metrics.ValueProfiledLclHeap DEBUGARG( + "Profiled LCLHEAP Qmark")); + } // Request stack security for this method. setNeedsGSSecurityCookie(); diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 69d7066826707e..8f8ee9e44d8796 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -1377,20 +1377,11 @@ var_types Compiler::impImportCall(OPCODE opcode, else if (JitConfig.JitProfileValues() && call->IsCall() && call->AsCall()->IsSpecialIntrinsic(this, NI_System_SpanHelpers_Memmove)) { - if (opts.IsOptimizedWithProfile()) - { - call = impDuplicateWithProfiledArg(call->AsCall(), rawILOffset); - } - else if (opts.IsInstrumented()) - { - // We might want to instrument it for optimized versions too, but we don't currently. - HandleHistogramProfileCandidateInfo* pInfo = - new (this, CMK_Inlining) HandleHistogramProfileCandidateInfo; - pInfo->ilOffset = rawILOffset; - pInfo->probeIndex = 0; - call->AsCall()->gtHandleHistogramProfileCandidateInfo = pInfo; - compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE); - } + call = impProfileValueGuardedTree(call, &call->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(), + rawILOffset, + /* minProfitable */ 1, + /* maxProfitable */ (ssize_t)getUnrollThreshold(ProfiledMemmove), + &Metrics.ValueProfiledMemmove DEBUGARG("Profiled Memmove Qmark")); impAppendTree(call, CHECK_SPILL_ALL, impCurStmtDI); } else @@ -1533,27 +1524,13 @@ var_types Compiler::impImportCall(OPCODE opcode, if (JitConfig.JitProfileValues() && call->IsCall() && call->AsCall()->IsSpecialIntrinsic(this, NI_System_SpanHelpers_SequenceEqual)) { - if (opts.IsOptimizedWithProfile()) - { - call = impDuplicateWithProfiledArg(call->AsCall(), rawILOffset); - if (call->OperIs(GT_QMARK)) - { - // QMARK has to be a root node - unsigned tmp = lvaGrabTemp(true DEBUGARG("Grabbing temp for Qmark")); - impStoreToTemp(tmp, call, CHECK_SPILL_ALL); - call = gtNewLclvNode(tmp, call->TypeGet()); - } - } - else if (opts.IsInstrumented()) - { - // We might want to instrument it for optimized versions too, but we don't currently. - HandleHistogramProfileCandidateInfo* pInfo = - new (this, CMK_Inlining) HandleHistogramProfileCandidateInfo; - pInfo->ilOffset = rawILOffset; - pInfo->probeIndex = 0; - call->AsCall()->gtHandleHistogramProfileCandidateInfo = pInfo; - compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE); - } + call = + impProfileValueGuardedTree(call, &call->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(), + rawILOffset, + /* minProfitable */ 1, + /* maxProfitable */ (ssize_t)getUnrollThreshold(ProfiledMemcmp), + &Metrics.ValueProfiledSequenceEqual DEBUGARG( + "Profiled SeqEqual Qmark")); } } @@ -1679,135 +1656,179 @@ GenTree* Compiler::impThrowIfNull(GenTreeCall* call) } //------------------------------------------------------------------------ -// impDuplicateWithProfiledArg: duplicates a call with a profiled argument, e.g.: -// Given `Buffer.Memmove(dst, src, len)` call, -// optimize it to: +// impProfileValueGuardedTree: PGO value-profile entry point. +// +// In Instrumented Tier0 (or OSR / Tier1Instrumented): mark `node` as a +// value-probe candidate so the value-histogram instrumentor inserts a +// CORINFO_HELP_VALUEPROFILE* call around the operand at `*operandRef`. // -// if (len == popularSize) -// Buffer.Memmove(dst, src, popularSize); // can be unrolled now -// else -// Buffer.Memmove(dst, src, len); // fallback +// In Tier1 with PGO: look up the popular value of the operand via +// pickProfiledValue and, if it's within [minProfitable, maxProfitable], +// replace `node` with a guard of the form: // -// if we can obtain the popular size from PGO data. +// (operand == popularValue) ? +// : +// +// The fast arm is `node` itself with `*operandRef` overwritten by a +// constant; the slow arm is a clone of `node` (which references the +// spilled operand temp). // // Arguments: -// call -- call to optimize with profiled argument -// ilOffset -- Raw IL offset of the call +// node - the tree containing the operand to profile. +// Either TYP_VOID (e.g. an unused-result call) or value-typed. +// operandRef - GenTree** to the operand-use within `node`. Mutated in place +// (replaced with the spilled-temp ref, then with the constant +// on the fast arm). +// ilOffset - IL offset of the originating opcode. +// minProfitable - inclusive lower bound on profitable popular values +// (caller-supplied, per shape). +// maxProfitable - inclusive upper bound on profitable popular values. +// successMetric - optional pointer to a JitMetrics counter; bumped on +// successful specialization. May be nullptr. +// tmpName - DEBUG-only name for the QMARK-result temp (only used when +// `node` is value-typed). // // Return Value: -// Optimized tree (or the original call tree if we can't optimize it). -// -GenTree* Compiler::impDuplicateWithProfiledArg(GenTreeCall* call, IL_OFFSET ilOffset) +// The original `node` if no specialization fires (or if we only marked it +// for instrumentation). Otherwise, a fresh tree (a QMARK for TYP_VOID, +// or an LCL_VAR fed by a STORE_LCL_VAR(QMARK) statement otherwise). +// +GenTree* Compiler::impProfileValueGuardedTree(GenTree* node, + GenTree** operandRef, + IL_OFFSET ilOffset, + ssize_t minProfitable, + ssize_t maxProfitable, + int* successMetric DEBUGARG(const char* tmpName)) { - assert(call->IsSpecialIntrinsic()); - assert(opts.IsOptimizedWithProfile()); + assert(node != nullptr); + assert(operandRef != nullptr); + assert(*operandRef != nullptr); + + // Instrumented Tier0 / OSR / Tier1Instrumented: mark this node as a value-profile + // candidate so the value-histogram instrumentor can later insert a probe. + if (opts.IsInstrumented() && !compIsForInlining()) + { + if (node->IsCall()) + { + GenTreeCall* const call = node->AsCall(); + + // gtHandleHistogramProfileCandidateInfo shares a union slot with + // gtInlineCandidateInfo / gtInlineCandidateInfoList / gtCallCookie / + // gtDirectCallAddress / etc. (see GenTreeCall in gentree.h). If the call + // already has inline metadata in that slot, allocating a fresh + // HandleHistogramProfileCandidateInfo here would overwrite the inline + // metadata pointer. The inliner would then read garbage for fields it + // expects on InlineCandidateInfo (e.g. inlinersContext) and assert. + // Skip the marking in that case; we preserve the inline metadata, and + // if the call ultimately fails to inline a future re-jit can revisit it. + if (call->IsInlineCandidate() || call->IsGuardedDevirtualizationCandidate()) + { + return node; + } + + // Calls carry the IL offset via gtHandleHistogramProfileCandidateInfo. + // Other node kinds (e.g. GT_LCLHEAP) carry it on the node itself + // (e.g. via GenTreeOpWithILOffset); the caller is responsible for that. + HandleHistogramProfileCandidateInfo* pInfo = new (this, CMK_Inlining) HandleHistogramProfileCandidateInfo; + pInfo->ilOffset = ilOffset; + pInfo->probeIndex = 0; + call->gtHandleHistogramProfileCandidateInfo = pInfo; + } + compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE); + JITDUMP("\n ... marking [%06u] in " FMT_BB " for value profile instrumentation\n", dspTreeID(node), + compCurBB->bbNum); + return node; + } - if (call->IsInlineCandidate()) + // Tier1 + PGO: try to specialize. + if (!opts.IsOptimizedWithProfile()) { - // We decided to inline the whole thing? We won't be able to clone it then. - return call; + return node; } - const unsigned MaxLikelyValues = 8; - LikelyValueRecord likelyValues[MaxLikelyValues]; - UINT32 valuesCount = - getLikelyValues(likelyValues, MaxLikelyValues, fgPgoSchema, fgPgoSchemaCount, fgPgoData, ilOffset); - - JITDUMP("%u likely values:\n", valuesCount) - for (UINT32 i = 0; i < valuesCount; i++) + // Don't specialize calls about to be inlined: cloning then is unsafe. + if (node->IsCall() && node->AsCall()->IsInlineCandidate()) { - JITDUMP(" %u) %u - %u%%\n", i, likelyValues[i].value, likelyValues[i].likelihood) + return node; } - // For now, we only do a single guess, but it's pretty straightforward to - // extend it to support multiple guesses. - LikelyValueRecord likelyValue = likelyValues[0]; -#if DEBUG - // Re-use JitRandomGuardedDevirtualization for stress-testing. - if (JitConfig.JitRandomGuardedDevirtualization() != 0) + if ((*operandRef)->OperIsConst()) { - CLRRandom* random = impInlineRoot()->m_inlineStrategy->GetRandom(JitConfig.JitRandomGuardedDevirtualization()); + return node; + } - valuesCount = 1; - likelyValue.value = random->Next(256); - likelyValue.likelihood = 100; + ssize_t profiledValue = 0; + uint32_t likelihood = 0; + // TODO: Tune the likelihood threshold, for now it's 50%. + if (!pickProfiledValue(ilOffset, &likelihood, &profiledValue) || (likelihood < 50)) + { + return node; } -#endif - // TODO: Tune the likelihood threshold, for now it's 50% - if ((valuesCount > 0) && (likelyValue.likelihood >= 50)) + if (!FitsIn(profiledValue) || (profiledValue < minProfitable) || (profiledValue > maxProfitable)) { - const ssize_t profiledValue = likelyValue.value; + JITDUMP("Profiled value %zd outside [%zd, %zd] - skipping\n", profiledValue, minProfitable, maxProfitable); + return node; + } - unsigned argNum = 0; - ssize_t minValue = 0; - ssize_t maxValue = 0; - if (call->IsSpecialIntrinsic(this, NI_System_SpanHelpers_Memmove)) - { - // dst(0), src(1), len(2) - argNum = 2; + JITDUMP("Building profile-guarded tree for popular value = %zd (%u%%)\n", profiledValue, likelihood); + DISPTREE(node); - minValue = 1; // TODO: enable for 0 as well. - maxValue = (ssize_t)getUnrollThreshold(ProfiledMemmove); - } - else if (call->IsSpecialIntrinsic(this, NI_System_SpanHelpers_SequenceEqual)) + // Pre-spill any side-effecting sibling operands so their effects fire before + // the QMARK guard. This is generic across node kinds (calls, unary nodes, + // HWINTRINSICs, ...): GenTreeUseEdgeIterator walks all use-edges in evaluation + // order, including gtControlExpr and every CallArg slot for calls. + // (The profiled operand is spilled separately below.) + for (GenTree** useEdge : node->UseEdges()) + { + if (useEdge == operandRef) { - // dst(0), src(1), len(2) - argNum = 2; - - minValue = 1; // TODO: enable for 0 as well. - maxValue = (ssize_t)getUnrollThreshold(ProfiledMemcmp); + continue; } - else + if (((*useEdge)->gtFlags & GTF_SIDE_EFFECT) == 0) { - // only Memmove is expected at the moment. - // Possible future extensions: Memset, Memcpy - unreached(); + // No side effects -> no reordering hazard, no need for a temp. + continue; } + impCloneExpr(*useEdge, useEdge, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling sibling operand")); + } - if ((profiledValue >= minValue) && (profiledValue <= maxValue)) - { - JITDUMP("Duplicating for popular value = %u\n", profiledValue) - DISPTREE(call) + // Spill the profiled operand so we can reference it both in the comparison + // and in the slow arm. + GenTree* const operandClone = + impCloneExpr(*operandRef, operandRef, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling profiled operand")); + const var_types operandType = genActualType(operandClone->TypeGet()); - if (call->gtArgs.GetUserArgByIndex(argNum)->GetNode()->OperIsConst()) - { - JITDUMP("Profiled arg is already a constant - bail out.\n") - return call; - } + // Slow tree: clone of `node` (which now references the spilled-temp at *operandRef). + GenTree* const slowTree = gtCloneExpr(node); - // Spill all the arguments to temp locals to preserve the execution order - GenTree** argRef = nullptr; - GenTree* argClone = nullptr; - for (unsigned i = 0; i < call->gtArgs.CountUserArgs(); i++) - { - GenTree** node = &call->gtArgs.GetUserArgByIndex(i)->EarlyNodeRef(); - GenTree* cloned = impCloneExpr(*node, node, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling arg")); - - // Record the reference to the argument we're going to replace. - if (i == argNum) - { - argRef = node; - argClone = cloned; - } - } + // Fast tree: `node` itself, with the profiled operand replaced by the constant. + *operandRef = gtNewIconNode(profiledValue, operandType); + GenTree* const fastTree = node; - GenTree* fallbackCall = gtCloneExpr(call); - GenTree* profiledValueNode = gtNewIconNode(profiledValue, argClone->TypeGet()); - *argRef = profiledValueNode; + const var_types resultType = node->TypeGet(); + GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(resultType, fastTree, slowTree); + GenTreeOp* cond = gtNewOperNode(GT_EQ, TYP_INT, operandClone, gtNewIconNode(profiledValue, operandType)); + GenTreeQmark* qmark = gtNewQmarkNode(resultType, cond, colon); + qmark->SetThenNodeLikelihood(likelihood); - // TODO: Specify weights for the branches in the Qmark node. - GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(call->TypeGet(), call, fallbackCall); - GenTreeOp* cond = gtNewOperNode(GT_EQ, TYP_INT, argClone, gtCloneExpr(profiledValueNode)); - GenTreeQmark* qmark = gtNewQmarkNode(call->TypeGet(), cond, colon); + JITDUMP("\nResulting tree:\n"); + DISPTREE(qmark); - JITDUMP("\n\nResulting tree:\n") - DISPTREE(qmark) + if (successMetric != nullptr) + { + (*successMetric)++; + } - return qmark; - } + if (resultType == TYP_VOID) + { + return qmark; } - return call; + + // QMARKs producing a value cannot stand on their own; spill into a temp. + const unsigned tmp = lvaGrabTemp(true DEBUGARG(tmpName)); + impStoreToTemp(tmp, qmark, CHECK_SPILL_ALL); + return gtNewLclvNode(tmp, resultType); } #ifdef DEBUG @@ -7155,6 +7176,65 @@ void Compiler::addFatPointerCandidate(GenTreeCall* call) helper.StoreRetExprResultsInArgs(call); } +//------------------------------------------------------------------------ +// pickProfiledValue: Use profile information to pick a value candidate for the given IL offset. +// +// Arguments: +// ilOffset - exact IL offset of the value-profile probe site +// pLikelihood - [out] likelihood of the picked value +// pValue - [out] the picked value +// +// Return Value: +// true if a value was picked, false otherwise +// +bool Compiler::pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssize_t* pValue) +{ + assert(pLikelihood != nullptr); + assert(pValue != nullptr); + + // Default the outputs to safe values so callers that ignore the return value + // (or pass uninitialized locals) can't observe garbage. + *pLikelihood = 0; + *pValue = 0; + +#if DEBUG + // Request 8 likely values in debug to get more information in JitDump. + const unsigned MaxLikelyValues = 8; +#else + const unsigned MaxLikelyValues = 1; +#endif + + LikelyValueRecord likelyValues[MaxLikelyValues] = {}; + UINT32 valuesCount = getLikelyValues(likelyValues, MaxLikelyValues, fgPgoSchema, fgPgoSchemaCount, fgPgoData, + static_cast(ilOffset)); + assert(valuesCount <= MaxLikelyValues); + +#if DEBUG + JITDUMP("%u likely values:\n", valuesCount); + for (UINT32 i = 0; i < valuesCount; i++) + { + JITDUMP(" %u) %zd - %u%%\n", i, likelyValues[i].value, likelyValues[i].likelihood); + } + + // Re-use JitRandomGuardedDevirtualization for stress-testing. + if (JitConfig.JitRandomGuardedDevirtualization() != 0) + { + CLRRandom* random = impInlineRoot()->m_inlineStrategy->GetRandom(JitConfig.JitRandomGuardedDevirtualization()); + valuesCount = max(valuesCount, 1u); + likelyValues[0].value = random->Next(256); + likelyValues[0].likelihood = 100; + } +#endif + + if (valuesCount >= 1) + { + *pValue = likelyValues[0].value; + *pLikelihood = likelyValues[0].likelihood; + return true; + } + return false; +} + //------------------------------------------------------------------------ // pickGDV: Use profile information to pick a GDV/cast type candidate for a call site. // diff --git a/src/coreclr/jit/jitmetadatalist.h b/src/coreclr/jit/jitmetadatalist.h index 215ea82ead5606..73e540437e3318 100644 --- a/src/coreclr/jit/jitmetadatalist.h +++ b/src/coreclr/jit/jitmetadatalist.h @@ -95,6 +95,9 @@ JITMETADATAMETRIC(MorphTrackedLocals, int, 0) JITMETADATAMETRIC(MorphLocals, int, 0) JITMETADATAMETRIC(EnumeratorGDVProvisionalNoEscape, int, 0) JITMETADATAMETRIC(EnumeratorGDVCanCloneToEnsureNoEscape, int, 0) +JITMETADATAMETRIC(ValueProfiledLclHeap, int, JIT_METADATA_HIGHER_IS_BETTER) +JITMETADATAMETRIC(ValueProfiledMemmove, int, JIT_METADATA_HIGHER_IS_BETTER) +JITMETADATAMETRIC(ValueProfiledSequenceEqual, int, JIT_METADATA_HIGHER_IS_BETTER) #undef JITMETADATA #undef JITMETADATAINFO