From e394fa42335ef25e761a0753cd0658a8158c7eb0 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 10 Jul 2024 14:25:57 +0200 Subject: [PATCH 1/4] Update docs --- src/coreclr/jit/inductionvariableopts.cpp | 63 ++++++++++++++--------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/src/coreclr/jit/inductionvariableopts.cpp b/src/coreclr/jit/inductionvariableopts.cpp index bdbece4cc3614e..fe38b6a37bbbd3 100644 --- a/src/coreclr/jit/inductionvariableopts.cpp +++ b/src/coreclr/jit/inductionvariableopts.cpp @@ -5,29 +5,46 @@ // scalar evolution analysis (see scev.h and scev.cpp for more information // about the scalar evolution analysis). // -// Currently the only optimization done is widening of primary induction -// variables from 32 bits into 64 bits. This is generally only profitable on -// x64 that does not allow zero extension of 32-bit values in addressing modes -// (in contrast, arm64 does have the capability of including zero extensions in -// addressing modes). For x64 this saves a zero extension for every array -// access inside the loop, in exchange for some widening or narrowing stores -// outside the loop: -// - To make sure the new widened IV starts at the right value it is -// initialized to the value of the narrow IV outside the loop (either in the -// preheader or at the def location of the narrow IV). Usually the start -// value is a constant, in which case the widened IV is just initialized to -// the constant value. -// - If the narrow IV is used after the loop we need to store it back from -// the widened IV in the exits. We depend on liveness sets to figure out -// which exits to insert IR into. -// -// These steps ensure that the wide IV has the right value to begin with and -// the old narrow IV still has the right value after the loop. Additionally, -// we must replace every use of the narrow IV inside the loop with the widened -// IV. This is done by a traversal of the IR inside the loop. We do not -// actually widen the uses of the IV; rather, we keep all uses and defs as -// 32-bit, which the backend is able to handle efficiently on x64. Because of -// this we do not need to worry about overflow. +// Currently the following optimizations are done: +// +// IV widening: +// This widens primary induction variables from 32 bits into 64 bits. This is +// generally only profitable on x64 that does not allow zero extension of +// 32-bit values in addressing modes (in contrast, arm64 does have the +// capability of including zero extensions in addressing modes). For x64 this +// saves a zero extension for every array access inside the loop, in exchange +// for some widening or narrowing stores outside the loop: +// - To make sure the new widened IV starts at the right value it is +// initialized to the value of the narrow IV outside the loop (either in +// the preheader or at the def location of the narrow IV). Usually the +// start value is a constant, in which case the widened IV is just +// initialized to the constant value. +// - If the narrow IV is used after the loop we need to store it back from +// the widened IV in the exits. We depend on liveness sets to figure out +// which exits to insert IR into. +// +// These steps ensure that the wide IV has the right value to begin with and +// the old narrow IV still has the right value after the loop. Additionally, +// we must replace every use of the narrow IV inside the loop with the widened +// IV. This is done by a traversal of the IR inside the loop. We do not +// actually widen the uses of the IV; rather, we keep all uses and defs as +// 32-bit, which the backend is able to handle efficiently on x64. Because of +// this we do not need to worry about overflow. +// +// Loop reversing: +// This converts loops that are up-counted into loops that are down-counted. +// Down-counted loops can generally do their IV update and compare in a +// single instruction, bypassing the need to do a separate comparison with a +// bound. +// +// Strength reduction (disabled): +// This changes the stride of primary IVs in a loop to avoid more expensive +// multiplications inside the loop. Commonly the primary IVs are only used +// for indexing memory at some element size, which can end up with these +// multiplications. +// +// Strength reduction frequently relies on reversing the loop to remove the +// last non-multiplied use of the primary IV. // #include "jitpch.h" From 36c645a01dffd3c16a51c92d1b99ebf75ced382e Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 10 Jul 2024 17:01:45 +0200 Subject: [PATCH 2/4] JIT: Prove some cases where strength reducing to GC pointers is ok For loops iterating over arrays we often have bounds that allow us to prove that an add recurrence formed by strength reduction will stay within that array. In these cases we know that forming the byrefs eagerly is ok. For example, when strength reduction is enabled, this changes the codegen of ```csharp private struct S { public int A, B, C; } [MethodImpl(MethodImplOptions.NoInlining)] private static float Sum(S[] ss) { int sum = 0; for (int i = 0; i < ss.Length; i++) { S v = ss[i]; sum += v.A; sum += v.B; sum += v.C; } return sum; } ``` in the following way: ```diff G_M63518_IG03: - mov r11d, 16 + add rcx, 16 ;; size=4 bbWeight=0.25 PerfScore 0.06 G_M63518_IG04: - lea r8, bword ptr [rcx+r11] + mov r8, rcx mov r10d, dword ptr [r8] mov r9d, dword ptr [r8+0x04] mov r8d, dword ptr [r8+0x08] add eax, r10d add eax, r9d add eax, r8d - add r11, 12 + add rcx, 12 dec edx jne SHORT G_M63518_IG04 ;; size=31 bbWeight=4 PerfScore 34.00 ``` --- src/coreclr/jit/assertionprop.cpp | 46 ++++++++++ src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/inductionvariableopts.cpp | 107 ++++++++++++++++++++-- src/coreclr/jit/scev.cpp | 42 +++++++++ src/coreclr/jit/scev.h | 2 + 5 files changed, 192 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 3a3139db4a1577..2e5b463c63d8c8 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -4848,6 +4848,52 @@ AssertionIndex Compiler::optAssertionIsNonNullInternal(GenTree* return NO_ASSERTION_INDEX; } +//------------------------------------------------------------------------ +// optAssertionVNIsNonNull: See if we can prove that the value of a VN is +// non-null using assertions. +// +// Arguments: +// vn - VN to check +// assertions - set of live assertions +// +// Return Value: +// True if the VN could be proven non-null. +// +bool Compiler::optAssertionVNIsNonNull(ValueNum vn, ASSERT_VALARG_TP assertions) +{ + if (vnStore->IsKnownNonNull(vn)) + { + return true; + } + + // Check each assertion to find if we have a vn != null assertion. + // + BitVecOps::Iter iter(apTraits, assertions); + unsigned index = 0; + while (iter.NextElem(&index)) + { + AssertionIndex assertionIndex = GetAssertionIndex(index); + if (assertionIndex > optAssertionCount) + { + break; + } + AssertionDsc* curAssertion = optGetAssertion(assertionIndex); + if (!curAssertion->CanPropNonNull()) + { + continue; + } + + if (curAssertion->op1.vn != vn) + { + continue; + } + + return true; + } + + return false; +} + /***************************************************************************** * * Given a tree consisting of a call and a set of available assertions, we diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 08307977ac6920..b92664bc0652de 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -8025,6 +8025,7 @@ class Compiler AssertionIndex optAssertionIsSubrange(GenTree* tree, IntegralRange range, ASSERT_VALARG_TP assertions); AssertionIndex optAssertionIsSubtype(GenTree* tree, GenTree* methodTableArg, ASSERT_VALARG_TP assertions); AssertionIndex optAssertionIsNonNullInternal(GenTree* op, ASSERT_VALARG_TP assertions DEBUGARG(bool* pVnBased)); + bool optAssertionVNIsNonNull(ValueNum vn, ASSERT_VALARG_TP assertions); bool optAssertionIsNonNull(GenTree* op, ASSERT_VALARG_TP assertions DEBUGARG(bool* pVnBased) DEBUGARG(AssertionIndex* pIndex)); diff --git a/src/coreclr/jit/inductionvariableopts.cpp b/src/coreclr/jit/inductionvariableopts.cpp index fe38b6a37bbbd3..e2b5f6e41043d4 100644 --- a/src/coreclr/jit/inductionvariableopts.cpp +++ b/src/coreclr/jit/inductionvariableopts.cpp @@ -1244,6 +1244,7 @@ class StrengthReductionContext bool InitializeCursors(GenTreeLclVarCommon* primaryIVLcl, ScevAddRec* primaryIV); void AdvanceCursors(ArrayStack* cursors, ArrayStack* nextCursors); bool CheckAdvancedCursors(ArrayStack* cursors, int derivedLevel, ScevAddRec** nextIV); + bool StaysWithinManagedObject(ScevAddRec* addRec); bool TryReplaceUsesWithNewPrimaryIV(ArrayStack* cursors, ScevAddRec* iv); BasicBlock* FindUpdateInsertionPoint(ArrayStack* cursors); @@ -1361,13 +1362,11 @@ bool StrengthReductionContext::TryStrengthReduce() } assert(nextIV != nullptr); - // We need more sanity checks to allow materializing GC-typed add - // recs. Otherwise we may eagerly form a GC pointer that was only - // lazily formed under some conditions before, which can be - // illegal. For now we just bail. - if (varTypeIsGC(nextIV->Type)) + if (varTypeIsGC(nextIV->Type) && !StaysWithinManagedObject(nextIV)) { - JITDUMP(" Next IV has type %s. Bailing.\n", varTypeName(nextIV->Type)); + JITDUMP( + " Next IV computes a GC pointer that we cannot prove to be inside a managed object. Bailing.\n", + varTypeName(nextIV->Type)); break; } @@ -1711,6 +1710,102 @@ bool StrengthReductionContext::CheckAdvancedCursors(ArrayStack* curs return *nextIV != nullptr; } +//------------------------------------------------------------------------ +// StaysWithinManagedObject: Check whether the specified GC-pointer add-rec can +// be guaranteed to be inside the same managed object for the whole loop. +// +// Parameters: +// addRec - The add recurrence +// +// Returns: +// True if we were able to prove so. +// +bool StrengthReductionContext::StaysWithinManagedObject(ScevAddRec* addRec) +{ + int64_t offset; + Scev* baseScev = addRec->Start->PeelAdditions(&offset); + offset = static_cast(offset); + + // We only support arrays here. To strength reduce Span accesses we need + // additional properies on the range designated by a Span that we + // currently do not specify, or we need to prove that the byref we may form + // in the IV update would have been formed anyway by the loop. + if (!baseScev->OperIs(ScevOper::Local) || !baseScev->TypeIs(TYP_REF)) + { + return false; + } + + ScevLocal* local = (ScevLocal*)baseScev; + LclVarDsc* dsc = m_comp->lvaGetDesc(local->LclNum); + if ((dsc->lvClassHnd == NO_CLASS_HANDLE) || !m_comp->info.compCompHnd->isSDArray(dsc->lvClassHnd)) + { + return false; + } + + ValueNum vn = m_scevContext.MaterializeVN(baseScev); + if (vn == ValueNumStore::NoVN) + { + return false; + } + + BasicBlock* preheader = m_loop->EntryEdge(0)->getSourceBlock(); + if (!m_comp->optAssertionVNIsNonNull(vn, preheader->bbAssertionOut)) + { + return false; + } + + // We have a non-null array. Check that the 'start' offset looks fine. + // TODO: We could also use assertions on the length of the array. E.g. if + // we know the length of the array is > 3, then we can allow the add rec to + // have a later start. Maybe range check can be used? + if ((offset < 0) || (offset > OFFSETOF__CORINFO_Array__data)) + { + return false; + } + + // Now see if we have a bound that guarantees that we iterate less than the + // array length's times. + for (int i = 0; i < m_backEdgeBounds.Height(); i++) + { + // TODO: EvaluateRelop ought to be powerful enough to prove something + // like bound < ARR_LENGTH(vn), but it is not able to prove that + // currently, even for bound = ARR_LENGTH(vn) - 1 (common case). + Scev* bound = m_backEdgeBounds.Bottom(i); + + int64_t boundOffset; + Scev* boundBase = bound->PeelAdditions(&boundOffset); + + if (bound->TypeIs(TYP_INT)) + { + boundOffset = static_cast(boundOffset); + } + + if (boundOffset >= 0) + { + // If we take the backedge >= the array length times, then we would + // advance the addrec past the end. + continue; + } + + ValueNum boundBaseVN = m_scevContext.MaterializeVN(boundBase); + + VNFuncApp vnf; + if (!m_comp->vnStore->GetVNFunc(boundBaseVN, &vnf)) + { + continue; + } + + if ((vnf.m_func != VNF_ARR_LENGTH) || (vnf.m_args[0] != vn)) + { + continue; + } + + return true; + } + + return false; +} + //------------------------------------------------------------------------ // TryReplaceUsesWithNewPrimaryIV: Perform final sanity checks before // introducing a new primary IV and replacing the uses represented by the diff --git a/src/coreclr/jit/scev.cpp b/src/coreclr/jit/scev.cpp index 9053d47bbf55d0..617d3e0a428a67 100644 --- a/src/coreclr/jit/scev.cpp +++ b/src/coreclr/jit/scev.cpp @@ -220,6 +220,48 @@ bool Scev::IsInvariant() return result != ScevVisit::Abort; } +//------------------------------------------------------------------------ +// Scev::PeelAdditions: Peel the aditions from a SCEV and return the base SCEV +// and the sum of the offsets peeled. +// +// Parameters: +// offset - [out] The sum of offsets peeled +// +// Returns: +// The base SCEV. +// +// Remarks: +// If the SCEV is 32-bits, the user is expected to apply the proper +// truncation (or extension into 64-bit). +// +Scev* Scev::PeelAdditions(int64_t* offset) +{ + *offset = 0; + + Scev* scev = this; + while (scev->OperIs(ScevOper::Add)) + { + Scev* op1 = ((ScevBinop*)scev)->Op1; + Scev* op2 = ((ScevBinop*)scev)->Op2; + if (op1->OperIs(ScevOper::Constant)) + { + *offset += ((ScevConstant*)op1)->Value; + scev = op2; + } + else if (op2->OperIs(ScevOper::Constant)) + { + *offset += ((ScevConstant*)op2)->Value; + scev = op1; + } + else + { + break; + } + } + + return scev; +} + //------------------------------------------------------------------------ // Scev::Equals: Check if two SCEV trees are equal. // diff --git a/src/coreclr/jit/scev.h b/src/coreclr/jit/scev.h index e965137b90f1a9..fec05231ce34c8 100644 --- a/src/coreclr/jit/scev.h +++ b/src/coreclr/jit/scev.h @@ -75,6 +75,8 @@ struct Scev bool IsInvariant(); + Scev* PeelAdditions(int64_t* offset); + static bool Equals(Scev* left, Scev* right); }; From 53846ae6dbe06b66c637a3cab88d2f9b86c0c38e Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 10 Jul 2024 17:30:44 +0200 Subject: [PATCH 3/4] Use ARR_ADDR instead of class handle metadata --- src/coreclr/jit/inductionvariableopts.cpp | 39 +++++++++++++++++++---- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/inductionvariableopts.cpp b/src/coreclr/jit/inductionvariableopts.cpp index e2b5f6e41043d4..900ee7f60244ec 100644 --- a/src/coreclr/jit/inductionvariableopts.cpp +++ b/src/coreclr/jit/inductionvariableopts.cpp @@ -1244,7 +1244,7 @@ class StrengthReductionContext bool InitializeCursors(GenTreeLclVarCommon* primaryIVLcl, ScevAddRec* primaryIV); void AdvanceCursors(ArrayStack* cursors, ArrayStack* nextCursors); bool CheckAdvancedCursors(ArrayStack* cursors, int derivedLevel, ScevAddRec** nextIV); - bool StaysWithinManagedObject(ScevAddRec* addRec); + bool StaysWithinManagedObject(ArrayStack* cursors, ScevAddRec* addRec); bool TryReplaceUsesWithNewPrimaryIV(ArrayStack* cursors, ScevAddRec* iv); BasicBlock* FindUpdateInsertionPoint(ArrayStack* cursors); @@ -1362,7 +1362,7 @@ bool StrengthReductionContext::TryStrengthReduce() } assert(nextIV != nullptr); - if (varTypeIsGC(nextIV->Type) && !StaysWithinManagedObject(nextIV)) + if (varTypeIsGC(nextIV->Type) && !StaysWithinManagedObject(nextCursors, nextIV)) { JITDUMP( " Next IV computes a GC pointer that we cannot prove to be inside a managed object. Bailing.\n", @@ -1715,12 +1715,13 @@ bool StrengthReductionContext::CheckAdvancedCursors(ArrayStack* curs // be guaranteed to be inside the same managed object for the whole loop. // // Parameters: -// addRec - The add recurrence +// cursors - Cursors pointing to next uses that correspond to the specific add-rec. +// addRec - The add recurrence // // Returns: // True if we were able to prove so. // -bool StrengthReductionContext::StaysWithinManagedObject(ScevAddRec* addRec) +bool StrengthReductionContext::StaysWithinManagedObject(ArrayStack* cursors, ScevAddRec* addRec) { int64_t offset; Scev* baseScev = addRec->Start->PeelAdditions(&offset); @@ -1735,13 +1736,37 @@ bool StrengthReductionContext::StaysWithinManagedObject(ScevAddRec* addRec) return false; } - ScevLocal* local = (ScevLocal*)baseScev; - LclVarDsc* dsc = m_comp->lvaGetDesc(local->LclNum); - if ((dsc->lvClassHnd == NO_CLASS_HANDLE) || !m_comp->info.compCompHnd->isSDArray(dsc->lvClassHnd)) + // Now use the fact that we keep ARR_ADDRs in the IR when we have array + // accesses. + GenTreeArrAddr* arrAddr = nullptr; + for (int i = 0; i < cursors->Height(); i++) + { + CursorInfo& cursor = cursors->BottomRef(i); + GenTree* parent = cursor.Tree->gtGetParent(nullptr); + if ((parent != nullptr) && parent->OperIs(GT_ARR_ADDR)) + { + arrAddr = parent->AsArrAddr(); + break; + } + } + + if (arrAddr == nullptr) { return false; } + unsigned arrElemSize = arrAddr->GetElemType() == TYP_STRUCT + ? m_comp->typGetObjLayout(arrAddr->GetElemClassHandle())->GetSize() + : genTypeSize(arrAddr->GetElemType()); + + int64_t stepCns; + if (!addRec->Step->GetConstantValue(m_comp, &stepCns) || ((unsigned)stepCns > arrElemSize)) + { + return false; + } + + ScevLocal* local = (ScevLocal*)baseScev; + ValueNum vn = m_scevContext.MaterializeVN(baseScev); if (vn == ValueNumStore::NoVN) { From 6e503da090f9c3293fdb7e59470e994325cc2159 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 10 Jul 2024 17:54:40 +0200 Subject: [PATCH 4/4] Fix build --- src/coreclr/jit/inductionvariableopts.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/inductionvariableopts.cpp b/src/coreclr/jit/inductionvariableopts.cpp index 900ee7f60244ec..ae8b9c62ebb964 100644 --- a/src/coreclr/jit/inductionvariableopts.cpp +++ b/src/coreclr/jit/inductionvariableopts.cpp @@ -1783,7 +1783,7 @@ bool StrengthReductionContext::StaysWithinManagedObject(ArrayStack* // TODO: We could also use assertions on the length of the array. E.g. if // we know the length of the array is > 3, then we can allow the add rec to // have a later start. Maybe range check can be used? - if ((offset < 0) || (offset > OFFSETOF__CORINFO_Array__data)) + if ((offset < 0) || (offset > (int64_t)OFFSETOF__CORINFO_Array__data)) { return false; }