diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 2375158e7d11b1..b6d1c6fd70f9ad 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -532,12 +532,24 @@ bool IntegralRange::Contains(int64_t value) const // CAST(ulong/long <- int) - [INT_MIN..INT_MAX] if (!cast->gtOverflow()) { - if ((fromType == TYP_INT) && fromUnsigned) + IntegralRange typeRange = (fromType == TYP_INT) && fromUnsigned + ? IntegralRange{SymbolicIntegerValue::Zero, SymbolicIntegerValue::UIntMax} + : IntegralRange{SymbolicIntegerValue::IntMin, SymbolicIntegerValue::IntMax}; + + if (!compiler->opts.MinOpts()) { - return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::UIntMax}; + // Refine using the operand's known range: when the operand fits entirely + // inside what the cast can represent, the cast preserves the value, so we + // can tighten the output range to the operand's range. This handles cases + // like CAST(int <- ulong) of a value already known to be in uint16 range. + IntegralRange operandRange = ForNode(cast->CastOp(), compiler); + if (typeRange.Contains(operandRange)) + { + return operandRange; + } } - return {SymbolicIntegerValue::IntMin, SymbolicIntegerValue::IntMax}; + return typeRange; } SymbolicIntegerValue lowerBound; diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 583892ca426796..f8c7c82922fe1c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7884,6 +7884,7 @@ class Compiler #define OMF_HAS_STACK_ARRAY 0x00100000 // Method contains stack allocated arrays #define OMF_HAS_BOUNDS_CHECKS 0x00200000 // Method contains bounds checks #define OMF_HAS_EARLY_QMARKS 0x00400000 // Method contains early expandable QMARKs +#define OMF_HAS_UMOD_BY_CONST_UINT16_CANDIDATE 0x00800000 // Method contains a MOD/UMOD that may turn into a uint16 FastMod candidate during the range check phase // clang-format on @@ -7924,6 +7925,16 @@ class Compiler optMethodFlags |= OMF_HAS_BOUNDS_CHECKS; } + bool doesMethodHaveUModByConstUInt16Candidate() + { + return (optMethodFlags & OMF_HAS_UMOD_BY_CONST_UINT16_CANDIDATE) != 0; + } + + void setMethodHasUModByConstUInt16Candidate() + { + optMethodFlags |= OMF_HAS_UMOD_BY_CONST_UINT16_CANDIDATE; + } + bool doesMethodHaveExpandableCasts() { return (optMethodFlags & OMF_HAS_EXPANDABLE_CAST) != 0; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 1946ace685bc22..162c5f9f828582 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -2672,8 +2672,8 @@ bool GenTree::Compare(GenTree* op1, GenTree* op2, bool swapOK) } if (op1->OperIs(GT_MOD, GT_UMOD, GT_DIV, GT_UDIV)) { - if ((op1->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW)) != - (op2->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW))) + if ((op1->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW | GTF_UMOD_UINT16_OPERANDS)) != + (op2->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW | GTF_UMOD_UINT16_OPERANDS))) { return false; } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index a71ed68351c9b4..c04e3fb6f44bb4 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -528,6 +528,8 @@ enum GenTreeFlags : unsigned GTF_DIV_MOD_NO_OVERFLOW = 0x40000000, // GT_DIV, GT_MOD -- Div or mod definitely does not overflow. + GTF_UMOD_UINT16_OPERANDS = 0x80000000, // GT_UMOD -- Both operands to a mod are in uint16 range. The divisor is a non-zero constant. + GTF_CHK_INDEX_INBND = 0x80000000, // GT_BOUNDS_CHECK -- have proven this check is always in-bounds GTF_ARRLEN_NONFAULTING = 0x20000000, // GT_ARR_LENGTH -- An array length operation that cannot fault. Same as GT_IND_NONFAULTING. diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6e7f6824ecea3a..44053912b2c3c3 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -8088,7 +8088,10 @@ bool Lowering::TryLowerConstIntUDivOrUMod(GenTreeOp* divMod) assert(varTypeIsFloating(divMod->TypeGet())); #endif // USE_HELPERS_FOR_INT_DIV #if defined(TARGET_ARM64) - assert(!divMod->OperIs(GT_UMOD)); + // ARM64 has no remainder instruction. Morph rewrites every non-pow2 MOD/UMOD + // into a SUB-MUL-DIV form except when the FastMod path here can apply, which + // requires a constant uint16 divisor (and a uint16-range dividend). + assert(!divMod->OperIs(GT_UMOD) || divMod->gtGetOp2()->IsCnsIntOrI()); #endif // TARGET_ARM64 GenTree* dividend = divMod->gtGetOp1(); @@ -8170,10 +8173,78 @@ bool Lowering::TryLowerConstIntUDivOrUMod(GenTreeOp* divMod) } } + assert(divisorValue >= 3); + // TODO-ARM-CQ: Currently there's no GT_MULHI for ARM32 #if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) - if (!m_compiler->opts.MinOpts() && (divisorValue >= 3)) + if (!m_compiler->opts.MinOpts()) { +#ifdef TARGET_64BIT + // Replace (uint16 % uint16) with a cheaper variant of FastMod, specialized for 16-bit operands. + // + // uint multiplier = uint.MaxValue / divisor + 1; + // ulong result = ((ulong)(dividend * multiplier) * divisor) >> 32; + // return (int)result; + // + // The dividend is first truncated to TYP_INT (safe because we know it fits in + // uint16), and the final value (always in [0, divisor)) is widened back to the + // mod's result type. The dividend's range may have been recovered either + // statically by IntegralRange::ForNode at this point, or earlier by the range + // check phase (which leaves GTF_UMOD_UINT16_OPERANDS for us). + if (!isDiv && FitsIn(divisorValue) && + (((divMod->gtFlags & GTF_UMOD_UINT16_OPERANDS) != 0) || + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(dividend, m_compiler)))) + { + GenTree* dividendInt = dividend; + if (genActualType(dividend) != TYP_INT) + { + assert(genActualType(dividend) == TYP_LONG); + dividendInt = m_compiler->gtNewCastNode(TYP_INT, dividend, /* unsigned */ true, TYP_INT); + BlockRange().InsertBefore(divMod, dividendInt); + } + + GenTree* multiplier = + m_compiler->gtNewIconNode(static_cast((UINT32_MAX / divisorValue) + 1), TYP_INT); + GenTree* mul1 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, dividendInt, multiplier); + mul1->SetUnsigned(); + GenTree* castUp = m_compiler->gtNewCastNode(TYP_LONG, mul1, /* unsigned */ true, TYP_LONG); + BlockRange().InsertBefore(divMod, multiplier, mul1, castUp); + + // Reuse the existing constant divisor as a TYP_LONG operand for the second multiply. + divisor->BashToConst(static_cast(divisorValue), TYP_LONG); + + GenTree* mul2 = m_compiler->gtNewOperNode(GT_MUL, TYP_LONG, castUp, divisor); + mul2->SetUnsigned(); + GenTree* shiftAmount = m_compiler->gtNewIconNode(32); + BlockRange().InsertBefore(divMod, mul2, shiftAmount); + + GenTree* firstNode = (dividendInt == dividend) ? multiplier : dividendInt; + + if (type == TYP_LONG) + { + // The shift result is already TYP_LONG; turn divMod itself into the shift. + divMod->ChangeOper(GT_RSZ); + divMod->gtOp1 = mul2; + divMod->gtOp2 = shiftAmount; + } + else + { + assert(type == TYP_INT); + GenTree* shift = m_compiler->gtNewOperNode(GT_RSZ, TYP_LONG, mul2, shiftAmount); + BlockRange().InsertBefore(divMod, shift); + + divMod->ChangeOper(GT_CAST); + divMod->AsCast()->gtCastType = TYP_INT; + divMod->gtOp1 = shift; + divMod->gtOp2 = nullptr; + divMod->SetUnsigned(); + } + + ContainCheckRange(firstNode, divMod); + return true; + } +#endif // TARGET_64BIT + size_t magic; bool increment; int preShift; diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index f5dd9fe9bef06a..c3304056ed2101 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -7424,11 +7424,44 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, bool* optAssertionPropDone) else if (tree->OperIs(GT_MOD) && op2->IsIntegralConst() && !op2->IsIntegralConstAbsPow2()) #endif { +#ifdef TARGET_64BIT + // If the divisor is a positive uint16 constant and the dividend + // is statically provable as uint16, lower can emit a cheaper + // FastMod sequence specialized for 16-bit operands. Convert MOD + // to UMOD here so lowering picks it up, and skip the SUB-MUL-DIV + // rewrite below. + if (!opts.MinOpts() && op2->IsCnsIntOrI()) + { + ssize_t modDivisor = op2->AsIntCon()->IconValue(); + if ((modDivisor > 0) && FitsIn(modDivisor) && + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(op1, this))) + { + if (tree->OperIs(GT_MOD)) + { + tree->SetOper(GT_UMOD); + } + break; + } + } +#endif // TARGET_64BIT + // Transformation: a % b = a - (a / b) * b; tree = fgMorphModToSubMulDiv(tree->AsOp()); op1 = tree->AsOp()->gtOp1; op2 = tree->AsOp()->gtOp2; } +#if defined(TARGET_64BIT) && !defined(TARGET_ARM64) + else if (!opts.MinOpts() && tree->OperIs(GT_MOD, GT_UMOD) && !op2->IsIntegralConst() && + ((typ == TYP_INT) || (typ == TYP_LONG)) && + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(op1, this))) + { + // The dividend structurally fits in uint16 but the divisor + // hasn't folded yet (e.g. a span length computed from an RVA + // static). Let the range check phase re-examine such nodes + // after VN/CSE in case the divisor becomes a uint16 constant. + setMethodHasUModByConstUInt16Candidate(); + } +#endif // TARGET_64BIT && !TARGET_ARM64 break; USE_HELPER_FOR_ARITH: diff --git a/src/coreclr/jit/rangecheck.cpp b/src/coreclr/jit/rangecheck.cpp index 5d3326548ec260..29ee78ae736a93 100644 --- a/src/coreclr/jit/rangecheck.cpp +++ b/src/coreclr/jit/rangecheck.cpp @@ -14,12 +14,21 @@ // PhaseStatus Compiler::rangeCheckPhase() { - if (!doesMethodHaveBoundsChecks() || (fgSsaPassesCompleted == 0)) + if ((!doesMethodHaveBoundsChecks() && !doesMethodHaveUModByConstUInt16Candidate()) || (fgSsaPassesCompleted == 0)) { return PhaseStatus::MODIFIED_NOTHING; } - const bool madeChanges = GetRangeCheck()->OptimizeRangeChecks(); + RangeCheck* rc = GetRangeCheck(); + bool madeChanges = false; + if (doesMethodHaveBoundsChecks()) + { + madeChanges = rc->OptimizeRangeChecks(); + } + if (doesMethodHaveUModByConstUInt16Candidate()) + { + madeChanges |= rc->TryMarkUModUInt16Operands(); + } return madeChanges ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING; } @@ -2401,3 +2410,87 @@ bool RangeCheck::OptimizeRangeChecks() return madeChanges; } + +//------------------------------------------------------------------------ +// TryMarkUModUInt16Operands: detect GT_MOD/GT_UMOD nodes where the divisor is +// a positive uint16 constant and the dividend is provably in uint16 range. +// Such nodes are converted to GT_UMOD (if signed) and tagged with +// GTF_UMOD_UINT16_OPERANDS so that lowering can emit a cheaper FastMod +// sequence specialized for 16-bit operands. +// +// Return Value: +// True if any node was modified. +// +bool RangeCheck::TryMarkUModUInt16Operands() +{ + bool madeChanges = false; + + for (BasicBlock* const block : m_compiler->Blocks()) + { + for (Statement* const stmt : block->Statements()) + { + for (GenTree* const tree : stmt->TreeList()) + { + if (!tree->OperIs(GT_MOD, GT_UMOD)) + { + continue; + } + + if ((tree->gtFlags & GTF_UMOD_UINT16_OPERANDS) != 0) + { + continue; + } + + if (!tree->TypeIs(TYP_INT, TYP_LONG)) + { + continue; + } + + GenTree* divisor = tree->gtGetOp2(); + if (!divisor->IsCnsIntOrI()) + { + continue; + } + + ssize_t divisorValue = divisor->AsIntCon()->IconValue(); + if ((divisorValue <= 0) || !FitsIn(divisorValue)) + { + continue; + } + + GenTree* dividend = tree->gtGetOp1(); + + bool dividendFitsUint16 = + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(dividend, m_compiler)); + + if (!dividendFitsUint16 && (m_compiler->vnStore != nullptr)) + { + // Fall back to a richer VN-/assertion-based range analysis. + Range range = Range(Limit(Limit::keUndef)); + if (TryGetRange(block, dividend, &range) && range.IsConstantRange()) + { + const int lower = range.LowerLimit().GetConstant(); + const int upper = range.UpperLimit().GetConstant(); + dividendFitsUint16 = (lower >= 0) && (upper <= UINT16_MAX); + } + } + + if (!dividendFitsUint16) + { + continue; + } + + JITDUMP("Marking [%06u] as a uint16 UMOD candidate\n", Compiler::dspTreeID(tree)); + + if (tree->OperIs(GT_MOD)) + { + tree->SetOper(GT_UMOD); + } + tree->gtFlags |= GTF_UMOD_UINT16_OPERANDS; + madeChanges = true; + } + } + } + + return madeChanges; +} diff --git a/src/coreclr/jit/rangecheck.h b/src/coreclr/jit/rangecheck.h index c9f189deeb73de..89abee415406f8 100644 --- a/src/coreclr/jit/rangecheck.h +++ b/src/coreclr/jit/rangecheck.h @@ -773,6 +773,12 @@ class RangeCheck // and assertion prop phases are completed. bool OptimizeRangeChecks(); + // Walks the IR looking for "uint16 % const-uint16" patterns and tags eligible + // GT_UMOD nodes with GTF_UMOD_UINT16_OPERANDS so lowering can emit a cheaper + // FastMod sequence. GT_MOD nodes whose dividend is non-negative and whose + // divisor is a positive uint16 constant are converted to GT_UMOD. + bool TryMarkUModUInt16Operands(); + bool TryGetRange(BasicBlock* block, GenTree* expr, Range* pRange); // Cheaper version of TryGetRange that is based only on incoming assertions. diff --git a/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs new file mode 100644 index 00000000000000..eec5af4a8dd5bc --- /dev/null +++ b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs @@ -0,0 +1,169 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using Xunit; + +public class Program +{ + private static ushort s_field1; + private static ulong s_field2; + + [MethodImpl(MethodImplOptions.NoInlining)] + public static uint Umod_U4_CharByZero(char c) + { + return (uint)c % 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ushort Umod_U2_CharByConst(char c) + { + return (ushort)(c % 42); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static int Umod_I4_CharByConst(char c) + { + return c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static uint Umod_U4_CharByConst(char c) + { + return (uint)c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static long Umod_I8_CharByConst(char c) + { + return (long)c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ulong Umod_U8_CharByConst(char c) + { + return (ulong)c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static bool TestIsWhiteSpace(char c) + { + ReadOnlySpan HashEntries = [' ', ' ', '\u00A0', ' ', ' ', ' ', ' ', ' ', ' ', '\t', '\n', '\v', '\f', '\r', ' ', ' ', '\u2028', '\u2029', ' ', ' ', ' ', ' ', ' ', '\u202F', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u3000', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u0085', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A', ' ', ' ', ' ', ' ', ' ', '\u205F', '\u1680', ' ', ' ', ' ', ' ', ' ', ' ']; + return HashEntries[c % HashEntries.Length] == c; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ushort Umod_TZC(ulong value) + { + return (ushort)(BitOperations.TrailingZeroCount(value) % 42); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ushort Umod_TZC_Intrinsic(ulong value) + { + return (ushort)(Bmi1.X64.TrailingZeroCount(value) % 42); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static int Umod_UInt16Range_ByConst(int value) + { + if (value is > 0 and < 1234) + { + return value % 123; + } + + return -1; + } + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + private static void Test1() + { + for (int i = 0; i < 2; i++) + { + Core(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void Core() + { + s_field1 = (ushort)(Bmi1.X64.TrailingZeroCount(s_field2) % 42); + } + } + + [Fact] + public static int TestEntryPoint() + { + try + { + Umod_U4_CharByZero('a'); + return 0; + } + catch (DivideByZeroException) { } + + if (Umod_U2_CharByConst('a') != 13) + return 0; + + if (Umod_I4_CharByConst('a') != 13) + return 0; + + if (Umod_U4_CharByConst('a') != 13) + return 0; + + if (Umod_I8_CharByConst('a') != 13) + return 0; + + if (Umod_U8_CharByConst('a') != 13) + return 0; + + if (!TestIsWhiteSpace(' ')) + return 0; + + if (!TestIsWhiteSpace('\u2029')) + return 0; + + if (TestIsWhiteSpace('\0')) + return 0; + + if (TestIsWhiteSpace('a')) + return 0; + + if (Umod_TZC(1L << 40) != 40) + return 0; + + if (Umod_TZC(1L << 50) != 8) + return 0; + + if (Bmi1.X64.IsSupported) + { + if (Umod_TZC_Intrinsic(1L << 40) != 40) + return 0; + + if (Umod_TZC_Intrinsic(1L << 50) != 8) + return 0; + } + + if (Umod_UInt16Range_ByConst(0) != -1) + return 0; + + if (Umod_UInt16Range_ByConst(42) != 42) + return 0; + + if (Umod_UInt16Range_ByConst(123) != 0) + return 0; + + if (Bmi1.X64.IsSupported) + { + s_field1 = 1; + s_field2 = 1L << 50; + Test1(); + + if (s_field1 != 8) + return 0; + } + + return 100; + } +} diff --git a/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj new file mode 100644 index 00000000000000..501217e4d86892 --- /dev/null +++ b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj @@ -0,0 +1,9 @@ + + + None + True + + + + +