From ec31dfe675122249070955ceb55b24c995a605f6 Mon Sep 17 00:00:00 2001 From: Swapnil Gaikwad Date: Thu, 30 Nov 2023 17:24:37 +0000 Subject: [PATCH 1/5] Use multi-reg load/store for EncodeToUtf8 --- .../src/System/Buffers/Text/Base64Encoder.cs | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Encoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Encoder.cs index 08ca62b533f510..af77de7f3c8813 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Encoder.cs @@ -85,6 +85,15 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span goto DoneExit; } + end = srcMax - 48; + if (AdvSimd.Arm64.IsSupported && (end >= src)) + { + NeonEncode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); + + if (src == srcEnd) + goto DoneExit; + } + end = srcMax - 16; if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src)) { @@ -480,6 +489,60 @@ private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, b destBytes = dest; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + private static unsafe void NeonEncode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) + { + // C# implementatino of https://github.com/aklomp/base64/blob/e516d769a2a432c08404f1981e73b431566057be/lib/arch/neon64/enc_loop.c + // If we have Neon support, pick off 48 bytes at a time for as long as we can. + byte* src = srcBytes; + byte* dest = destBytes; + + Vector128 str1; + Vector128 str2; + Vector128 str3; + + Vector128 res1; + Vector128 res2; + Vector128 res3; + Vector128 res4; + + Vector128 tbl_enc = Vector128.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"u8).AsByte(); + do + { + // Load 48 bytes and deinterleave: + AssertRead>(src, srcStart, sourceLength); + (str1, str2, str3) = AdvSimd.Arm64.LoadVector128x3(src); + + // Divide bits of three input bytes over four output bytes: + res1 = AdvSimd.ShiftRightLogical(str1, 2); + res2 = AdvSimd.ShiftRightLogical(str2, 4) | AdvSimd.ShiftLeftLogical(str1, 4); + res3 = AdvSimd.ShiftRightLogical(str3, 6) | AdvSimd.ShiftLeftLogical(str2, 2); + res4 = str3; + + // Clear top two bits: + res1 &= AdvSimd.DuplicateToVector128(0x3F).AsByte(); + res2 &= AdvSimd.DuplicateToVector128(0x3F).AsByte(); + res3 &= AdvSimd.DuplicateToVector128(0x3F).AsByte(); + res4 &= AdvSimd.DuplicateToVector128(0x3F).AsByte(); + + // The bits have now been shifted to the right locations; + // translate their values 0..63 to the Base64 alphabet. + // Use a 64-byte table lookup: + res1 = AdvSimd.Arm64.VectorTableLookup(tbl_enc, res1); + res2 = AdvSimd.Arm64.VectorTableLookup(tbl_enc, res2); + res3 = AdvSimd.Arm64.VectorTableLookup(tbl_enc, res3); + res4 = AdvSimd.Arm64.VectorTableLookup(tbl_enc, res4); + + // Interleave and store result: + AssertWrite>(dest, destStart, destLength); + AdvSimd.Arm64.StoreVector128x4AndZip(dest, (res1, res2, res3, res4)); + + src += 48; + dest += 64; + } while (src <= srcEnd); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] From cb5750d64666a408c581ffe5e622db6913b64c33 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Mon, 4 Dec 2023 11:53:42 -0800 Subject: [PATCH 2/5] Do not allocate consecutive register if one of them is in use at current location --- src/coreclr/jit/lsraarm64.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 3914af3fa5e18b..991513cfa33cbd 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -144,11 +144,14 @@ bool LinearScan::canAssignNextConsecutiveRegisters(RefPosition* firstRefPosition nextRefPosition = getNextConsecutiveRefPosition(nextRefPosition); } - // If regToAssign is not free, check if it is already assigned to the interval corresponding - // to the subsequent nextRefPosition. If yes, it would just use regToAssign for that nextRefPosition. - if ((nextRefPosition->getInterval() != nullptr) && - (nextRefPosition->getInterval()->assignedReg != nullptr) && - ((nextRefPosition->getInterval()->assignedReg->regNum == regToAssign))) + Interval* interval = nextRefPosition->getInterval(); + + // If regToAssign is not free, make sure it is not in use at current location. + // If not, then check if it is already assigned to the interval corresponding + // to the subsequent nextRefPosition. + // If yes, it would just use regToAssign for that nextRefPosition. + if ((interval != nullptr) && !isRegInUse(regToAssign, interval->registerType) && + (interval->assignedReg != nullptr) && ((interval->assignedReg->regNum == regToAssign))) { continue; } From 6e8b09624165bca37b33336d0c0181e268aa7170 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Mon, 4 Dec 2023 13:59:29 -0800 Subject: [PATCH 3/5] Do not use register in spill heuristics if it is already assigned to consecutive register refposition --- src/coreclr/jit/lsra.cpp | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 23307435025ee5..8e955c77831177 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -11949,18 +11949,30 @@ void LinearScan::RegisterSelection::try_SPILL_COST() continue; } - if ((recentRefPosition != nullptr) && (recentRefPosition->RegOptional() && - !(assignedInterval->isLocalVar && recentRefPosition->IsActualRef()))) + RefPosition* reloadRefPosition = assignedInterval->getNextRefPosition(); + + if (reloadRefPosition != nullptr) { - // We do not "spillAfter" if previous (recent) refPosition was regOptional or if it - // is not an actual ref. In those cases, we will reload in future (next) refPosition. - // For such cases, consider the spill cost of next refposition. - // See notes in "spillInterval()". - RefPosition* reloadRefPosition = assignedInterval->getNextRefPosition(); - if (reloadRefPosition != nullptr) + if ((recentRefPosition != nullptr) && + (recentRefPosition->RegOptional() && + !(assignedInterval->isLocalVar && recentRefPosition->IsActualRef()))) { + // We do not "spillAfter" if previous (recent) refPosition was regOptional or if it + // is not an actual ref. In those cases, we will reload in future (next) refPosition. + // For such cases, consider the spill cost of next refposition. + // See notes in "spillInterval()". currentSpillWeight = linearScan->getWeight(reloadRefPosition); } +#ifdef TARGET_ARM64 + else if (assignedInterval->getNextRefPosition()->needsConsecutive) + { + // if next refposition is part of consecutive registers and there is already a register + // assigned to it then do not reassign for currentRefPosition, because with that, other + // registers for the next consecutive register assignment would have to be copied to + // different consecutive registers since this register is busy from this point onwards. + continue; + } +#endif } } #ifdef TARGET_ARM64 From b54c972a64bac6febe462f0dcb6d4ab1ec18d325 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Mon, 4 Dec 2023 17:49:59 -0800 Subject: [PATCH 4/5] Revert "Use multi-reg load/store for EncodeToUtf8" This reverts commit ec31dfe675122249070955ceb55b24c995a605f6. --- .../src/System/Buffers/Text/Base64Encoder.cs | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Encoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Encoder.cs index af77de7f3c8813..08ca62b533f510 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Encoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Encoder.cs @@ -85,15 +85,6 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan bytes, Span goto DoneExit; } - end = srcMax - 48; - if (AdvSimd.Arm64.IsSupported && (end >= src)) - { - NeonEncode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); - - if (src == srcEnd) - goto DoneExit; - } - end = srcMax - 16; if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src)) { @@ -489,60 +480,6 @@ private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, b destBytes = dest; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] - private static unsafe void NeonEncode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) - { - // C# implementatino of https://github.com/aklomp/base64/blob/e516d769a2a432c08404f1981e73b431566057be/lib/arch/neon64/enc_loop.c - // If we have Neon support, pick off 48 bytes at a time for as long as we can. - byte* src = srcBytes; - byte* dest = destBytes; - - Vector128 str1; - Vector128 str2; - Vector128 str3; - - Vector128 res1; - Vector128 res2; - Vector128 res3; - Vector128 res4; - - Vector128 tbl_enc = Vector128.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"u8).AsByte(); - do - { - // Load 48 bytes and deinterleave: - AssertRead>(src, srcStart, sourceLength); - (str1, str2, str3) = AdvSimd.Arm64.LoadVector128x3(src); - - // Divide bits of three input bytes over four output bytes: - res1 = AdvSimd.ShiftRightLogical(str1, 2); - res2 = AdvSimd.ShiftRightLogical(str2, 4) | AdvSimd.ShiftLeftLogical(str1, 4); - res3 = AdvSimd.ShiftRightLogical(str3, 6) | AdvSimd.ShiftLeftLogical(str2, 2); - res4 = str3; - - // Clear top two bits: - res1 &= AdvSimd.DuplicateToVector128(0x3F).AsByte(); - res2 &= AdvSimd.DuplicateToVector128(0x3F).AsByte(); - res3 &= AdvSimd.DuplicateToVector128(0x3F).AsByte(); - res4 &= AdvSimd.DuplicateToVector128(0x3F).AsByte(); - - // The bits have now been shifted to the right locations; - // translate their values 0..63 to the Base64 alphabet. - // Use a 64-byte table lookup: - res1 = AdvSimd.Arm64.VectorTableLookup(tbl_enc, res1); - res2 = AdvSimd.Arm64.VectorTableLookup(tbl_enc, res2); - res3 = AdvSimd.Arm64.VectorTableLookup(tbl_enc, res3); - res4 = AdvSimd.Arm64.VectorTableLookup(tbl_enc, res4); - - // Interleave and store result: - AssertWrite>(dest, destStart, destLength); - AdvSimd.Arm64.StoreVector128x4AndZip(dest, (res1, res2, res3, res4)); - - src += 48; - dest += 64; - } while (src <= srcEnd); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] From 024e18a01e075c0421ca83f1a573d77455ccf3ee Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 5 Dec 2023 07:35:08 -0800 Subject: [PATCH 5/5] Make spilling of consecutive register harder --- src/coreclr/jit/lsra.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 8e955c77831177..8b047bca41b65e 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -3241,13 +3241,15 @@ regNumber LinearScan::assignCopyReg(RefPosition* refPosition) assert(allocatedReg != REG_NA); + // restore the related interval + currentInterval->relatedInterval = savedRelatedInterval; + INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_COPY_REG, currentInterval, allocatedReg, nullptr, registerScore)); // Now restore the old info - currentInterval->relatedInterval = savedRelatedInterval; - currentInterval->physReg = oldPhysReg; - currentInterval->assignedReg = oldRegRecord; - currentInterval->isActive = true; + currentInterval->physReg = oldPhysReg; + currentInterval->assignedReg = oldRegRecord; + currentInterval->isActive = true; return allocatedReg; } @@ -11966,11 +11968,13 @@ void LinearScan::RegisterSelection::try_SPILL_COST() #ifdef TARGET_ARM64 else if (assignedInterval->getNextRefPosition()->needsConsecutive) { - // if next refposition is part of consecutive registers and there is already a register - // assigned to it then do not reassign for currentRefPosition, because with that, other - // registers for the next consecutive register assignment would have to be copied to - // different consecutive registers since this register is busy from this point onwards. - continue; + // If next refposition is part of consecutive registers and there is already a register + // assigned to it then try not to reassign for currentRefPosition, because with that, + // other registers for the next consecutive register assignment would have to be copied + // to different consecutive registers since this register is busy from this point onwards. + // + // Have it as a candidate, but make its spill cost higher than others. + currentSpillWeight = linearScan->getWeight(reloadRefPosition) * 10; } #endif }