From bcb8f75d7e8ff62b488c5a99caa62a0bf8356275 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sat, 21 Jan 2023 01:07:24 +0100 Subject: [PATCH 1/9] Vectorize ProbabilisticMap.IndexOfAny on AVX2 --- .../IndexOfAnyValues/ProbabilisticMap.cs | 73 ++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs index d0a0a7821913e1..1f6a32ff9589f5 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs @@ -1,9 +1,12 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; #pragma warning disable IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228 @@ -72,7 +75,34 @@ private static bool IsCharBitSet(ref uint charMap, byte value) => internal static bool Contains(ref uint charMap, ReadOnlySpan values, int ch) => IsCharBitSet(ref charMap, (byte)ch) && IsCharBitSet(ref charMap, (byte)(ch >> 8)) && - values.Contains((char)ch); + Contains(values, (char)ch); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool Contains(ReadOnlySpan values, char ch) => + SpanHelpers.NonPackedContainsValueType( + ref Unsafe.As(ref MemoryMarshal.GetReference(values)), + (short)ch, + values.Length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 ContainsMaskAvx2(Vector256 charMap, ref char searchSpace) + { + Debug.Assert(Avx2.IsSupported); + Vector256 source = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref Unsafe.As(ref 
searchSpace))).AsUInt32(); + Vector256 resultLow = IsCharBitNotSetAvx2(charMap, source & Vector256.Create(255u)); + Vector256 resultHigh = IsCharBitNotSetAvx2(charMap, Vector256.ShiftRightLogical(source, 8)); + return ~(resultLow | resultHigh); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 IsCharBitNotSetAvx2(Vector256 charMap, Vector256 values) + { + Debug.Assert(Avx2.IsSupported); + Vector256 index = values & Vector256.Create((uint)IndexMask); + Vector256 bitPositions = Avx2.ShiftLeftLogicalVariable(Vector256.One, Vector256.ShiftRightLogical(values, IndexShift)); + Vector256 bitMask = Avx2.PermuteVar8x32(charMap, index); + return Vector256.Equals(bitMask & bitPositions, Vector256.Zero); + } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool ShouldUseSimpleLoop(int searchSpaceLength, int valuesLength) @@ -201,6 +231,47 @@ internal static int IndexOfAny(ref uint charMap, ref char searchSpace, ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); ref char cur = ref searchSpace; + if (Avx2.IsSupported && typeof(TNegator) == typeof(IndexOfAnyAsciiSearcher.DontNegate) && searchSpaceLength >= Vector128.Count) + { + Vector256 charMap256 = Vector256.LoadUnsafe(ref charMap); + + ref char oneVectorAwayFromEnd = ref Unsafe.Subtract(ref searchSpaceEnd, Vector128.Count); + + do + { + Vector256 result = ContainsMaskAvx2(charMap256, ref cur); + + if (result != Vector256.Zero) + { + uint mask = result.ExtractMostSignificantBits(); + do + { + int charPos = BitOperations.TrailingZeroCount(mask); + + ref char candidatePos = ref Unsafe.Add(ref cur, charPos); + + if (Contains(values, candidatePos)) + { + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char)); + } + + if (Bmi1.IsSupported) + { + mask = Bmi1.ResetLowestSetBit(mask); + } + else + { + mask &= ~(uint)(1 << charPos); + } + } + while (mask != 0); + } + + cur = ref Unsafe.Add(ref cur, Vector128.Count); + } + 
while (!Unsafe.IsAddressGreaterThan(ref cur, ref oneVectorAwayFromEnd)); + } + while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) { int ch = cur; From 60819ffc67e2d405e9c78035a3886c1b1dd980e4 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sat, 21 Jan 2023 05:32:35 +0100 Subject: [PATCH 2/9] Use ResetLowestSetBit from BitOperations --- .../src/System/IndexOfAnyValues/ProbabilisticMap.cs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs index 1f6a32ff9589f5..ea389ce33122b3 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs @@ -246,23 +246,14 @@ internal static int IndexOfAny(ref uint charMap, ref char searchSpace, uint mask = result.ExtractMostSignificantBits(); do { - int charPos = BitOperations.TrailingZeroCount(mask); - - ref char candidatePos = ref Unsafe.Add(ref cur, charPos); + ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask)); if (Contains(values, candidatePos)) { return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char)); } - if (Bmi1.IsSupported) - { - mask = Bmi1.ResetLowestSetBit(mask); - } - else - { - mask &= ~(uint)(1 << charPos); - } + mask = BitOperations.ResetLowestSetBit(mask); } while (mask != 0); } From 9bd28f4bd954e21543bd4d23412adb4bc9650ea1 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sat, 21 Jan 2023 10:34:10 +0100 Subject: [PATCH 3/9] Speed up Avx2 and add Vector128 support --- .../IndexOfAnyValues/ProbabilisticMap.cs | 222 +++++++++++++++--- 1 file changed, 187 insertions(+), 35 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs 
b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs index ea389ce33122b3..934ce549401b35 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs @@ -1,11 +1,11 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; #pragma warning disable IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228 @@ -26,8 +26,19 @@ namespace System.Buffers [StructLayout(LayoutKind.Sequential)] internal readonly struct ProbabilisticMap { - private const int IndexMask = 0x7; - private const int IndexShift = 0x3; + // We need at least Sse41 for the hardware accelerated ConditionalSelect support. + private static bool IsVectorizationSupported => Sse41.IsSupported || AdvSimd.Arm64.IsSupported; + + // The vectorized algorithm operates on bytes instead of uint32s. + // The index and shift are adjusted so that we represent the structure + // as "32 x uint8" instead of "8 x uint32". + private const uint VectorizedIndexMask = 31u; + private const int VectorizedIndexShift = 5; + + // If we don't support vectorization, use uint32 to speed up + // "IsCharBitSet" checks in scalar loops. + private const uint PortableIndexMask = 7u; + private const int PortableIndexShift = 3; private readonly uint _e0, _e1, _e2, _e3, _e4, _e5, _e6, _e7; @@ -59,17 +70,27 @@ public ProbabilisticMap(ReadOnlySpan values) if (hasAscii) { // Common to search for ASCII symbols. Just set the high value once. 
- charMap |= 1u; + SetCharBit(ref charMap, 0); } } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void SetCharBit(ref uint charMap, byte value) => - Unsafe.Add(ref charMap, (uint)value & IndexMask) |= 1u << (value >> IndexShift); + private static void SetCharBit(ref uint charMap, byte value) + { + if (IsVectorizationSupported) + { + Unsafe.Add(ref Unsafe.As(ref charMap), value & VectorizedIndexMask) |= (byte)(1u << (value >> VectorizedIndexShift)); + } + else + { + Unsafe.Add(ref charMap, value & PortableIndexMask) |= 1u << (value >> PortableIndexShift); + } + } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsCharBitSet(ref uint charMap, byte value) => - (Unsafe.Add(ref charMap, (uint)value & IndexMask) & (1u << (value >> IndexShift))) != 0; + private static bool IsCharBitSet(ref uint charMap, byte value) => IsVectorizationSupported + ? (Unsafe.Add(ref Unsafe.As(ref charMap), value & VectorizedIndexMask) & (1u << (value >> VectorizedIndexShift))) != 0 + : (Unsafe.Add(ref charMap, value & PortableIndexMask) & (1u << (value >> PortableIndexShift))) != 0; [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool Contains(ref uint charMap, ReadOnlySpan values, int ch) => @@ -85,23 +106,86 @@ ref Unsafe.As(ref MemoryMarshal.GetReference(values)), values.Length); [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 ContainsMaskAvx2(Vector256 charMap, ref char searchSpace) + private static Vector256 ContainsMask32CharsAvx2(Vector256 charMapLower, Vector256 charMapUpper, ref char searchSpace) { - Debug.Assert(Avx2.IsSupported); - Vector256 source = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref Unsafe.As(ref searchSpace))).AsUInt32(); - Vector256 resultLow = IsCharBitNotSetAvx2(charMap, source & Vector256.Create(255u)); - Vector256 resultHigh = IsCharBitNotSetAvx2(charMap, Vector256.ShiftRightLogical(source, 8)); - return ~(resultLow | resultHigh); + Vector256 source0 = 
Vector256.LoadUnsafe(ref Unsafe.As(ref searchSpace)); + Vector256 source1 = Vector256.LoadUnsafe(ref Unsafe.As(ref searchSpace), (nuint)Vector256.Count); + + Vector256 sourceLower = Avx2.PackUnsignedSaturate( + (source0 & Vector256.Create((ushort)255)).AsInt16(), + (source1 & Vector256.Create((ushort)255)).AsInt16()); + + Vector256 sourceUpper = Avx2.PackUnsignedSaturate( + Vector256.ShiftRightLogical(source0, 8).AsInt16(), + Vector256.ShiftRightLogical(source1, 8).AsInt16()); + + Vector256 resultLower = IsCharBitNotSetAvx2(charMapLower, charMapUpper, sourceLower); + Vector256 resultUpper = IsCharBitNotSetAvx2(charMapLower, charMapUpper, sourceUpper); + + return ~(resultLower | resultUpper); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 IsCharBitNotSetAvx2(Vector256 charMap, Vector256 values) + private static Vector256 IsCharBitNotSetAvx2(Vector256 charMapLower, Vector256 charMapUpper, Vector256 values) { - Debug.Assert(Avx2.IsSupported); - Vector256 index = values & Vector256.Create((uint)IndexMask); - Vector256 bitPositions = Avx2.ShiftLeftLogicalVariable(Vector256.One, Vector256.ShiftRightLogical(values, IndexShift)); - Vector256 bitMask = Avx2.PermuteVar8x32(charMap, index); - return Vector256.Equals(bitMask & bitPositions, Vector256.Zero); + Vector256 highNibble = Avx2.ShiftRightLogical(values.AsInt32(), VectorizedIndexShift).AsByte() & Vector256.Create((byte)15); + + Vector256 bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble); + + Vector256 index = values & Vector256.Create((byte)VectorizedIndexMask); + Vector256 bitMaskLower = Avx2.Shuffle(charMapLower, index); + Vector256 bitMaskUpper = Avx2.Shuffle(charMapUpper, index - Vector256.Create((byte)16)); + Vector256 mask = Vector256.GreaterThan(index, Vector256.Create((byte)15)); + Vector256 bitMask = Vector256.ConditionalSelect(mask, bitMaskUpper, bitMaskLower); + + return Vector256.Equals(bitMask & bitPositions, Vector256.Zero); + } + + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 ContainsMask16Chars(Vector128 charMapLower, Vector128 charMapUpper, ref char searchSpace) + { + Vector128 source0 = Vector128.LoadUnsafe(ref Unsafe.As(ref searchSpace)); + Vector128 source1 = Vector128.LoadUnsafe(ref Unsafe.As(ref searchSpace), (nuint)Vector128.Count); + + Vector128 sourceLower = Sse2.IsSupported + ? Sse2.PackUnsignedSaturate((source0 & Vector128.Create((ushort)255)).AsInt16(), (source1 & Vector128.Create((ushort)255)).AsInt16()) + : AdvSimd.Arm64.UnzipEven(source0.AsByte(), source1.AsByte()); + + Vector128 sourceUpper = Sse2.IsSupported + ? Sse2.PackUnsignedSaturate(Vector128.ShiftRightLogical(source0, 8).AsInt16(), Vector128.ShiftRightLogical(source1, 8).AsInt16()) + : AdvSimd.Arm64.UnzipOdd(source0.AsByte(), source1.AsByte()); + + Vector128 resultLower = IsCharBitNotSet(charMapLower, charMapUpper, sourceLower); + Vector128 resultUpper = IsCharBitNotSet(charMapLower, charMapUpper, sourceUpper); + + return ~(resultLower | resultUpper); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 IsCharBitNotSet(Vector128 charMapLower, Vector128 charMapUpper, Vector128 values) + { + Vector128 highNibble = Sse2.IsSupported + ? 
Sse2.ShiftRightLogical(values.AsInt32(), VectorizedIndexShift).AsByte() & Vector128.Create((byte)15) + : AdvSimd.ShiftRightLogical(values, VectorizedIndexShift); + + Vector128 bitPositions = Shuffle(Vector128.Create(0x8040201008040201).AsByte(), highNibble); + + Vector128 index = values & Vector128.Create((byte)VectorizedIndexMask); + Vector128 bitMaskLower = Shuffle(charMapLower, index); + Vector128 bitMaskUpper = Shuffle(charMapUpper, index - Vector128.Create((byte)16)); + Vector128 mask = Vector128.GreaterThan(index, Vector128.Create((byte)15)); + Vector128 bitMask = Vector128.ConditionalSelect(mask, bitMaskUpper, bitMaskLower); + + return Vector128.Equals(bitMask & bitPositions, Vector128.Zero); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 Shuffle(Vector128 vector, Vector128 indices) + { + // We're not using Vector128.Shuffle as the caller already accounts for differences in behavior between platforms. + return Ssse3.IsSupported + ? Ssse3.Shuffle(vector, indices) + : AdvSimd.Arm64.VectorTableLookup(vector, indices); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -231,17 +315,73 @@ internal static int IndexOfAny(ref uint charMap, ref char searchSpace, ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); ref char cur = ref searchSpace; - if (Avx2.IsSupported && typeof(TNegator) == typeof(IndexOfAnyAsciiSearcher.DontNegate) && searchSpaceLength >= Vector128.Count) + if (IsVectorizationSupported && typeof(TNegator) == typeof(IndexOfAnyAsciiSearcher.DontNegate) && searchSpaceLength >= 16) { - Vector256 charMap256 = Vector256.LoadUnsafe(ref charMap); + Vector128 charMapLower = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap)); + Vector128 charMapUpper = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap), (nuint)Vector128.Count); + + if (Avx2.IsSupported && searchSpaceLength >= 32) + { + Vector256 charMapLower256 = Vector256.Create(charMapLower, charMapLower); + Vector256 charMapUpper256 = 
Vector256.Create(charMapUpper, charMapUpper); - ref char oneVectorAwayFromEnd = ref Unsafe.Subtract(ref searchSpaceEnd, Vector128.Count); + ref char lastStartVectorAvx2 = ref Unsafe.Subtract(ref searchSpaceEnd, 32); - do + while (true) + { + Vector256 result = ContainsMask32CharsAvx2(charMapLower256, charMapUpper256, ref cur); + + if (result != Vector256.Zero) + { + result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); + uint mask = result.ExtractMostSignificantBits(); + do + { + ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask)); + + if (Contains(values, candidatePos)) + { + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char)); + } + + mask = BitOperations.ResetLowestSetBit(mask); + } + while (mask != 0); + } + + cur = ref Unsafe.Add(ref cur, 32); + + if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVectorAvx2)) + { + if (Unsafe.AreSame(ref cur, ref searchSpaceEnd)) + { + return -1; + } + + if (Unsafe.ByteOffset(ref cur, ref searchSpaceEnd) > 16 * sizeof(char)) + { + // If we have more than 16 characters left to process, we can + // adjust the current vector and do one last iteration of Avx2. + cur = ref lastStartVectorAvx2; + } + else + { + // Otherwise adjust the vector such that we'll only need to do a single + // iteration of ContainsMask16Chars below. 
+ cur = ref Unsafe.Subtract(ref searchSpaceEnd, 16); + break; + } + } + } + } + + ref char lastStartVector = ref Unsafe.Subtract(ref searchSpaceEnd, 16); + + while (true) { - Vector256 result = ContainsMaskAvx2(charMap256, ref cur); + Vector128 result = ContainsMask16Chars(charMapLower, charMapUpper, ref cur); - if (result != Vector256.Zero) + if (result != Vector128.Zero) { uint mask = result.ExtractMostSignificantBits(); do @@ -258,20 +398,32 @@ internal static int IndexOfAny(ref uint charMap, ref char searchSpace, while (mask != 0); } - cur = ref Unsafe.Add(ref cur, Vector128.Count); + cur = ref Unsafe.Add(ref cur, 16); + + if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVector)) + { + if (Unsafe.AreSame(ref cur, ref searchSpaceEnd)) + { + break; + } + + // Adjust the current vector and do one last iteration. + cur = ref lastStartVector; + } } - while (!Unsafe.IsAddressGreaterThan(ref cur, ref oneVectorAwayFromEnd)); } - - while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) + else { - int ch = cur; - if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch))) + while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) { - return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); - } + int ch = cur; + if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch))) + { + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); + } - cur = ref Unsafe.Add(ref cur, 1); + cur = ref Unsafe.Add(ref cur, 1); + } } return -1; From 71e7406f4a73267192d39c7d89a6166c6ea13702 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Fri, 27 Jan 2023 00:37:32 +0100 Subject: [PATCH 4/9] Add Vector{128/256}.LoadUnsafe(ref char) and Vector128.ShuffleUnsafe --- .../Common/src/System/HexConverter.cs | 9 ++--- .../src/System/Globalization/Ordinal.cs | 4 +-- .../IndexOfAnyAsciiSearcher.cs | 20 +++-------- .../IndexOfAnyValues/ProbabilisticMap.cs | 23 ++++-------- .../System/Runtime/Intrinsics/Vector128.cs | 36 +++++++++++++++++++ 
.../System/Runtime/Intrinsics/Vector256.cs | 15 ++++++++ .../src/System/SpanHelpers.Char.cs | 18 +++++----- .../src/System/String.Manipulation.cs | 4 +-- 8 files changed, 77 insertions(+), 52 deletions(-) diff --git a/src/libraries/Common/src/System/HexConverter.cs b/src/libraries/Common/src/System/HexConverter.cs index 7986ccb43a9b1f..81b56970be05a1 100644 --- a/src/libraries/Common/src/System/HexConverter.cs +++ b/src/libraries/Common/src/System/HexConverter.cs @@ -99,13 +99,8 @@ internal static (Vector128, Vector128) AsciiToHexVector128(Vector128 Vector128 lowNibbles = Vector128.UnpackLow(shiftedSrc, src); Vector128 highNibbles = Vector128.UnpackHigh(shiftedSrc, src); - return (ShuffleUnsafe(hexMap, lowNibbles & Vector128.Create((byte)0xF)), - ShuffleUnsafe(hexMap, highNibbles & Vector128.Create((byte)0xF))); - - // TODO: remove once https://github.com/dotnet/runtime/pull/80963 is merged - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 ShuffleUnsafe(Vector128 value, Vector128 mask) - => Ssse3.IsSupported ? 
Ssse3.Shuffle(value, mask) : AdvSimd.Arm64.VectorTableLookup(value, mask); + return (Vector128.ShuffleUnsafe(hexMap, lowNibbles & Vector128.Create((byte)0xF)), + Vector128.ShuffleUnsafe(hexMap, highNibbles & Vector128.Create((byte)0xF))); } private static void EncodeToUtf16_Vector128(ReadOnlySpan bytes, Span chars, Casing casing) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Ordinal.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Ordinal.cs index 1dd36f67306615..45007ac7d22b52 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Ordinal.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Ordinal.cs @@ -88,8 +88,8 @@ private static bool EqualsIgnoreCase_Vector128(ref char charA, ref char charB, i Vector128 vec2; do { - vec1 = Vector128.LoadUnsafe(ref Unsafe.As(ref charA), i); - vec2 = Vector128.LoadUnsafe(ref Unsafe.As(ref charB), i); + vec1 = Vector128.LoadUnsafe(ref charA, i); + vec2 = Vector128.LoadUnsafe(ref charB, i); if (!Utf16Utility.AllCharsInVector128AreAscii(vec1 | vec2)) { diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs index 87c258cee79832..207024d99962fb 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs @@ -860,10 +860,10 @@ private static Vector128 IndexOfAnyLookupCore(Vector128 source, Vect // The bitmapLookup represents a 8x16 table of bits, indicating whether a character is present in the needle. // Lookup the rows via the lower nibble and the column via the higher nibble. - Vector128 bitMask = Shuffle(bitmapLookup, lowNibbles); + Vector128 bitMask = Vector128.ShuffleUnsafe(bitmapLookup, lowNibbles); // For values above 127, the high nibble will be above 7. 
We construct the positions vector for the shuffle such that those values map to 0. - Vector128 bitPositions = Shuffle(Vector128.Create(0x8040201008040201, 0).AsByte(), highNibbles); + Vector128 bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201, 0).AsByte(), highNibbles); Vector128 result = bitMask & bitPositions; return result; @@ -909,10 +909,10 @@ private static Vector128 IndexOfAnyLookup(Vector128 source Vector128 lowNibbles = source & Vector128.Create((byte)0xF); Vector128 highNibbles = Vector128.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector128.Create((byte)0xF); - Vector128 row0 = Shuffle(bitmapLookup0, lowNibbles); - Vector128 row1 = Shuffle(bitmapLookup1, lowNibbles); + Vector128 row0 = Vector128.ShuffleUnsafe(bitmapLookup0, lowNibbles); + Vector128 row1 = Vector128.ShuffleUnsafe(bitmapLookup1, lowNibbles); - Vector128 bitmask = Shuffle(Vector128.Create(0x8040201008040201).AsByte(), highNibbles); + Vector128 bitmask = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibbles); Vector128 mask = Vector128.GreaterThan(highNibbles.AsSByte(), Vector128.Create((sbyte)0x7)).AsByte(); Vector128 bitsets = Vector128.ConditionalSelect(mask, row1, row0); @@ -944,16 +944,6 @@ private static Vector256 IndexOfAnyLookup(Vector256 source return TNegator.NegateIfNeeded(result); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector128 Shuffle(Vector128 vector, Vector128 indices) - { - // We're not using Vector128.Shuffle as the caller already accounts for and relies on differences in behavior between platforms. - return - Ssse3.IsSupported ? Ssse3.Shuffle(vector, indices) : - AdvSimd.Arm64.IsSupported ? 
AdvSimd.Arm64.VectorTableLookup(vector, indices) : - PackedSimd.Swizzle(vector, indices); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe int ComputeFirstIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs index 934ce549401b35..b7e24b633bd2d1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs @@ -108,8 +108,8 @@ ref Unsafe.As(ref MemoryMarshal.GetReference(values)), [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector256 ContainsMask32CharsAvx2(Vector256 charMapLower, Vector256 charMapUpper, ref char searchSpace) { - Vector256 source0 = Vector256.LoadUnsafe(ref Unsafe.As(ref searchSpace)); - Vector256 source1 = Vector256.LoadUnsafe(ref Unsafe.As(ref searchSpace), (nuint)Vector256.Count); + Vector256 source0 = Vector256.LoadUnsafe(ref searchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref searchSpace, (nuint)Vector256.Count); Vector256 sourceLower = Avx2.PackUnsignedSaturate( (source0 & Vector256.Create((ushort)255)).AsInt16(), @@ -144,8 +144,8 @@ private static Vector256 IsCharBitNotSetAvx2(Vector256 charMapLower, [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 ContainsMask16Chars(Vector128 charMapLower, Vector128 charMapUpper, ref char searchSpace) { - Vector128 source0 = Vector128.LoadUnsafe(ref Unsafe.As(ref searchSpace)); - Vector128 source1 = Vector128.LoadUnsafe(ref Unsafe.As(ref searchSpace), (nuint)Vector128.Count); + Vector128 source0 = Vector128.LoadUnsafe(ref searchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref searchSpace, (nuint)Vector128.Count); Vector128 sourceLower = Sse2.IsSupported ? 
Sse2.PackUnsignedSaturate((source0 & Vector128.Create((ushort)255)).AsInt16(), (source1 & Vector128.Create((ushort)255)).AsInt16()) @@ -168,26 +168,17 @@ private static Vector128 IsCharBitNotSet(Vector128 charMapLower, Vec ? Sse2.ShiftRightLogical(values.AsInt32(), VectorizedIndexShift).AsByte() & Vector128.Create((byte)15) : AdvSimd.ShiftRightLogical(values, VectorizedIndexShift); - Vector128 bitPositions = Shuffle(Vector128.Create(0x8040201008040201).AsByte(), highNibble); + Vector128 bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble); Vector128 index = values & Vector128.Create((byte)VectorizedIndexMask); - Vector128 bitMaskLower = Shuffle(charMapLower, index); - Vector128 bitMaskUpper = Shuffle(charMapUpper, index - Vector128.Create((byte)16)); + Vector128 bitMaskLower = Vector128.ShuffleUnsafe(charMapLower, index); + Vector128 bitMaskUpper = Vector128.ShuffleUnsafe(charMapUpper, index - Vector128.Create((byte)16)); Vector128 mask = Vector128.GreaterThan(index, Vector128.Create((byte)15)); Vector128 bitMask = Vector128.ConditionalSelect(mask, bitMaskUpper, bitMaskLower); return Vector128.Equals(bitMask & bitPositions, Vector128.Zero); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector128 Shuffle(Vector128 vector, Vector128 indices) - { - // We're not using Vector128.Shuffle as the caller already accounts for differences in behavior between platforms. - return Ssse3.IsSupported - ? 
Ssse3.Shuffle(vector, indices) - : AdvSimd.Arm64.VectorTableLookup(vector, indices); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool ShouldUseSimpleLoop(int searchSpaceLength, int valuesLength) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 8eadbbb37980c4..a27900a1480b43 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -1820,6 +1820,21 @@ public static Vector128 LoadUnsafe(ref T source, nuint elementOffset) return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); } + /// Loads a vector from the given source and reinterprets it as . + /// The source from which the vector will be loaded. + /// The vector loaded from . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128 LoadUnsafe(ref char source) => + LoadUnsafe(ref Unsafe.As(ref source)); + + /// Loads a vector from the given source and element offset and reinterprets it as . + /// The source to which will be added before loading the vector. + /// The element offset from from which the vector will be loaded. + /// The vector loaded from plus . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128 LoadUnsafe(ref char source, nuint elementOffset) => + LoadUnsafe(ref Unsafe.As(ref source), elementOffset); + /// Computes the maximum of two vectors on a per-element basis. /// The type of the elements in the vector. /// The vector to compare with . @@ -2419,6 +2434,27 @@ public static Vector128 Shuffle(Vector128 vector, Vector128 return result; } + /// Creates a new vector by selecting values from an input vector using a set of indices. + /// Behavior is platform-dependent for out-of-range indices. + /// The input vector from which values are selected. 
+ /// The per-element indices used to select a value from . + /// A new vector containing the values from selected by the given . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128 ShuffleUnsafe(Vector128 vector, Vector128 indices) + { + if (Ssse3.IsSupported) + { + return Ssse3.Shuffle(vector, indices); + } + + if (AdvSimd.Arm64.IsSupported) + { + return AdvSimd.Arm64.VectorTableLookup(vector, indices); + } + + throw new PlatformNotSupportedException(); + } + /// Creates a new vector by selecting values from an input vector using a set of indices. /// The input vector from which values are selected. /// The per-element indices used to select a value from . diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 212133baf28b33..340d20b0812327 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -1809,6 +1809,21 @@ public static Vector256 LoadUnsafe(ref T source, nuint elementOffset) return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); } + /// Loads a vector from the given source and reinterprets it as . + /// The source from which the vector will be loaded. + /// The vector loaded from . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 LoadUnsafe(ref char source) => + LoadUnsafe(ref Unsafe.As(ref source)); + + /// Loads a vector from the given source and element offset and reinterprets it as . + /// The source to which will be added before loading the vector. + /// The element offset from from which the vector will be loaded. + /// The vector loaded from plus . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 LoadUnsafe(ref char source, nuint elementOffset) => + LoadUnsafe(ref Unsafe.As(ref source), elementOffset); + /// Computes the maximum of two vectors on a per-element basis. /// The type of the elements in the vector. /// The vector to compare with . diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index e51370c7805383..f48659545b9d8a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -68,7 +68,6 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, offset + 1)), // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285 SEARCH_TWO_CHARS: - ref ushort ushortSearchSpace = ref Unsafe.As(ref searchSpace); if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256.Count >= 0) { // Find the last unique (which is not equal to ch1) character @@ -89,8 +88,8 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, offset + 1)), // Make sure we don't go out of bounds Debug.Assert(offset + ch1ch2Distance + Vector256.Count <= searchSpaceLength); - Vector256 cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance))); - Vector256 cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset)); + Vector256 cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); + Vector256 cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset)); Vector256 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); // Early out: cmpAnd is all zeros @@ -156,8 +155,8 @@ ref Unsafe.As(ref value), 
(nuint)(uint)valueLength * 2)) // Make sure we don't go out of bounds Debug.Assert(offset + ch1ch2Distance + Vector128.Count <= searchSpaceLength); - Vector128 cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance))); - Vector128 cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset)); + Vector128 cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); + Vector128 cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset)); Vector128 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); // Early out: cmpAnd is all zeros @@ -254,7 +253,6 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, relativeIndex + 1)), // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285 SEARCH_TWO_CHARS: - ref ushort ushortSearchSpace = ref Unsafe.As(ref searchSpace); if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256.Count) { offset = searchSpaceMinusValueTailLength - Vector256.Count; @@ -272,8 +270,8 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, relativeIndex + 1)), do { - Vector256 cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset)); - Vector256 cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance))); + Vector256 cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset)); + Vector256 cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); Vector256 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); // Early out: cmpAnd is all zeros @@ -321,8 +319,8 @@ ref Unsafe.As(ref value), (nuint)(uint)valueLength * 2)) do { - Vector128 cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref 
ushortSearchSpace, (nuint)offset)); - Vector128 cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance))); + Vector128 cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset)); + Vector128 cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); Vector128 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); // Early out: cmpAnd is all zeros diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs index c5433157967bc6..589c7d021e3f5d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs @@ -1916,7 +1916,7 @@ private static void MakeSeparatorListVectorized(ReadOnlySpan sourceSpan, r nuint offset = 0; nuint lengthToExamine = (uint)sourceSpan.Length; - ref ushort source = ref Unsafe.As(ref MemoryMarshal.GetReference(sourceSpan)); + ref char source = ref MemoryMarshal.GetReference(sourceSpan); Vector128 v1 = Vector128.Create((ushort)c); Vector128 v2 = Vector128.Create((ushort)c2); @@ -1947,7 +1947,7 @@ private static void MakeSeparatorListVectorized(ReadOnlySpan sourceSpan, r while (offset < lengthToExamine) { - char curr = (char)Unsafe.Add(ref source, offset); + char curr = Unsafe.Add(ref source, offset); if (curr == c || curr == c2 || curr == c3) { sepListBuilder.Append((int)offset); From 44d8a93b1f510034c075a440da289ac1a3bf55ef Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Mon, 6 Feb 2023 14:15:33 +0100 Subject: [PATCH 5/9] Use Vector128.ShuffleUnsafe in more places --- .../src/System/Buffers/Text/Base64Decoder.cs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs index 
f4ebc942ac331a..cc239d1a5e981a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs @@ -477,20 +477,17 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b destBytes = dest; } - // This can be replaced once https://github.com/dotnet/runtime/issues/63331 is implemented. [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 SimdShuffle(Vector128 left, Vector128 right, Vector128 mask8F) { Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian); - if (Ssse3.IsSupported) + if (AdvSimd.Arm64.IsSupported) { - return Ssse3.Shuffle(left, right); - } - else - { - return AdvSimd.Arm64.VectorTableLookup(left, Vector128.BitwiseAnd(right, mask8F)); + right &= mask8F; } + + return Vector128.ShuffleUnsafe(left, right); } [MethodImpl(MethodImplOptions.AggressiveInlining)] From 21c5d9575d44f9c8d982b6d59770247d90fc288f Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Thu, 23 Feb 2023 23:29:15 +0100 Subject: [PATCH 6/9] PR feedback --- .../IndexOfAnyValues/ProbabilisticMap.cs | 188 ++++++++++-------- 1 file changed, 100 insertions(+), 88 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs index b7e24b633bd2d1..4ec500680041de 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -26,12 +27,10 @@ namespace System.Buffers [StructLayout(LayoutKind.Sequential)] internal readonly struct ProbabilisticMap { - // We need at least Sse41 for the hardware accelerated ConditionalSelect support. - private static bool IsVectorizationSupported => Sse41.IsSupported || AdvSimd.Arm64.IsSupported; - // The vectorized algorithm operates on bytes instead of uint32s. // The index and shift are adjusted so that we represent the structure // as "32 x uint8" instead of "8 x uint32". + // We use the vectorized implementation when we have access to Sse41 or Arm64 intrinsics. private const uint VectorizedIndexMask = 31u; private const int VectorizedIndexShift = 5; @@ -77,7 +76,7 @@ public ProbabilisticMap(ReadOnlySpan values) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void SetCharBit(ref uint charMap, byte value) { - if (IsVectorizationSupported) + if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported) { Unsafe.Add(ref Unsafe.As(ref charMap), value & VectorizedIndexMask) |= (byte)(1u << (value >> VectorizedIndexShift)); } @@ -88,7 +87,7 @@ private static void SetCharBit(ref uint charMap, byte value) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsCharBitSet(ref uint charMap, byte value) => IsVectorizationSupported + private static bool IsCharBitSet(ref uint charMap, byte value) => Sse41.IsSupported || AdvSimd.Arm64.IsSupported ? 
(Unsafe.Add(ref Unsafe.As(ref charMap), value & VectorizedIndexMask) & (1u << (value >> VectorizedIndexShift))) != 0 : (Unsafe.Add(ref charMap, value & PortableIndexMask) & (1u << (value >> PortableIndexShift))) != 0; @@ -116,8 +115,8 @@ private static Vector256 ContainsMask32CharsAvx2(Vector256 charMapLo (source1 & Vector256.Create((ushort)255)).AsInt16()); Vector256 sourceUpper = Avx2.PackUnsignedSaturate( - Vector256.ShiftRightLogical(source0, 8).AsInt16(), - Vector256.ShiftRightLogical(source1, 8).AsInt16()); + (source0 >>> 8).AsInt16(), + (source1 >>> 8).AsInt16()); Vector256 resultLower = IsCharBitNotSetAvx2(charMapLower, charMapUpper, sourceLower); Vector256 resultUpper = IsCharBitNotSetAvx2(charMapLower, charMapUpper, sourceUpper); @@ -128,7 +127,7 @@ private static Vector256 ContainsMask32CharsAvx2(Vector256 charMapLo [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector256 IsCharBitNotSetAvx2(Vector256 charMapLower, Vector256 charMapUpper, Vector256 values) { - Vector256 highNibble = Avx2.ShiftRightLogical(values.AsInt32(), VectorizedIndexShift).AsByte() & Vector256.Create((byte)15); + Vector256 highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15); Vector256 bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble); @@ -152,7 +151,7 @@ private static Vector128 ContainsMask16Chars(Vector128 charMapLower, : AdvSimd.Arm64.UnzipEven(source0.AsByte(), source1.AsByte()); Vector128 sourceUpper = Sse2.IsSupported - ? Sse2.PackUnsignedSaturate(Vector128.ShiftRightLogical(source0, 8).AsInt16(), Vector128.ShiftRightLogical(source1, 8).AsInt16()) + ? 
Sse2.PackUnsignedSaturate((source0 >>> 8).AsInt16(), (source1 >>> 8).AsInt16()) : AdvSimd.Arm64.UnzipOdd(source0.AsByte(), source1.AsByte()); Vector128 resultLower = IsCharBitNotSet(charMapLower, charMapUpper, sourceLower); @@ -165,7 +164,7 @@ private static Vector128 ContainsMask16Chars(Vector128 charMapLower, private static Vector128 IsCharBitNotSet(Vector128 charMapLower, Vector128 charMapUpper, Vector128 values) { Vector128 highNibble = Sse2.IsSupported - ? Sse2.ShiftRightLogical(values.AsInt32(), VectorizedIndexShift).AsByte() & Vector128.Create((byte)15) + ? (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15) : AdvSimd.ShiftRightLogical(values, VectorizedIndexShift); Vector128 bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble); @@ -220,7 +219,7 @@ private static int IndexOfAny(ref char searchSpace, int searchSpaceLen while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) { char c = cur; - if (TNegator.NegateIfNeeded(valuesSpan.Contains(c))) + if (TNegator.NegateIfNeeded(Contains(valuesSpan, c))) { return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); } @@ -252,7 +251,7 @@ private static int LastIndexOfAny(ref char searchSpace, int searchSpac for (int i = searchSpaceLength - 1; i >= 0; i--) { char c = Unsafe.Add(ref searchSpace, i); - if (TNegator.NegateIfNeeded(valuesSpan.Contains(c))) + if (TNegator.NegateIfNeeded(Contains(valuesSpan, c))) { return i; } @@ -303,77 +302,71 @@ private static int ProbabilisticLastIndexOfAny(ref char searchSpace, i internal static int IndexOfAny(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) where TNegator : struct, IndexOfAnyAsciiSearcher.INegator { + if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && typeof(TNegator) == typeof(IndexOfAnyAsciiSearcher.DontNegate) && searchSpaceLength >= 16) + { + return IndexOfAnyVectorized(ref charMap, ref searchSpace, searchSpaceLength, values); + } + ref 
char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); ref char cur = ref searchSpace; - if (IsVectorizationSupported && typeof(TNegator) == typeof(IndexOfAnyAsciiSearcher.DontNegate) && searchSpaceLength >= 16) + while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) { - Vector128 charMapLower = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap)); - Vector128 charMapUpper = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap), (nuint)Vector128.Count); - - if (Avx2.IsSupported && searchSpaceLength >= 32) + int ch = cur; + if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch))) { - Vector256 charMapLower256 = Vector256.Create(charMapLower, charMapLower); - Vector256 charMapUpper256 = Vector256.Create(charMapUpper, charMapUpper); + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); + } - ref char lastStartVectorAvx2 = ref Unsafe.Subtract(ref searchSpaceEnd, 32); + cur = ref Unsafe.Add(ref cur, 1); + } - while (true) - { - Vector256 result = ContainsMask32CharsAvx2(charMapLower256, charMapUpper256, ref cur); + return -1; + } - if (result != Vector256.Zero) - { - result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); - uint mask = result.ExtractMostSignificantBits(); - do - { - ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask)); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int LastIndexOfAny(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) + where TNegator : struct, IndexOfAnyAsciiSearcher.INegator + { + for (int i = searchSpaceLength - 1; i >= 0; i--) + { + int ch = Unsafe.Add(ref searchSpace, i); + if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch))) + { + return i; + } + } - if (Contains(values, candidatePos)) - { - return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char)); - } + return -1; + } - mask = BitOperations.ResetLowestSetBit(mask); - } - while (mask != 0); - 
} + private static int IndexOfAnyVectorized(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) + { + Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported); + Debug.Assert(searchSpaceLength >= 16); - cur = ref Unsafe.Add(ref cur, 32); + ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); + ref char cur = ref searchSpace; - if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVectorAvx2)) - { - if (Unsafe.AreSame(ref cur, ref searchSpaceEnd)) - { - return -1; - } + Vector128 charMapLower = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap)); + Vector128 charMapUpper = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap), (nuint)Vector128.Count); - if (Unsafe.ByteOffset(ref cur, ref searchSpaceEnd) > 16 * sizeof(char)) - { - // If we have more than 16 characters left to process, we can - // adjust the current vector and do one last iteration of Avx2. - cur = ref lastStartVectorAvx2; - } - else - { - // Otherwise adjust the vector such that we'll only need to do a single - // iteration of ContainsMask16Chars below. - cur = ref Unsafe.Subtract(ref searchSpaceEnd, 16); - break; - } - } - } - } + if (Avx2.IsSupported && searchSpaceLength >= 32) + { + Vector256 charMapLower256 = Vector256.Create(charMapLower, charMapLower); + Vector256 charMapUpper256 = Vector256.Create(charMapUpper, charMapUpper); - ref char lastStartVector = ref Unsafe.Subtract(ref searchSpaceEnd, 16); + ref char lastStartVectorAvx2 = ref Unsafe.Subtract(ref searchSpaceEnd, 32); while (true) { - Vector128 result = ContainsMask16Chars(charMapLower, charMapUpper, ref cur); + Vector256 result = ContainsMask32CharsAvx2(charMapLower256, charMapUpper256, ref cur); - if (result != Vector128.Zero) + if (result != Vector256.Zero) { + // Account for how ContainsMask32CharsAvx2 packed the source chars (Avx2.PackUnsignedSaturate). 
+ result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); + uint mask = result.ExtractMostSignificantBits(); do { @@ -389,47 +382,66 @@ internal static int IndexOfAny(ref uint charMap, ref char searchSpace, while (mask != 0); } - cur = ref Unsafe.Add(ref cur, 16); + cur = ref Unsafe.Add(ref cur, 32); - if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVector)) + if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVectorAvx2)) { if (Unsafe.AreSame(ref cur, ref searchSpaceEnd)) { - break; + return -1; } - // Adjust the current vector and do one last iteration. - cur = ref lastStartVector; + if (Unsafe.ByteOffset(ref cur, ref searchSpaceEnd) > 16 * sizeof(char)) + { + // If we have more than 16 characters left to process, we can + // adjust the current vector and do one last iteration of Avx2. + cur = ref lastStartVectorAvx2; + } + else + { + // Otherwise adjust the vector such that we'll only need to do a single + // iteration of ContainsMask16Chars below. + cur = ref Unsafe.Subtract(ref searchSpaceEnd, 16); + break; + } } } } - else + + ref char lastStartVector = ref Unsafe.Subtract(ref searchSpaceEnd, 16); + + while (true) { - while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) + Vector128 result = ContainsMask16Chars(charMapLower, charMapUpper, ref cur); + + if (result != Vector128.Zero) { - int ch = cur; - if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch))) + uint mask = result.ExtractMostSignificantBits(); + do { - return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); - } + ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask)); - cur = ref Unsafe.Add(ref cur, 1); + if (Contains(values, candidatePos)) + { + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char)); + } + + mask = BitOperations.ResetLowestSetBit(mask); + } + while (mask != 0); } - } - return -1; - } + cur = ref Unsafe.Add(ref cur, 16); - 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int LastIndexOfAny(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) - where TNegator : struct, IndexOfAnyAsciiSearcher.INegator - { - for (int i = searchSpaceLength - 1; i >= 0; i--) - { - int ch = Unsafe.Add(ref searchSpace, i); - if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch))) + if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVector)) { - return i; + if (Unsafe.AreSame(ref cur, ref searchSpaceEnd)) + { + break; + } + + // Adjust the current vector and do one last iteration. + cur = ref lastStartVector; } } From 22953700f9e0976e4a00e34ccf69a19cf346e1a1 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Fri, 24 Feb 2023 00:23:19 +0100 Subject: [PATCH 7/9] Replace another ShiftRightLogical with '>>>' --- .../src/System/IndexOfAnyValues/ProbabilisticMap.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs index 4ec500680041de..fe4560dc6c0dd6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs @@ -127,6 +127,7 @@ private static Vector256 ContainsMask32CharsAvx2(Vector256 charMapLo [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector256 IsCharBitNotSetAvx2(Vector256 charMapLower, Vector256 charMapUpper, Vector256 values) { + // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564 Vector256 highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15); Vector256 bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble); @@ -163,9 +164,10 @@ private static Vector128 ContainsMask16Chars(Vector128 
charMapLower, [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 IsCharBitNotSet(Vector128 charMapLower, Vector128 charMapUpper, Vector128 values) { + // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564 Vector128 highNibble = Sse2.IsSupported ? (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15) - : AdvSimd.ShiftRightLogical(values, VectorizedIndexShift); + : values >>> VectorizedIndexShift; Vector128 bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble); From a753250f7bf161678f53aac8b374bddf9b8997eb Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Wed, 8 Mar 2023 20:49:10 +0100 Subject: [PATCH 8/9] Add WASM path to Vector128.ShuffleUnsafe --- .../src/System/Runtime/Intrinsics/Vector128.cs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index a27900a1480b43..35c5dac4be9502 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -6,6 +6,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; namespace System.Runtime.Intrinsics @@ -2452,7 +2453,12 @@ internal static Vector128 ShuffleUnsafe(Vector128 vector, Vector128< return AdvSimd.Arm64.VectorTableLookup(vector, indices); } - throw new PlatformNotSupportedException(); + if (PackedSimd.IsSupported) + { + return PackedSimd.Swizzle(vector, indices); + } + + return Shuffle(vector, indices); } /// Creates a new vector by selecting values from an input vector using a set of indices. 
From 978b4e858704ba9efeb21a80386abebf0b792432 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Thu, 9 Mar 2023 21:58:58 +0100 Subject: [PATCH 9/9] PR feedback --- .../IndexOfAnyValues/ProbabilisticMap.cs | 20 +++++++++---------- .../System/Runtime/Intrinsics/Vector128.cs | 3 +++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs index fe4560dc6c0dd6..2b13c5b19ce7e3 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs @@ -118,14 +118,14 @@ private static Vector256 ContainsMask32CharsAvx2(Vector256 charMapLo (source0 >>> 8).AsInt16(), (source1 >>> 8).AsInt16()); - Vector256 resultLower = IsCharBitNotSetAvx2(charMapLower, charMapUpper, sourceLower); - Vector256 resultUpper = IsCharBitNotSetAvx2(charMapLower, charMapUpper, sourceUpper); + Vector256 resultLower = IsCharBitSetAvx2(charMapLower, charMapUpper, sourceLower); + Vector256 resultUpper = IsCharBitSetAvx2(charMapLower, charMapUpper, sourceUpper); - return ~(resultLower | resultUpper); + return resultLower & resultUpper; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 IsCharBitNotSetAvx2(Vector256 charMapLower, Vector256 charMapUpper, Vector256 values) + private static Vector256 IsCharBitSetAvx2(Vector256 charMapLower, Vector256 charMapUpper, Vector256 values) { // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564 Vector256 highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15); @@ -138,7 +138,7 @@ private static Vector256 IsCharBitNotSetAvx2(Vector256 charMapLower, Vector256 mask = Vector256.GreaterThan(index, Vector256.Create((byte)15)); Vector256 bitMask = 
Vector256.ConditionalSelect(mask, bitMaskUpper, bitMaskLower); - return Vector256.Equals(bitMask & bitPositions, Vector256.Zero); + return ~Vector256.Equals(bitMask & bitPositions, Vector256.Zero); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -155,14 +155,14 @@ private static Vector128 ContainsMask16Chars(Vector128 charMapLower, ? Sse2.PackUnsignedSaturate((source0 >>> 8).AsInt16(), (source1 >>> 8).AsInt16()) : AdvSimd.Arm64.UnzipOdd(source0.AsByte(), source1.AsByte()); - Vector128 resultLower = IsCharBitNotSet(charMapLower, charMapUpper, sourceLower); - Vector128 resultUpper = IsCharBitNotSet(charMapLower, charMapUpper, sourceUpper); + Vector128 resultLower = IsCharBitSet(charMapLower, charMapUpper, sourceLower); + Vector128 resultUpper = IsCharBitSet(charMapLower, charMapUpper, sourceUpper); - return ~(resultLower | resultUpper); + return resultLower & resultUpper; } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector128 IsCharBitNotSet(Vector128 charMapLower, Vector128 charMapUpper, Vector128 values) + private static Vector128 IsCharBitSet(Vector128 charMapLower, Vector128 charMapUpper, Vector128 values) { // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564 Vector128 highNibble = Sse2.IsSupported @@ -177,7 +177,7 @@ private static Vector128 IsCharBitNotSet(Vector128 charMapLower, Vec Vector128 mask = Vector128.GreaterThan(index, Vector128.Create((byte)15)); Vector128 bitMask = Vector128.ConditionalSelect(mask, bitMaskUpper, bitMaskLower); - return Vector128.Equals(bitMask & bitPositions, Vector128.Zero); + return ~Vector128.Equals(bitMask & bitPositions, Vector128.Zero); } [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 35c5dac4be9502..a8853d950e1cd6 100644 --- 
a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -2440,6 +2440,9 @@ public static Vector128 Shuffle(Vector128 vector, Vector128 /// The input vector from which values are selected. /// The per-element indices used to select a value from <paramref name="vector"/>. /// A new vector containing the values from <paramref name="vector"/> selected by the given <paramref name="indices"/>. + /// Unlike Shuffle, this method delegates to the underlying hardware intrinsic without ensuring that <paramref name="indices"/> are normalized to [0, 15]. + /// On hardware with <see cref="Ssse3"/> support, indices are treated as modulo 16, and if the high bit is set, the result will be set to 0 for that element. + /// On hardware with <see cref="AdvSimd.Arm64"/> or <see cref="PackedSimd"/> support, this method behaves the same as Shuffle. [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static Vector128 ShuffleUnsafe(Vector128 vector, Vector128 indices) {