From 09fd60ce163de0a950035bb98587197c3cc52534 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 19 Oct 2021 10:29:49 -0400 Subject: [PATCH 1/3] Add limited support for backtracking Regex single char loops to simplified code gen In .NET 5, we added simpler compiled code gen for regexes that didn't entail backtracking (or that had only very constrained backtracking, such as a top-level alternation). In our corpus of ~90K regular expressions, that code generator is employed for ~40% of them. The primary purpose of adding that code generator initially was performance, as it was able to avoid lots of the expense that the original code generator had, especially for simple regexes. However, with the source generator, it's much more valuable to use this code gen as the generated code is human-readable and really helps to understand how the regex is operating, is much more easily debugged, etc. This change allows the simplified code gen to be used even if there are backtracking single-character loops in the regex, as long as those loops are in a top-level concatenation (or a simple grouping structure like a capture). This increases the percentage of expressions in our corpus that will use the simplified code gen to ~65%. Once we have the simplified loop code gen, it's also a lot easier to add in vectorization of searching for the next location to back off to based on a literal that comes immediately after the loop (e.g. "abc.*def"). This adds support to both RegexOptions.Compiled and the source generator to use LastIndexOf in that case. The change also entailed adding/updating a few recursive functions. The plan has been to adopt the same model as in System.Linq.Expressions, Roslyn, and elsewhere, where we fork processing to continue on a secondary thread, rather than trying to enforce some max depth or rewrite as iterative, so I've done that as part of this change as well. 
--- .../gen/RegexGenerator.Emitter.cs | 411 ++++++++++----- ...m.Text.RegularExpressions.Generator.csproj | 1 + .../src/System.Text.RegularExpressions.csproj | 4 +- .../Text/RegularExpressions/RegexCompiler.cs | 377 ++++++++------ .../Text/RegularExpressions/RegexNode.cs | 471 +++++++++++------- .../Symbolic/RegexNodeToSymbolicConverter.cs | 7 +- .../Symbolic/StackHelper.cs | 31 -- .../Symbolic/SymbolicRegexNode.cs | 26 +- .../src/System/Threading/StackHelper.cs | 82 +++ .../tests/Regex.Match.Tests.cs | 18 +- .../tests/RegexReductionTests.cs | 19 +- 11 files changed, 927 insertions(+), 520 deletions(-) delete mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs create mode 100644 src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 1acada27f5179a..ec5b2ad5a09a11 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -668,18 +668,44 @@ void EmitAnchorAndLeadingChecks() private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id) { Debug.Assert(rm.Code.Tree.Root.Type == RegexNode.Capture); + if ((rm.Options & RegexOptions.NonBacktracking) != 0) { EmitNonBacktrackingGo(writer, rm, id); + return; } - else if (RegexNode.NodeSupportsSimplifiedCodeGenerationImplementation(rm.Code.Tree.Root.Child(0), RegexNode.DefaultMaxRecursionDepth) && - (((RegexOptions)rm.Code.Tree.Root.Options) & RegexOptions.RightToLeft) == 0) + RegexNode root = rm.Code.Tree.Root; + if (!ExceedsMaxDepthForSimpleCodeGeneration(root) && + root.Child(0).SupportsSimplifiedCodeGenerationImplementation() && + (((RegexOptions)root.Options) & RegexOptions.RightToLeft) == 0) { EmitSimplifiedGo(writer, rm, id); + 
return; } - else + + EmitCompleteGo(writer, rm, id); + + // Deep RegexNode trees used with the simplified code generator can result in + // emitting C# code that exceeds C# compiler limitations, leading to "CS8078: An + // expression is too long or complex to compile". Place an artificial limit on + // max tree depth in order to mitigate such issues. + static bool ExceedsMaxDepthForSimpleCodeGeneration(RegexNode node, int maxDepth = 30) { - EmitCompleteGo(writer, rm, id); + if (maxDepth <= 0) + { + return true; + } + + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (ExceedsMaxDepthForSimpleCodeGeneration(node.Child(i), maxDepth - 1)) + { + return true; + } + } + + return false; } } @@ -698,8 +724,8 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, bool rtl = code.RightToLeft; bool hasTimeout = false; - int nextLocalId = 0; - string GetNextLocalId() => $"i{nextLocalId++}"; ; + int localCounter = 0; + string NextLocalName(string prefix) => $"{prefix}{localCounter++}"; RegexNode node = rm.Code.Tree.Root; Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node"); @@ -708,6 +734,11 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, // Skip the Capture node. We handle the implicit root capture specially. node = node.Child(0); + // If there's any backtracking in the expression, nodes may emit labels that their peers + // need to jump to. Scopes (which we emit for readability) get in the way of that. As such, + // for nodes that emit such labels, we emit faux, commented-out scopes instead. + HashSet<RegexNode> nodesWithCrossScopeLabels = NodesWithCrossScopeLabels(node); + // In some limited cases, FindFirstChar will only return true if it successfully matched the whole thing. // This is the case, in particular, for strings. We can special case these to do essentially nothing // in Go other than emit the capture. 
@@ -750,16 +781,15 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, LoadTextSpanLocal(writer, defineLocal: true); writer.WriteLine(); - int localCounter = 0; - string NextLocalName(string prefix) => $"{prefix}{localCounter++}"; - int labelCounter = 0; - string DefineLabel() => $"L{labelCounter++}"; + string DefineLabel(string prefix = "L") => $"{prefix}{labelCounter++}"; void MarkLabel(string label) => writer.WriteLine($"{label}:"); void Goto(string label) => writer.WriteLine($"goto {label};"); string doneLabel = "NoMatch"; + string originalDoneLabel = doneLabel; // Emit the code for all nodes in the tree. + bool expressionHasCaptures = (node.Options & RegexNode.HasCapturesFlag) != 0; EmitNode(node); // Emit success @@ -775,10 +805,10 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, // Emit failure writer.WriteLine("// No match"); - MarkLabel(doneLabel); - if ((node.Options & RegexNode.HasCapturesFlag) != 0) + MarkLabel(originalDoneLabel); + if (expressionHasCaptures) { - writer.WriteLine("while (base.Crawlpos() != 0) base.Uncapture();"); + EmitUncaptureUntil("0"); } else { @@ -945,7 +975,7 @@ static RegexNode CloneMultiWithoutFirstChar(RegexNode node) void EmitAllBranches() { // Label to jump to when any branch completes successfully. - string doneAlternateLabel = DefineLabel(); + string doneAlternateLabel = DefineLabel("Match"); // Save off runtextpos. We'll need to reset this each time a branch fails. string startingRunTextPosName = NextLocalName("startingRunTextPos"); @@ -957,9 +987,10 @@ void EmitAllBranches() // as the alternation is atomic, so we're not concerned about captures after // the alternation. 
bool hasStartingCrawlpos = (node.Options & RegexNode.HasCapturesFlag) != 0; + string startingCrawlPos = NextLocalName("startingCrawlPos"); if (hasStartingCrawlpos) { - writer.WriteLine("int startingCrawlpos = base.Crawlpos();"); + writer.WriteLine($"int {startingCrawlPos} = base.Crawlpos();"); } writer.WriteLine(); @@ -971,7 +1002,7 @@ void EmitAllBranches() { using var __ = EmitScope(writer, $"Branch {i}"); - string nextBranch = DefineLabel(); + string nextBranch = DefineLabel("NoMatch"); doneLabel = nextBranch; // Emit the code for each branch. @@ -994,7 +1025,7 @@ void EmitAllBranches() textSpanPos = startingTextSpanPos; if (hasStartingCrawlpos) { - EmitUncaptureUntil(); + EmitUncaptureUntil(startingCrawlPos); } } @@ -1005,14 +1036,14 @@ void EmitAllBranches() { if (hasStartingCrawlpos) { - string uncapture = DefineLabel(); + string uncapture = DefineLabel("Uncapture"); doneLabel = uncapture; EmitNode(node.Child(childCount - 1)); doneLabel = postAlternateDoneLabel; TransferTextSpanPosToRunTextPos(); writer.WriteLine($"goto {doneAlternateLabel};"); MarkLabel(uncapture); - EmitUncaptureUntil(); + EmitUncaptureUntil(startingCrawlPos); writer.WriteLine($"goto {doneLabel};"); } else @@ -1031,7 +1062,7 @@ void EmitAllBranches() } // Emits the code for a Capture node. - void EmitCapture(RegexNode node) + void EmitCapture(RegexNode node, RegexNode? subsequent = null) { Debug.Assert(node.N == -1); @@ -1049,16 +1080,19 @@ void EmitCapture(RegexNode node) writer.WriteLine($"int {startingRunTextPosName} = runtextpos;"); // Emit child node. - EmitNode(node.Child(0)); + EmitNode(node.Child(0), subsequent); TransferTextSpanPosToRunTextPos(); writer.WriteLine($"base.Capture({capnum}, {startingRunTextPosName}, runtextpos);"); } // Emits code to unwind the capture stack until the crawl position specified in the provided local. 
- void EmitUncaptureUntil() + void EmitUncaptureUntil(string crawlpos) { - writer.WriteLine("while (base.Crawlpos() != startingCrawlpos) base.Uncapture();"); + using (EmitBlock(writer, $"while (base.Crawlpos() != {crawlpos})")) + { + writer.WriteLine("base.Uncapture();"); + } } // Emits the code to handle a positive lookahead assertion. @@ -1088,7 +1122,8 @@ void EmitNegativeLookaheadAssertion(RegexNode node) int startingTextSpanPos = textSpanPos; string originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + string negativeLookaheadDoneLabel = DefineLabel("Match"); + doneLabel = negativeLookaheadDoneLabel; // Emit the child. EmitNode(node.Child(0)); @@ -1098,7 +1133,8 @@ void EmitNegativeLookaheadAssertion(RegexNode node) Goto(originalDoneLabel); // Failures (success for a negative lookahead) jump here. - MarkLabel(doneLabel); + MarkLabel(negativeLookaheadDoneLabel); + Debug.Assert(doneLabel == negativeLookaheadDoneLabel); doneLabel = originalDoneLabel; // After the child completes in failure (success for negative lookahead), reset the text positions. @@ -1110,8 +1146,14 @@ void EmitNegativeLookaheadAssertion(RegexNode node) static string DescribeNode(RegexNode node) => SymbolDisplay.FormatLiteral(node.Description(), quote: false); // Emits the code for the node. - void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) + void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + StackHelper.CallOnEmptyStack(EmitNode, node, subsequent, emitLengthChecksIfRequired); + return; + } + // Separate out several node types that, for conciseness, don't need a header and scope written into the source. 
switch (node.Type) { @@ -1119,12 +1161,13 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) return; case RegexNode.Atomic: - EmitNode(node.Child(0)); + EmitNode(node.Child(0), subsequent); return; } - // Put the node's code into its own scope - using var _ = EmitScope(writer, DescribeNode(node)); + // Put the node's code into its own scope. If the node contains labels that may need to + // be visible outside of its scope, the scope is still emitted for clarity but is commented out. + using var _ = EmitScope(writer, DescribeNode(node), nodesWithCrossScopeLabels.Contains(node)); switch (node.Type) { @@ -1180,12 +1223,15 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) break; case RegexNode.Oneloop: - case RegexNode.Onelazy: case RegexNode.Notoneloop: - case RegexNode.Notonelazy: case RegexNode.Setloop: + EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); + break; + + case RegexNode.Onelazy: + case RegexNode.Notonelazy: case RegexNode.Setlazy: - EmitSingleCharRepeater(node, emitLengthChecksIfRequired); + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); break; case RegexNode.Concatenate: @@ -1199,19 +1245,20 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) for (; i < exclusiveEnd; i++) { - EmitNode(node.Child(i), emitLengthChecksIfRequired: false); + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); } i--; - continue; } - - EmitNode(node.Child(i), emitLengthChecksIfRequired); + else + { + EmitNode(node.Child(i), i + 1 < childCount ? 
node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired); + } } break; case RegexNode.Capture: - EmitCapture(node); + EmitCapture(node, subsequent); break; case RegexNode.Require: @@ -1236,6 +1283,47 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) } } + /// <summary> + /// Provides a set of all the nodes in the node tree that contains a node + /// which triggers backtracking and thus may emit labels that peer nodes need + /// to be able to see. + /// </summary> + static HashSet<RegexNode> NodesWithCrossScopeLabels(RegexNode node) + { + var results = new HashSet<RegexNode>(); + NodesWithCrossScopeLabels(node, results); + return results; + + static bool NodesWithCrossScopeLabels(RegexNode node, HashSet<RegexNode> results) + { + // Nodes that trigger backtracking and thus may emit labels that need to be reached by non-descendants. + bool contains = node.InstigatesBacktracking; + + if (!contains) + { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + // Rather than forking to another thread, just say this has cross-scope labels. + // The effect is simply more faux scopes output. + return true; + } + + int childcount = node.ChildCount(); + for (int i = 0; i < childcount; i++) + { + contains |= NodesWithCrossScopeLabels(node.Child(i), results); + } + } + + if (contains) + { + results.Add(node); + } + + return contains; + } + } + // Emits the code to handle updating base.runtextpos to runtextpos in response to // an UpdateBumpalong node. This is used when we want to inform the scan loop that // it should bump from this location rather than from the original location. @@ -1248,33 +1336,20 @@ void EmitUpdateBumpalong() // Emits the code to handle a single-character match. void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? 
offset = null) { - string expr = $"{textSpanLocal}[{Sum(textSpanPos, offset)}]"; - switch (node.Type) - { - // This only emits a single check, but it's called from the looping constructs in a loop - // to generate the code for a single check, so we map those looping constructs to the - // appropriate single check. + // This only emits a single check, but it's called from the looping constructs in a loop + // to generate the code for a single check, so we map those looping constructs to the + // appropriate single check. - case RegexNode.Set: - case RegexNode.Setlazy: - case RegexNode.Setloop: - case RegexNode.Setloopatomic: - expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node))}"; - break; - - case RegexNode.One: - case RegexNode.Onelazy: - case RegexNode.Oneloop: - case RegexNode.Oneloopatomic: - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} != {Literal(node.Ch)}"; - break; + string expr = $"{textSpanLocal}[{Sum(textSpanPos, offset)}]"; - default: - Debug.Assert(node.Type == RegexNode.Notone || node.Type == RegexNode.Notonelazy || node.Type == RegexNode.Notoneloop || node.Type == RegexNode.Notoneloopatomic); - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} == {Literal(node.Ch)}"; - break; + if (node.IsSetFamily) + { + expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node))}"; + } + else + { + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = $"{expr} {(node.IsOneFamily ? "!=" : "==")} {Literal(node.Ch)}"; } using (EmitBlock(writer, emitLengthCheck ? 
$"if ({SpanLengthCheck(1, offset)} || {expr})" : $"if ({expr})")) @@ -1393,10 +1468,6 @@ void EmitMultiChar(RegexNode node, bool emitLengthCheck = true) { // Unroll shorter strings. - // TODO: This might employ 64-bit operations on a 32-bit machine. Decide if avoiding that - // is worth adding further complexity for (RegexOptions.Compiled doesn't have to deal with - // this, as the machine generating the code in-memory is the same one running it.) - // For strings more than two characters and when performing case-sensitive searches, we try to do fewer comparisons // by comparing 2 or 4 characters at a time. Because we might be compiling on one endianness and running on another, // both little and big endian values are emitted and which is used is selected at run-time. @@ -1476,7 +1547,7 @@ void EmitOr() else { EmitSpanLengthCheck(str.Length); - string i = GetNextLocalId(); + string i = NextLocalName("i"); using (EmitBlock(writer, $"for (int {i} = 0; {i} < {Literal(node.Str)}.Length; {i}++)")) { using (EmitBlock(writer, $"if ({ToLower(hasTextInfo, options, $"{textSpanLocal}[{textSpanPos} + {i}]")} != {Literal(str)}[{i}])")) @@ -1489,9 +1560,90 @@ void EmitOr() } } + void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) + { + // If this is actually a repeater, emit that instead; no backtracking necessary. + if (node.M == node.N) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + return; + } + + // Emit backtracking around an atomic single char loop. We can then implement the backtracking + // as an afterthought, since we know exactly how many characters are accepted by each iteration + // of the wrapped loop (1). 
+ + Debug.Assert(node.M < node.N); + string backtrackingLabel = DefineLabel("Backtrack"); + string endLoop = DefineLabel("EndLoop"); + string startingPos = NextLocalName("startingRunTextPos"); + string endingPos = NextLocalName("endingRunTextPos"); + string crawlPos = NextLocalName("crawlPos"); + + // We're about to enter a loop, so ensure our text position is 0. + TransferTextSpanPosToRunTextPos(); + + // Grab the current position, then emit the loop as atomic, and then + // grab the current position again. Even though we emit the loop without + // knowledge of backtracking, we can layer it on top by just walking back + // through the individual characters (a benefit of the loop matching exactly + // one character per iteration, no possible captures within the loop, etc.) + writer.WriteLine($"int {startingPos} = runtextpos;"); + EmitSingleCharAtomicLoop(node); + TransferTextSpanPosToRunTextPos(); + writer.WriteLine($"int {endingPos} = runtextpos;"); + writer.WriteLine($"int {crawlPos} = base.Crawlpos();"); + if (node.M > 0) + { + writer.WriteLine($"{startingPos} += {node.M};"); + } + writer.WriteLine($"goto {endLoop};"); + writer.WriteLine(); + + // Backtracking section. Subsequent failures will jump to here, at which + // point we decrement the matched count as long as it's above the minimum + // required, and try again by flowing to everything that comes after this. + MarkLabel(backtrackingLabel); + string originalDoneLabel = doneLabel; + using (EmitBlock(writer, $"if ({startingPos} >= {endingPos})")) + { + writer.WriteLine($"goto {originalDoneLabel};"); + } + doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes + + if (expressionHasCaptures) + { + // Uncapture any captures if the expression has any. It's possible the captures it has + // are before this node, in which case this is wasted effort, but still functionally correct. 
+ EmitUncaptureUntil(crawlPos); + } + + if (subsequent?.FindStartingCharacter() is char subsequentCharacter) + { + writer.WriteLine($"{endingPos} = runtext.LastIndexOf({Literal(subsequentCharacter)}, {endingPos} - 1, {endingPos} - {startingPos});"); + using (EmitBlock(writer, $"if ({endingPos} < 0)")) + { + writer.WriteLine($"goto {originalDoneLabel};"); + } + writer.WriteLine($"runtextpos = {endingPos};"); + } + else + { + writer.WriteLine($"runtextpos = --{endingPos};"); + } + + LoadTextSpanLocal(writer); + writer.WriteLine(); + + MarkLabel(endLoop); + + // We explicitly do not reset doneLabel back to originalDoneLabel. + // It's left pointing to the backtracking label for everything subsequent in the expression. + } + // Emits the code to handle a loop (repeater) with a fixed number of iterations. // RegexNode.M is used for the number of iterations; RegexNode.N is ignored. - void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) + void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthCheck = true) { int iterations = node.M; if (iterations == 0) @@ -1525,7 +1677,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) { string spanLocal = "slice"; // As this repeater doesn't wrap arbitrary node emits, this shouldn't conflict with anything writer.WriteLine($"global::System.ReadOnlySpan<char> {spanLocal} = {textSpanLocal}.Slice({textSpanPos}, {iterations});"); - string i = GetNextLocalId(); + string i = NextLocalName("i"); using (EmitBlock(writer, $"for (int {i} = 0; {i} < {spanLocal}.Length; {i}++)")) { EmitTimeoutCheck(writer, hasTimeout); @@ -1561,7 +1713,7 @@ void EmitNodeRepeater(RegexNode node) // Ensure textSpanPos is 0 prior to emitting the child. 
TransferTextSpanPosToRunTextPos(); - string i = GetNextLocalId(); + string i = NextLocalName("i"); using (EmitBlock(writer, $"for (int {i} = 0; {i} < {iterations}; {i}++)")) { EmitTimeoutCheck(writer, hasTimeout); @@ -1574,15 +1726,10 @@ void EmitNodeRepeater(RegexNode node) // Emits the code to handle a non-backtracking, variable-length loop around a single character comparison. void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = true) { - Debug.Assert( - node.Type == RegexNode.Oneloopatomic || - node.Type == RegexNode.Notoneloopatomic || - node.Type == RegexNode.Setloopatomic); - // If this is actually a repeater, emit that instead. if (node.M == node.N) { - EmitSingleCharRepeater(node, emitLengthChecksIfRequired); + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); return; } @@ -1600,12 +1747,12 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = Span<char> setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny int numSetChars = 0; - string iterationLocal = "i"; // No need for a dynamically named value, as no other 'i' can be in scope - if (node.Type == RegexNode.Notoneloopatomic && + string iterationLocal = NextLocalName("i"); + if (node.IsNotoneFamily && maxIterations == int.MaxValue && (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch))) { - // For Notoneloopatomic, we're looking for a specific character, as everything until we find + // For Notone, we're looking for a specific character, as everything until we find // it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, // we can use the vectorized IndexOf to do the search, rather than open-coding it. 
The unbounded // restriction is purely for simplicity; it could be removed in the future with additional code to @@ -1625,14 +1772,14 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = $"{iterationLocal} = {textSpanLocal}.Length;"); } } - else if (node.Type == RegexNode.Setloopatomic && + else if (node.IsSetFamily && maxIterations == int.MaxValue && !IsCaseInsensitive(node) && (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 && RegexCharClass.IsNegated(node.Str!)) { // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would - // have been reduced to a Notoneloopatomic), we can use an IndexOfAny to find any of the target characters. + // have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters. // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. writer.Write($"int {iterationLocal} = global::System.MemoryExtensions.IndexOfAny({textSpanLocal}"); @@ -1650,10 +1797,10 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = $"{iterationLocal} = {textSpanLocal}.Length;"); } } - else if (node.Type == RegexNode.Setloopatomic && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) + else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) { // .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end. - // The unbounded constraint is the same as in the Notoneloopatomic case above, done purely for simplicity. + // The unbounded constraint is the same as in the Notone case above, done purely for simplicity. // int i = runtextend - runtextpos; TransferTextSpanPosToRunTextPos(); @@ -1664,22 +1811,26 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = // For everything else, do a normal loop. 
string expr = $"{textSpanLocal}[{iterationLocal}]"; - switch (node.Type) + if (node.IsSetFamily) { - case RegexNode.Oneloopatomic: - case RegexNode.Notoneloopatomic: - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} {(node.Type == RegexNode.Oneloopatomic ? "==" : "!=")} {Literal(node.Ch)}"; - break; - case RegexNode.Setloopatomic: - expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node)); - break; + expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node)); + } + else + { + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; } - // Transfer text pos to runtextpos to help with bounds check elimination on the loop. - TransferTextSpanPosToRunTextPos(); + if (minIterations != 0 || maxIterations != int.MaxValue) + { + // For any loops other than * loops, transfer text pos to runtextpos in + // order to zero it out to be able to use the single iteration variable + // for both iteration count and indexer. + TransferTextSpanPosToRunTextPos(); + } - writer.WriteLine($"int {iterationLocal} = 0;"); + writer.WriteLine($"int {iterationLocal} = {textSpanPos};"); + textSpanPos = 0; string maxClause = maxIterations != int.MaxValue ? $"{iterationLocal} < {maxIterations} && " : ""; using (EmitBlock(writer, $"while ({maxClause}(uint){iterationLocal} < (uint){textSpanLocal}.Length && {expr})")) @@ -1708,29 +1859,17 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = // Emits the code to handle a non-backtracking optional zero-or-one loop. 
void EmitAtomicSingleCharZeroOrOne(RegexNode node) { - string skipUpdatesLabel = DefineLabel(); - - Debug.Assert( - node.Type == RegexNode.Oneloopatomic || - node.Type == RegexNode.Notoneloopatomic || - node.Type == RegexNode.Setloopatomic); Debug.Assert(node.M == 0 && node.N == 1); string expr = $"{textSpanLocal}[{textSpanPos}]"; - switch (node.Type) + if (node.IsSetFamily) { - case RegexNode.Oneloopatomic: - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} == {Literal(node.Ch)}"; - break; - case RegexNode.Notoneloopatomic: - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} != {Literal(node.Ch)}"; - break; - case RegexNode.Setloopatomic: - expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node)); - expr = $"{expr}"; - break; + expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node)); + } + else + { + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; } using (EmitBlock(writer, $"if ((uint){textSpanPos} < (uint){textSpanLocal}.Length && {expr})")) @@ -1757,7 +1896,8 @@ void EmitAtomicNodeLoop(RegexNode node) } string originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + string atomicNodeLabel = DefineLabel("NoMatch"); + doneLabel = atomicNodeLabel; // We might loop any number of times. In order to ensure this loop // and subsequent code sees textSpanPos the same regardless, we always need it to contain @@ -1772,12 +1912,13 @@ void EmitAtomicNodeLoop(RegexNode node) using (EmitBlock(writer, maxIterations == int.MaxValue ? 
"while (true)" : $"while ({iterationLocal} < {maxIterations})")) { EmitTimeoutCheck(writer, hasTimeout); - string successfulIterationLabel = DefineLabel(); + string successfulIterationLabel = DefineLabel("Match"); // Iteration body string prevDone = doneLabel; - doneLabel = DefineLabel(); + string iterationLabel = DefineLabel("NoMatch"); + doneLabel = iterationLabel; // Save off runtextpos. string startingRunTextPosLocal = NextLocalName("startingRunTextPos"); @@ -1791,7 +1932,8 @@ void EmitAtomicNodeLoop(RegexNode node) // If the generated code gets here, the iteration failed. // Reset state, branch to done. - MarkLabel(doneLabel); + MarkLabel(iterationLabel); + Debug.Assert(doneLabel == iterationLabel); doneLabel = prevDone; // reset done label writer.WriteLine($"runtextpos = {startingRunTextPosLocal};"); Goto(doneLabel); @@ -1801,26 +1943,23 @@ void EmitAtomicNodeLoop(RegexNode node) writer.WriteLine($"{iterationLocal}++;"); } + // Done: + MarkLabel(atomicNodeLabel); + Debug.Assert(doneLabel == atomicNodeLabel); + doneLabel = originalDoneLabel; + // Check to ensure we've found at least min iterations. if (minIterations > 0) { - // Done: - MarkLabel(doneLabel); - doneLabel = originalDoneLabel; // Restore the original done label using (EmitBlock(writer, $"if ({iterationLocal} < {minIterations})")) { - writer.WriteLine($"goto {doneLabel};"); + writer.WriteLine($"goto {originalDoneLabel};"); } } - - // We can't have a label in front of a closing brace, so if we didn't emit the label - // earlier, emit now that we've closed out the scope. - if (minIterations <= 0) + else { - // Done: - MarkLabel(doneLabel); + // Labels require a statement after them. 
writer.WriteLine(";"); - doneLabel = originalDoneLabel; // Restore the original done label } } } @@ -3261,17 +3400,17 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options private static string Literal(string s) => SymbolDisplay.FormatLiteral(s, quote: true); - private static FinishEmitScope EmitScope(IndentedTextWriter writer, string title) => EmitBlock(writer, $"// {title}", appendBlankLine: true); + private static FinishEmitScope EmitScope(IndentedTextWriter writer, string title, bool faux = false) => EmitBlock(writer, $"// {title}", appendBlankLine: true, faux); - private static FinishEmitScope EmitBlock(IndentedTextWriter writer, string? clause, bool appendBlankLine = false) + private static FinishEmitScope EmitBlock(IndentedTextWriter writer, string? clause, bool appendBlankLine = false, bool faux = false) { if (clause is not null) { writer.WriteLine(clause); } - writer.WriteLine("{"); + writer.WriteLine(faux ? "//{" : "{"); writer.Indent++; - return new FinishEmitScope(writer, appendBlankLine); + return new FinishEmitScope(writer, appendBlankLine, faux); } private static void EmitAdd(IndentedTextWriter writer, string variable, int value) @@ -3293,11 +3432,13 @@ private static void EmitAdd(IndentedTextWriter writer, string variable, int valu { private readonly IndentedTextWriter _writer; private readonly bool _appendBlankLine; + private readonly bool _faux; - public FinishEmitScope(IndentedTextWriter writer, bool appendBlankLine) + public FinishEmitScope(IndentedTextWriter writer, bool appendBlankLine, bool faux) { _writer = writer; _appendBlankLine = appendBlankLine; + _faux = faux; } public void Dispose() @@ -3305,7 +3446,7 @@ public void Dispose() if (_writer is not null) { _writer.Indent--; - _writer.WriteLine("}"); + _writer.WriteLine(_faux ? 
"//}" : "}"); if (_appendBlankLine) { _writer.WriteLine(); diff --git a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj index cb7e816f76f4c5..7f59e37493cd85 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj +++ b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj @@ -29,6 +29,7 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index f6da67980d2d18..17f1d0fc877e64 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -7,6 +7,7 @@ + @@ -17,8 +18,8 @@ - + @@ -53,7 +54,6 @@ - diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index fabb0020108ddc..f8cae4f7b7e932 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -8,6 +8,7 @@ using System.Reflection; using System.Reflection.Emit; using System.Runtime.InteropServices; +using System.Threading; namespace System.Text.RegularExpressions { @@ -67,6 +68,7 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) 
})!; private static readonly MethodInfo s_stringIndexOfCharInt = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int) })!; + private static readonly MethodInfo s_stringLastIndexOfCharIntInt = typeof(string).GetMethod("LastIndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!; private static readonly MethodInfo s_textInfoToLowerMethod = typeof(TextInfo).GetMethod("ToLower", new Type[] { typeof(char) })!; protected ILGenerator? _ilg; @@ -1676,7 +1678,7 @@ protected void GenerateFindFirstChar() } } - private bool TryGenerateNonBacktrackingGo(RegexNode node) + private bool TryGenerateSimplifiedGo(RegexNode node) { Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node"); Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child"); @@ -1689,7 +1691,7 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) // Skip the Capture node. We handle the implicit root capture specially. node = node.Child(0); - if (!RegexNode.NodeSupportsSimplifiedCodeGenerationImplementation(node, maxDepth: RegexNode.DefaultMaxRecursionDepth)) + if (!node.SupportsSimplifiedCodeGenerationImplementation()) { return false; } @@ -1741,6 +1743,7 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) LocalBuilder runtextendLocal = DeclareInt32(); Label stopSuccessLabel = DefineLabel(); Label doneLabel = DefineLabel(); + Label originalDoneLabel = doneLabel; if (_hasTimeout) { _loopTimeoutCounterLocal = DeclareInt32(); @@ -1771,6 +1774,7 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) LoadTextSpanLocal(); // Emit the code for all nodes in the tree. + bool expressionHasCaptures = (node.Options & RegexNode.HasCapturesFlag) != 0; EmitNode(node); // Success: @@ -1795,14 +1799,14 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) Call(s_captureMethod); // If the graph contained captures, undo any remaining to handle failed matches. 
- if ((node.Options & RegexNode.HasCapturesFlag) != 0) + if (expressionHasCaptures) { // while (Crawlpos() != 0) Uncapture(); Label finalReturnLabel = DefineLabel(); Br(finalReturnLabel); - MarkLabel(doneLabel); + MarkLabel(originalDoneLabel); Label condition = DefineLabel(); Label body = DefineLabel(); Br(condition); @@ -1820,7 +1824,7 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) else { // Done: - MarkLabel(doneLabel); + MarkLabel(originalDoneLabel); } // return; @@ -1936,7 +1940,7 @@ void EmitAtomicAlternate(RegexNode node) // BranchN(); // jumps to Done on failure // Save off runtextpos. We'll need to reset this each time a branch fails. - using RentedLocalBuilder startingRunTextPos = RentInt32Local(); + LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); int startingTextSpanPos = textSpanPos; @@ -1945,10 +1949,10 @@ void EmitAtomicAlternate(RegexNode node) // state. Note that this is only about subexpressions within the alternation, // as the alternation is atomic, so we're not concerned about captures after // the alternation. - RentedLocalBuilder? startingCrawlpos = null; + LocalBuilder? startingCrawlpos = null; if ((node.Options & RegexNode.HasCapturesFlag) != 0) { - startingCrawlpos = RentInt32Local(); + startingCrawlpos = DeclareInt32(); Ldthis(); Call(s_crawlposMethod); Stloc(startingCrawlpos); @@ -1959,7 +1963,7 @@ void EmitAtomicAlternate(RegexNode node) // A failure in a branch other than the last should jump to the next // branch, not to the final done. 
- Label postAlternateDone = doneLabel; + Label originalDoneLabel = doneLabel; int childCount = node.ChildCount(); for (int i = 0; i < childCount - 1; i++) @@ -2000,7 +2004,7 @@ void EmitAtomicAlternate(RegexNode node) Label uncapture = DefineLabel(); doneLabel = uncapture; EmitNode(node.Child(childCount - 1)); - doneLabel = postAlternateDone; + doneLabel = originalDoneLabel; TransferTextSpanPosToRunTextPos(); Br(doneAlternate); @@ -2010,24 +2014,21 @@ void EmitAtomicAlternate(RegexNode node) } else { - doneLabel = postAlternateDone; + doneLabel = originalDoneLabel; EmitNode(node.Child(childCount - 1)); TransferTextSpanPosToRunTextPos(); } // Successfully completed the alternate. MarkLabel(doneAlternate); - - startingCrawlpos?.Dispose(); - Debug.Assert(textSpanPos == 0); } // Emits the code for a Capture node. - void EmitCapture(RegexNode node) + void EmitCapture(RegexNode node, RegexNode? subsequent = null) { Debug.Assert(node.N == -1); - using RentedLocalBuilder startingRunTextPos = RentInt32Local(); + LocalBuilder startingRunTextPos = DeclareInt32(); // Get the capture number. This needs to be kept // in sync with MapCapNum in RegexWriter. @@ -2047,7 +2048,7 @@ void EmitCapture(RegexNode node) Stloc(startingRunTextPos); // Emit child node. - EmitNode(node.Child(0)); + EmitNode(node.Child(0), subsequent); // runtextpos += textSpanPos; // textSpan = textSpan.Slice(textSpanPos); @@ -2083,7 +2084,7 @@ void EmitUncaptureUntil(LocalBuilder startingCrawlpos) void EmitPositiveLookaheadAssertion(RegexNode node) { // Save off runtextpos. We'll need to reset this upon successful completion of the lookahead. - using RentedLocalBuilder startingRunTextPos = RentInt32Local(); + LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); int startingTextSpanPos = textSpanPos; @@ -2103,13 +2104,14 @@ void EmitPositiveLookaheadAssertion(RegexNode node) void EmitNegativeLookaheadAssertion(RegexNode node) { // Save off runtextpos. 
We'll need to reset this upon successful completion of the lookahead. - using RentedLocalBuilder startingRunTextPos = RentInt32Local(); + LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); int startingTextSpanPos = textSpanPos; Label originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + Label negativeLookaheadDoneLabel = DefineLabel(); + doneLabel = negativeLookaheadDoneLabel; // Emit the child. EmitNode(node.Child(0)); @@ -2119,7 +2121,8 @@ void EmitNegativeLookaheadAssertion(RegexNode node) BrFar(originalDoneLabel); // Failures (success for a negative lookahead) jump here. - MarkLabel(doneLabel); + MarkLabel(negativeLookaheadDoneLabel); + Debug.Assert(doneLabel == negativeLookaheadDoneLabel); doneLabel = originalDoneLabel; // After the child completes in failure (success for negative lookahead), reset the text positions. @@ -2130,8 +2133,14 @@ void EmitNegativeLookaheadAssertion(RegexNode node) } // Emits the code for the node. - void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) + void EmitNode(RegexNode node, RegexNode? 
subsequent = null, bool emitLengthChecksIfRequired = true) { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + StackHelper.CallOnEmptyStack(EmitNode, node, subsequent, emitLengthChecksIfRequired); + return; + } + switch (node.Type) { case RegexNode.One: @@ -2182,7 +2191,7 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) break; case RegexNode.Atomic: - EmitNode(node.Child(0)); + EmitNode(node.Child(0), subsequent); break; case RegexNode.Alternate: @@ -2190,12 +2199,15 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) break; case RegexNode.Oneloop: - case RegexNode.Onelazy: case RegexNode.Notoneloop: - case RegexNode.Notonelazy: case RegexNode.Setloop: + EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); + break; + + case RegexNode.Onelazy: + case RegexNode.Notonelazy: case RegexNode.Setlazy: - EmitSingleCharRepeater(node, emitLengthChecksIfRequired); + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); break; case RegexNode.Concatenate: @@ -2207,19 +2219,19 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) EmitSpanLengthCheck(requiredLength); for (; i < exclusiveEnd; i++) { - EmitNode(node.Child(i), emitLengthChecksIfRequired: false); + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); } i--; continue; } - EmitNode(node.Child(i)); + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent); } break; case RegexNode.Capture: - EmitCapture(node); + EmitCapture(node, subsequent); break; case RegexNode.Require: @@ -2263,6 +2275,10 @@ void EmitUpdateBumpalong() // Emits the code to handle a single-character match. void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? 
offset = null) { + // This only emits a single check, but it's called from the looping constructs in a loop + // to generate the code for a single check, so we check for each "family" (one, notone, set) + // rather than only for the specific single character nodes. + // if ((uint)(textSpanPos + offset) >= textSpan.Length || textSpan[textSpanPos + offset] != ch) goto Done; if (emitLengthCheck) { @@ -2272,41 +2288,26 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? o EmitSum(textSpanPos, offset); Call(s_spanGetItemMethod); LdindU2(); - switch (node.Type) + if (node.IsSetFamily) { - // This only emits a single check, but it's called from the looping constructs in a loop - // to generate the code for a single check, so we map those looping constructs to the - // appropriate single check. - - case RegexNode.Set: - case RegexNode.Setlazy: - case RegexNode.Setloop: - case RegexNode.Setloopatomic: - EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); - BrfalseFar(doneLabel); - break; - - case RegexNode.One: - case RegexNode.Onelazy: - case RegexNode.Oneloop: - case RegexNode.Oneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); + EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); + BrfalseFar(doneLabel); + } + else + { + if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + { + CallToLower(); + } + Ldc(node.Ch); + if (node.IsOneFamily) + { BneFar(doneLabel); - break; - - default: - Debug.Assert(node.Type == RegexNode.Notone || node.Type == RegexNode.Notonelazy || node.Type == RegexNode.Notoneloop || node.Type == RegexNode.Notoneloopatomic); - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); + } + else // IsNotoneFamily + { BeqFar(doneLabel); - break; + } } textSpanPos++; @@ -2536,9 +2537,109 @@ void EmitMultiChar(RegexNode 
node, bool emitLengthCheck = true) } } + // Emits the code to handle a backtracking, single-character loop. + void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) + { + // If this is actually a repeater, emit that instead; no backtracking necessary. + if (node.M == node.N) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + return; + } + + Debug.Assert(node.M < node.N); + Label backtrackingLabel = DefineLabel(); + Label endLoop = DefineLabel(); + LocalBuilder startingPos = DeclareInt32(); + LocalBuilder endingPos = DeclareInt32(); + LocalBuilder crawlPos = DeclareInt32(); + + // We're about to enter a loop, so ensure our text position is 0. + TransferTextSpanPosToRunTextPos(); + + // int startingPos = runtextpos; + // Single char atomic loop + // int endingPos = runtextpos; + // int crawlPos = base.Crawlpos(); + // startingPos += node.M; + // goto endLoop; + Ldloc(runtextposLocal); + Stloc(startingPos); + EmitSingleCharAtomicLoop(node); + TransferTextSpanPosToRunTextPos(); + Ldloc(runtextposLocal); + Stloc(endingPos); + Ldthis(); + Call(s_crawlposMethod); + Stloc(crawlPos); + if (node.M > 0) + { + Ldloc(startingPos); + Ldc(node.M); + Add(); + Stloc(startingPos); + } + Br(endLoop); + + // Backtracking: + // if (startingPos >= endingPos) goto doneLabel; + MarkLabel(backtrackingLabel); + Ldloc(startingPos); + Ldloc(endingPos); + BgeFar(doneLabel); + doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes + + // while (base.Crawlpos() != crawlPos) Uncapture(); + if (expressionHasCaptures) + { + // Uncapture any captures if the expression has any. It's possible the captures it has + // are before this node, in which case this is wasted effort, but still functionally correct. 
+ EmitUncaptureUntil(crawlPos); + } + + if (subsequent?.FindStartingCharacter() is char subsequentCharacter) + { + // endingPos = runtext.LastIndexOf(subsequentCharacter, endingPos - 1, endingPos - startingPos); + // if (endingPos < 0) + // { + // goto doneLabel; + // } + Ldloc(runtextLocal); + Ldc(subsequentCharacter); + Ldloc(endingPos); + Ldc(1); + Sub(); + Ldloc(endingPos); + Ldloc(startingPos); + Sub(); + Call(s_stringLastIndexOfCharIntInt); + Stloc(endingPos); + Ldloc(endingPos); + Ldc(0); + BltFar(doneLabel); + } + else + { + // endingPos--; + Ldloc(endingPos); + Ldc(1); + Sub(); + Stloc(endingPos); + } + + // runtextpos = endingPos; + Ldloc(endingPos); + Stloc(runtextposLocal); + + // textspan = runtext.AsSpan(runtextpos, runtextend - runtextpos); + LoadTextSpanLocal(); + + MarkLabel(endLoop); + } + // Emits the code to handle a loop (repeater) with a fixed number of iterations. // RegexNode.M is used for the number of iterations; RegexNode.N is ignored. - void EmitSingleCharRepeater(RegexNode node, bool emitLengthChecksIfRequired = true) + void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired = true) { int iterations = node.M; @@ -2648,7 +2749,7 @@ void EmitNodeRepeater(RegexNode node) Label conditionLabel = DefineLabel(); Label bodyLabel = DefineLabel(); - using RentedLocalBuilder iterationLocal = RentInt32Local(); + LocalBuilder iterationLocal = DeclareInt32(); Ldc(0); Stloc(iterationLocal); BrFar(conditionLabel); @@ -2675,15 +2776,10 @@ void EmitNodeRepeater(RegexNode node) // Emits the code to handle a non-backtracking, variable-length loop around a single character comparison. void EmitSingleCharAtomicLoop(RegexNode node) { - Debug.Assert( - node.Type == RegexNode.Oneloopatomic || - node.Type == RegexNode.Notoneloopatomic || - node.Type == RegexNode.Setloopatomic); - // If this is actually a repeater, emit that instead. 
if (node.M == node.N) { - EmitSingleCharRepeater(node); + EmitSingleCharFixedRepeater(node); return; } @@ -2700,17 +2796,16 @@ void EmitSingleCharAtomicLoop(RegexNode node) using RentedLocalBuilder iterationLocal = RentInt32Local(); - Label originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + Label atomicLoopDoneLabel = DefineLabel(); Span setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny int numSetChars = 0; - if (node.Type == RegexNode.Notoneloopatomic && + if (node.IsNotoneFamily && maxIterations == int.MaxValue && (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch))) { - // For Notoneloopatomic, we're looking for a specific character, as everything until we find + // For Notone, we're looking for a specific character, as everything until we find // it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, // we can use the vectorized IndexOf to do the search, rather than open-coding it. The unbounded // restriction is purely for simplicity; it could be removed in the future with additional code to @@ -2731,10 +2826,10 @@ void EmitSingleCharAtomicLoop(RegexNode node) Call(s_spanIndexOf); Stloc(iterationLocal); - // if (i != -1) goto doneLabel; + // if (i >= 0) goto atomicLoopDoneLabel; Ldloc(iterationLocal); - Ldc(-1); - BneFar(doneLabel); + Ldc(0); + BgeFar(atomicLoopDoneLabel); // i = textSpan.Length - textSpanPos; Ldloca(textSpanLocal); @@ -2746,14 +2841,14 @@ void EmitSingleCharAtomicLoop(RegexNode node) } Stloc(iterationLocal); } - else if (node.Type == RegexNode.Setloopatomic && + else if (node.IsSetFamily && maxIterations == int.MaxValue && !IsCaseInsensitive(node) && (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 && RegexCharClass.IsNegated(node.Str!)) { // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would - // have been reduced to a Notoneloopatomic), we can use an IndexOfAny to find 
any of the target characters. + // have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters. // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. // int i = textSpan.Slice(textSpanPos).IndexOfAny(ch1, ch2{, ch3}); @@ -2781,10 +2876,10 @@ void EmitSingleCharAtomicLoop(RegexNode node) } Stloc(iterationLocal); - // if (i != -1) goto doneLabel; + // if (i >= 0) goto atomicLoopDoneLabel; Ldloc(iterationLocal); - Ldc(-1); - BneFar(doneLabel); + Ldc(0); + BgeFar(atomicLoopDoneLabel); // i = textSpan.Length - textSpanPos; Ldloca(textSpanLocal); @@ -2796,10 +2891,10 @@ void EmitSingleCharAtomicLoop(RegexNode node) } Stloc(iterationLocal); } - else if (node.Type == RegexNode.Setloopatomic && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) + else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) { // .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end. - // The unbounded constraint is the same as in the Notoneloopatomic case above, done purely for simplicity. + // The unbounded constraint is the same as in the Notone case above, done purely for simplicity. 
// int i = runtextend - runtextpos; TransferTextSpanPosToRunTextPos(); @@ -2828,39 +2923,37 @@ void EmitSingleCharAtomicLoop(RegexNode node) MarkLabel(bodyLabel); EmitTimeoutCheck(); - // if ((uint)i >= (uint)textSpan.Length) goto doneLabel; + // if ((uint)i >= (uint)textSpan.Length) goto atomicLoopDoneLabel; Ldloc(iterationLocal); Ldloca(textSpanLocal); Call(s_spanGetLengthMethod); - BgeUnFar(doneLabel); + BgeUnFar(atomicLoopDoneLabel); - // if (textSpan[i] != ch) goto Done; + // if (textSpan[i] != ch) goto atomicLoopDoneLabel; Ldloca(textSpanLocal); Ldloc(iterationLocal); Call(s_spanGetItemMethod); LdindU2(); - switch (node.Type) + if (node.IsSetFamily) { - case RegexNode.Oneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); - BneFar(doneLabel); - break; - case RegexNode.Notoneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); - BeqFar(doneLabel); - break; - case RegexNode.Setloopatomic: - EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); - BrfalseFar(doneLabel); - break; + EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); + BrfalseFar(atomicLoopDoneLabel); + } + else + { + if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + { + CallToLower(); + } + Ldc(node.Ch); + if (node.IsOneFamily) + { + BneFar(atomicLoopDoneLabel); + } + else // IsNotoneFamily + { + BeqFar(atomicLoopDoneLabel); + } } // i++; @@ -2869,7 +2962,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) Add(); Stloc(iterationLocal); - // if (i >= maxIterations) goto doneLabel; + // if (i >= maxIterations) goto atomicLoopDoneLabel; MarkLabel(conditionLabel); if (maxIterations != int.MaxValue) { @@ -2884,8 +2977,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) } // Done: - MarkLabel(doneLabel); - doneLabel = originalDoneLabel; // Restore the original done label + 
MarkLabel(atomicLoopDoneLabel); // Check to ensure we've found at least min iterations. if (minIterations > 0) @@ -2914,10 +3006,6 @@ void EmitSingleCharAtomicLoop(RegexNode node) // Emits the code to handle a non-backtracking optional zero-or-one loop. void EmitAtomicSingleCharZeroOrOne(RegexNode node) { - Debug.Assert( - node.Type == RegexNode.Oneloopatomic || - node.Type == RegexNode.Notoneloopatomic || - node.Type == RegexNode.Setloopatomic); Debug.Assert(node.M == 0 && node.N == 1); Label skipUpdatesLabel = DefineLabel(); @@ -2933,28 +3021,26 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) Ldc(textSpanPos); Call(s_spanGetItemMethod); LdindU2(); - switch (node.Type) + if (node.IsSetFamily) { - case RegexNode.Oneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); + EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); + BrfalseFar(skipUpdatesLabel); + } + else + { + if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + { + CallToLower(); + } + Ldc(node.Ch); + if (node.IsOneFamily) + { BneFar(skipUpdatesLabel); - break; - case RegexNode.Notoneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); + } + else // IsNotoneFamily + { BeqFar(skipUpdatesLabel); - break; - case RegexNode.Setloopatomic: - EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); - BrfalseFar(skipUpdatesLabel); - break; + } } // textSpan = textSpan.Slice(1); @@ -2986,11 +3072,12 @@ void EmitAtomicNodeLoop(RegexNode node) return; } - using RentedLocalBuilder iterationLocal = RentInt32Local(); - using RentedLocalBuilder startingRunTextPosLocal = RentInt32Local(); + LocalBuilder iterationLocal = DeclareInt32(); + LocalBuilder startingRunTextPosLocal = DeclareInt32(); Label originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + Label atomicNodeLabel = DefineLabel(); + 
doneLabel = atomicNodeLabel; // We might loop any number of times. In order to ensure this loop // and subsequent code sees textSpanPos the same regardless, we always need it to contain @@ -3021,7 +3108,8 @@ void EmitAtomicNodeLoop(RegexNode node) Label successfulIterationLabel = DefineLabel(); Label prevDone = doneLabel; - doneLabel = DefineLabel(); + Label iterationDone = DefineLabel(); + doneLabel = iterationDone; // Save off runtextpos. Ldloc(runtextposLocal); @@ -3035,8 +3123,10 @@ void EmitAtomicNodeLoop(RegexNode node) // If the generated code gets here, the iteration failed. // Reset state, branch to done. - MarkLabel(doneLabel); - doneLabel = prevDone; // reset done label + MarkLabel(iterationDone); + Debug.Assert(doneLabel == iterationDone); + doneLabel = prevDone; + Ldloc(startingRunTextPosLocal); Stloc(runtextposLocal); BrFar(doneLabel); @@ -3064,8 +3154,9 @@ void EmitAtomicNodeLoop(RegexNode node) } // Done: - MarkLabel(doneLabel); - doneLabel = originalDoneLabel; // Restore the original done label + MarkLabel(atomicNodeLabel); + Debug.Assert(doneLabel == atomicNodeLabel); + doneLabel = originalDoneLabel; // Check to ensure we've found at least min iterations. if (minIterations > 0) @@ -3084,8 +3175,8 @@ protected void GenerateGo() _int32LocalsPool?.Clear(); _readOnlySpanCharLocalsPool?.Clear(); - // Generate backtrack-free code when we're dealing with simpler regexes. - if (TryGenerateNonBacktrackingGo(_code.Tree.Root)) + // Generate simpler code when we're dealing with simpler regexes. 
+ if (TryGenerateSimplifiedGo(_code.Tree.Root)) { return; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 282483e8d90657..ef7b16700d0c0b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -41,6 +41,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Threading; namespace System.Text.RegularExpressions { @@ -103,8 +104,6 @@ internal sealed class RegexNode public const int Testref = 33; // (?(n) | ) - alternation, reference public const int Testgroup = 34; // (?(...) | )- alternation, expression - internal const byte DefaultMaxRecursionDepth = 20; // arbitrary cut-off to avoid unbounded recursion - /// empty bit from the node's options to store data on whether a node contains captures internal const RegexOptions HasCapturesFlag = (RegexOptions)(1 << 31); @@ -309,7 +308,7 @@ internal RegexNode FinalOptimize() // If we find backtracking construct at the end of the regex, we can instead make it non-backtracking, // since nothing would ever backtrack into it anyway. Doing this then makes the construct available // to implementations that don't support backtracking. - EliminateEndingBacktracking(rootNode, DefaultMaxRecursionDepth); + rootNode.EliminateEndingBacktracking(); // Optimization: unnecessary re-processing of starting loops. // If an expression is guaranteed to begin with a single-character unbounded loop that isn't part of an alternation (in which case it @@ -368,27 +367,29 @@ internal RegexNode FinalOptimize() return rootNode; } - /// Converts nodes at the end of the specified node tree to be atomic. + /// Converts nodes at the end of the node tree to be atomic. 
/// /// The correctness of this optimization depends on nothing being able to backtrack into /// the provided node. That means it must be at the root of the overall expression, or /// it must be an Atomic node that nothing will backtrack into by the very nature of Atomic. /// - private static void EliminateEndingBacktracking(RegexNode node, uint maxDepth) + private void EliminateEndingBacktracking() { - if (maxDepth == 0) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { + // If we can't recur further, just stop optimizing. return; } // RegexOptions.NonBacktracking doesn't support atomic groups, so when that option // is set we don't want to create atomic groups where they weren't explicitly authored. - if ((node.Options & RegexOptions.NonBacktracking) != 0) + if ((Options & RegexOptions.NonBacktracking) != 0) { return; } - // Walk the tree starting from the provided node. + // Walk the tree starting from the current node. + RegexNode node = this; while (true) { switch (node.Type) @@ -433,7 +434,7 @@ private static void EliminateEndingBacktracking(RegexNode node, uint maxDepth) int branches = node.ChildCount(); for (int i = 1; i < branches; i++) { - EliminateEndingBacktracking(node.Child(i), maxDepth - 1); + node.Child(i).EliminateEndingBacktracking(); } } node = node.Child(0); @@ -444,7 +445,7 @@ private static void EliminateEndingBacktracking(RegexNode node, uint maxDepth) // e.g. (?:abc*)* => (?:ab(?>c*))* case Loop: { - RegexNode? loopDescendent = FindLastExpressionInLoopForAutoAtomic(node, maxDepth - 1); + RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic(); if (loopDescendent != null) { node = loopDescendent; @@ -601,6 +602,7 @@ private RegexNode ReduceAtomic() // Alternations have a variety of possible optimizations that can be applied // iff they're atomic. case Alternate: + if ((Options & RegexOptions.RightToLeft) == 0) { List? 
branches = child.Children as List; Debug.Assert(branches is not null && branches.Count != 0); @@ -709,7 +711,7 @@ private RegexNode ReduceAtomic() // For everything else, try to reduce ending backtracking of the last contained expression. default: - EliminateEndingBacktracking(child, DefaultMaxRecursionDepth); + child.EliminateEndingBacktracking(); return atomic; } } @@ -1198,9 +1200,56 @@ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan startingSpan) public char FirstCharOfOneOrMulti() { Debug.Assert(Type is One or Multi); + Debug.Assert((Options & RegexOptions.RightToLeft) == 0); return Type == One ? Ch : Str![0]; } + /// Finds the guaranteed beginning character of the node, or null if none exists. + public char? FindStartingCharacter() + { + RegexNode? node = this; + while (true) + { + if (node is null || (node.Options & RegexOptions.RightToLeft) != 0) + { + return null; + } + + char c; + switch (node.Type) + { + case One: + case Oneloop or Oneloopatomic or Onelazy when node.M > 0: + c = node.Ch; + break; + + case Multi: + c = node.Str![0]; + break; + + case Atomic: + case Concatenate: + case Capture: + case Group: + case Loop or Lazyloop when node.M > 0: + case Require: + node = node.Child(0); + continue; + + default: + return null; + } + + if ((node.Options & RegexOptions.IgnoreCase) == 0 || + !RegexCharClass.ParticipatesInCaseConversion(c)) + { + return c; + } + + return null; + } + } + /// /// Optimizes a concatenation by coalescing adjacent characters and strings, /// coalescing adjacent loops, converting loops to be atomic where applicable, @@ -1467,10 +1516,16 @@ private void ReduceConcatenationWithAutoAtomic() var children = (List)Children; for (int i = 0; i < children.Count - 1; i++) { - ProcessNode(children[i], children[i + 1], DefaultMaxRecursionDepth); + ProcessNode(children[i], children[i + 1]); - static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) + static void ProcessNode(RegexNode node, RegexNode subsequent) { + 
if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + // If we can't recur further, just stop optimizing. + return; + } + // Skip down the node past irrelevant nodes. while (true) { @@ -1487,7 +1542,7 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) // compatible for the optimization. if (node.Type == Loop) { - RegexNode? loopDescendent = FindLastExpressionInLoopForAutoAtomic(node, maxDepth - 1); + RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic(); if (loopDescendent != null) { node = loopDescendent; @@ -1502,9 +1557,9 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) // If the node can be changed to atomic based on what comes after it, do so. switch (node.Type) { - case Oneloop when CanBeMadeAtomic(node, subsequent, maxDepth - 1): - case Notoneloop when CanBeMadeAtomic(node, subsequent, maxDepth - 1): - case Setloop when CanBeMadeAtomic(node, subsequent, maxDepth - 1): + case Oneloop when CanBeMadeAtomic(node, subsequent): + case Notoneloop when CanBeMadeAtomic(node, subsequent): + case Setloop when CanBeMadeAtomic(node, subsequent): node.MakeLoopAtomic(); break; case Alternate: @@ -1518,7 +1573,7 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) int alternateBranches = node.ChildCount(); for (int b = 0; b < alternateBranches; b++) { - ProcessNode(node.Child(b), subsequent, maxDepth - 1); + ProcessNode(node.Child(b), subsequent); } } break; @@ -1532,8 +1587,16 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) /// that could be made atomic _assuming_ the conditions exist for it with the loop's ancestors. /// /// The found node that should be explored further for auto-atomicity; null if it doesn't exist. - private static RegexNode? FindLastExpressionInLoopForAutoAtomic(RegexNode node, uint maxDepth) + private RegexNode? 
FindLastExpressionInLoopForAutoAtomic() { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + // If we can't recur further, just stop optimizing. + return null; + } + + RegexNode node = this; + Debug.Assert(node.Type == Loop); // Start by looking at the loop's sole child. @@ -1555,7 +1618,7 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) { int concatCount = node.ChildCount(); RegexNode lastConcatChild = node.Child(concatCount - 1); - if (CanBeMadeAtomic(lastConcatChild, node.Child(0), maxDepth - 1)) + if (CanBeMadeAtomic(lastConcatChild, node.Child(0))) { return lastConcatChild; } @@ -1569,11 +1632,11 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) /// Determines whether node can be switched to an atomic loop. Subsequent is the node /// immediately after 'node'. /// - private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, uint maxDepth) + private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent) { - if (maxDepth == 0) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - // We hit our recursion limit. Just don't apply the optimization. + // If we can't recur further, just stop optimizing. return false; } @@ -1609,7 +1672,7 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, uint m int childCount = subsequent.ChildCount(); for (int i = 0; i < childCount; i++) { - if (!CanBeMadeAtomic(node, subsequent.Child(i), maxDepth - 1)) + if (!CanBeMadeAtomic(node, subsequent.Child(i))) { return false; } @@ -1697,111 +1760,106 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, uint m /// public int ComputeMinLength() { - return ComputeMinLength(this, DefaultMaxRecursionDepth); - - static int ComputeMinLength(RegexNode node, uint maxDepth) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - if (maxDepth == 0) - { - // Don't examine any further, as we've reached the max allowed depth. 
- return 0; - } + // If we can't recur further, assume there's no minimum we can enforce. + return 0; + } - switch (node.Type) - { - case One: - case Notone: - case Set: - // Single character. - return 1; + switch (Type) + { + case One: + case Notone: + case Set: + // Single character. + return 1; - case Multi: - // Every character in the string needs to match. - return node.Str!.Length; + case Multi: + // Every character in the string needs to match. + return Str!.Length; - case Notonelazy: - case Notoneloop: - case Notoneloopatomic: - case Onelazy: - case Oneloop: - case Oneloopatomic: - case Setlazy: - case Setloop: - case Setloopatomic: - // One character repeated at least M times. - return node.M; + case Notonelazy: + case Notoneloop: + case Notoneloopatomic: + case Onelazy: + case Oneloop: + case Oneloopatomic: + case Setlazy: + case Setloop: + case Setloopatomic: + // One character repeated at least M times. + return M; - case Lazyloop: - case Loop: - // A node graph repeated at least M times. - return (int)Math.Min(int.MaxValue, (long)node.M * ComputeMinLength(node.Child(0), maxDepth - 1)); + case Lazyloop: + case Loop: + // A node graph repeated at least M times. + return (int)Math.Min(int.MaxValue, (long)M * Child(0).ComputeMinLength()); - case Alternate: - // The minimum required length for any of the alternation's branches. + case Alternate: + // The minimum required length for any of the alternation's branches. 
+ { + int childCount = ChildCount(); + Debug.Assert(childCount >= 2); + int min = Child(0).ComputeMinLength(); + for (int i = 1; i < childCount && min > 0; i++) { - int childCount = node.ChildCount(); - Debug.Assert(childCount >= 2); - int min = ComputeMinLength(node.Child(0), maxDepth - 1); - for (int i = 1; i < childCount && min > 0; i++) - { - min = Math.Min(min, ComputeMinLength(node.Child(i), maxDepth - 1)); - } - return min; + min = Math.Min(min, Child(i).ComputeMinLength()); } + return min; + } - case Concatenate: - // The sum of all of the concatenation's children. + case Concatenate: + // The sum of all of the concatenation's children. + { + long sum = 0; + int childCount = ChildCount(); + for (int i = 0; i < childCount; i++) { - long sum = 0; - int childCount = node.ChildCount(); - for (int i = 0; i < childCount; i++) - { - sum += ComputeMinLength(node.Child(i), maxDepth - 1); - } - return (int)Math.Min(int.MaxValue, sum); + sum += Child(i).ComputeMinLength(); } + return (int)Math.Min(int.MaxValue, sum); + } - case Atomic: - case Capture: - case Group: - // For groups, we just delegate to the sole child. - Debug.Assert(node.ChildCount() == 1); - return ComputeMinLength(node.Child(0), maxDepth - 1); - - case Empty: - case Nothing: - case UpdateBumpalong: - // Nothing to match. In the future, we could potentially use Nothing to say that the min length - // is infinite, but that would require a different structure, as that would only apply if the - // Nothing match is required in all cases (rather than, say, as one branch of an alternation). - case Beginning: - case Bol: - case Boundary: - case ECMABoundary: - case End: - case EndZ: - case Eol: - case NonBoundary: - case NonECMABoundary: - case Start: - // Difficult to glean anything meaningful from boundaries or results only known at run time. 
- case Prevent: - case Require: - // Lookaheads/behinds could potentially be included in the future, but that will require - // a different structure, as they can't be added as part of a concatenation, since they overlap - // with what comes after. - case Ref: - case Testgroup: - case Testref: - // Constructs requiring data at runtime from the matching pattern can't influence min length. - return 0; + case Atomic: + case Capture: + case Group: + // For groups, we just delegate to the sole child. + Debug.Assert(ChildCount() == 1); + return Child(0).ComputeMinLength(); + + case Empty: + case Nothing: + case UpdateBumpalong: + // Nothing to match. In the future, we could potentially use Nothing to say that the min length + // is infinite, but that would require a different structure, as that would only apply if the + // Nothing match is required in all cases (rather than, say, as one branch of an alternation). + case Beginning: + case Bol: + case Boundary: + case ECMABoundary: + case End: + case EndZ: + case Eol: + case NonBoundary: + case NonECMABoundary: + case Start: + // Difficult to glean anything meaningful from boundaries or results only known at run time. + case Prevent: + case Require: + // Lookaheads/behinds could potentially be included in the future, but that will require + // a different structure, as they can't be added as part of a concatenation, since they overlap + // with what comes after. + case Ref: + case Testgroup: + case Testref: + // Constructs requiring data at runtime from the matching pattern can't influence min length. 
+ return 0; - default: + default: #if DEBUG - Debug.Fail($"Unknown node: {node.TypeName}"); + Debug.Fail($"Unknown node: {TypeName}"); #endif - goto case Empty; - } + goto case Empty; } } @@ -1826,11 +1884,11 @@ public bool TryGetJoinableLengthCheckChildRange(int childIndex, out int required { static bool CanJoinLengthCheck(RegexNode node) => node.Type switch { - RegexNode.One or RegexNode.Notone or RegexNode.Set => true, - RegexNode.Multi => true, - RegexNode.Oneloop or RegexNode.Onelazy or RegexNode.Oneloopatomic or - RegexNode.Notoneloop or RegexNode.Notonelazy or RegexNode.Notoneloopatomic or - RegexNode.Setloop or RegexNode.Setlazy or RegexNode.Setloopatomic when node.M == node.N => true, + One or Notone or Set => true, + Multi => true, + Oneloop or Onelazy or Oneloopatomic or + Notoneloop or Notonelazy or Notoneloopatomic or + Setloop or Setlazy or Setloopatomic when node.M == node.N => true, _ => false, }; @@ -1961,9 +2019,15 @@ public int ChildCount() return 1; } - // Determines whether the node supports an optimized implementation that doesn't allow for backtracking. - internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNode node, int maxDepth) + // Determines whether the node supports an optimized code gen strategy based on walking the node tree. + internal bool SupportsSimplifiedCodeGenerationImplementation() { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + // If we can't recur further, simplified code generation isn't supported as the tree is too deep. + return false; + } + bool supported = false; // We only support the default left-to-right, not right-to-left, which requires more complication in the generated code. 
@@ -1971,97 +2035,117 @@ internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNod // We also limit the recursion involved to prevent stack dives; this limitation can be removed by switching // away from a recursive implementation (done for convenience) to an iterative one that's more complicated // but within the same problems. - if ((node.Options & RegexOptions.RightToLeft) == 0 && maxDepth > 0) + if ((Options & RegexOptions.RightToLeft) == 0) { - int childCount = node.ChildCount(); - Debug.Assert((node.Options & HasCapturesFlag) == 0); + int childCount = ChildCount(); + Debug.Assert((Options & HasCapturesFlag) == 0); - switch (node.Type) + switch (Type) { // One/Notone/Set/Multi don't involve any repetition and are easily supported. - case RegexNode.One: - case RegexNode.Notone: - case RegexNode.Set: - case RegexNode.Multi: + case One: + case Notone: + case Set: + case Multi: // Boundaries are like set checks and don't involve repetition, either. - case RegexNode.Boundary: - case RegexNode.NonBoundary: - case RegexNode.ECMABoundary: - case RegexNode.NonECMABoundary: + case Boundary: + case NonBoundary: + case ECMABoundary: + case NonECMABoundary: // Anchors are also trivial. - case RegexNode.Beginning: - case RegexNode.Start: - case RegexNode.Bol: - case RegexNode.Eol: - case RegexNode.End: - case RegexNode.EndZ: + case Beginning: + case Start: + case Bol: + case Eol: + case End: + case EndZ: // {Set/One/Notone}loopatomic are optimized nodes that represent non-backtracking variable-length loops. // These consume their {Set/One} inputs as long as they match, and don't give up anything they // matched, which means we can support them without backtracking. - case RegexNode.Oneloopatomic: - case RegexNode.Notoneloopatomic: - case RegexNode.Setloopatomic: + case Oneloopatomic: + case Notoneloopatomic: + case Setloopatomic: // "Empty" is easy: nothing is emitted for it. // "Nothing" is also easy: it doesn't match anything. 
// "UpdateBumpalong" doesn't match anything, it's just an optional directive to the engine. - case RegexNode.Empty: - case RegexNode.Nothing: - case RegexNode.UpdateBumpalong: + case Empty: + case Nothing: + case UpdateBumpalong: supported = true; break; - // Repeaters don't require backtracking as long as their min and max are equal. - // At that point they're just a shorthand for writing out the One/Notone/Set - // that number of times. - case RegexNode.Oneloop: - case RegexNode.Notoneloop: - case RegexNode.Setloop: - Debug.Assert(node.Next == null || node.Next.Type != RegexNode.Atomic, "Loop should have been transformed into an atomic type."); - goto case RegexNode.Onelazy; - case RegexNode.Onelazy: - case RegexNode.Notonelazy: - case RegexNode.Setlazy: - supported = node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic); + // Single character greedy loops are supported if either they're actually a repeater + // or they're not contained in any construct other than simple nesting (e.g. concat, capture). + case Oneloop: + case Notoneloop: + case Setloop: + Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type."); + supported = M == N || AncestorsAllowBacktracking(Next); + static bool AncestorsAllowBacktracking(RegexNode? node) + { + while (node is not null) + { + switch (node.Type) + { + case Concatenate: + case Capture: + case Atomic: + node = node.Next; + break; + + default: + return false; + } + } + + return true; + } + break; + + case Onelazy: + case Notonelazy: + case Setlazy: + supported = M == N || (Next != null && Next.Type == Atomic); break; // {Lazy}Loop repeaters are the same, except their child also needs to be supported. // We also support such loops being atomic.
- case RegexNode.Loop: - case RegexNode.Lazyloop: + case Loop: + case Lazyloop: supported = - (node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic)) && - NodeSupportsSimplifiedCodeGenerationImplementation(node.Child(0), maxDepth - 1); + (M == N || (Next != null && Next.Type == Atomic)) && + Child(0).SupportsSimplifiedCodeGenerationImplementation(); break; // We can handle atomic as long as we can handle making its child atomic, or // its child doesn't have that concept. - case RegexNode.Atomic: + case Atomic: // Lookahead assertions also only require that the child node be supported. // The RightToLeft check earlier is important to differentiate lookbehind, // which is not supported. - case RegexNode.Require: - case RegexNode.Prevent: - supported = NodeSupportsSimplifiedCodeGenerationImplementation(node.Child(0), maxDepth - 1); + case Require: + case Prevent: + supported = Child(0).SupportsSimplifiedCodeGenerationImplementation(); break; // We can handle alternates as long as they're atomic (a root / global alternate is // effectively atomic, as nothing will try to backtrack into it as it's the last thing). // Its children must all also be supported. - case RegexNode.Alternate: - if (node.Next != null && - (node.IsAtomicByParent() || // atomic alternate - (node.Next.Type == RegexNode.Capture && node.Next.Next is null))) // root alternate + case Alternate: + if (Next != null && + (IsAtomicByParent() || // atomic alternate + (Next.Type == Capture && Next.Next is null))) // root alternate { - goto case RegexNode.Concatenate; + goto case Concatenate; } break; // Concatenation doesn't require backtracking as long as its children don't. 
- case RegexNode.Concatenate: + case Concatenate: supported = true; for (int i = 0; i < childCount; i++) { - if (supported && !NodeSupportsSimplifiedCodeGenerationImplementation(node.Child(i), maxDepth - 1)) + if (!Child(i).SupportsSimplifiedCodeGenerationImplementation()) { supported = false; break; @@ -2069,22 +2153,22 @@ internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNod } break; - case RegexNode.Capture: + case Capture: // Currently we only support capnums without uncapnums (for balancing groups) - supported = node.N == -1; + supported = N == -1; if (supported) { // And we only support them in certain places in the tree. - RegexNode? parent = node.Next; + RegexNode? parent = Next; while (parent != null) { switch (parent.Type) { - case RegexNode.Alternate: - case RegexNode.Atomic: - case RegexNode.Capture: - case RegexNode.Concatenate: - case RegexNode.Require: + case Alternate: + case Atomic: + case Capture: + case Concatenate: + case Require: parent = parent.Next; break; @@ -2098,13 +2182,13 @@ internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNod if (supported) { // And we only support them if their children are supported. - supported = NodeSupportsSimplifiedCodeGenerationImplementation(node.Child(0), maxDepth - 1); + supported = Child(0).SupportsSimplifiedCodeGenerationImplementation(); // If we've found a supported capture, mark all of the nodes in its parent // hierarchy as containing a capture. 
if (supported) { - parent = node; + parent = this; while (parent != null && ((parent.Options & HasCapturesFlag) == 0)) { parent.Options |= HasCapturesFlag; @@ -2117,14 +2201,33 @@ internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNod } } #if DEBUG - if (!supported && (node.Options & RegexOptions.Debug) != 0) + if (!supported && (Options & RegexOptions.Debug) != 0) { - Debug.WriteLine($"Unable to use non-backtracking code gen: node {node.Description()} isn't supported."); + Debug.WriteLine($"Unable to use non-backtracking code gen: node {Description()} isn't supported."); } #endif return supported; } + /// Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node. + public bool IsSetFamily => Type is Set or Setloop or Setloopatomic or Setlazy; + + /// Gets whether the node is a One/Oneloop/Oneloopatomic/Onelazy node. + public bool IsOneFamily => Type is One or Oneloop or Oneloopatomic or Onelazy; + + /// Gets whether the node is a Notone/Notoneloop/Notoneloopatomic/Notonelazy node. + public bool IsNotoneFamily => Type is Notone or Notoneloop or Notoneloopatomic or Notonelazy; + + /// Gets whether this node may be a source of backtracking. 
+ public bool InstigatesBacktracking => + Type switch + { + Oneloop or Notoneloop or Setloop or Onelazy or Notonelazy or Setlazy or Loop or Lazyloop when !IsAtomicByParent() && M != N => true, + Alternate => !IsAtomicByParent(), + Ref or Testref or Testgroup => true, + _ => false, + }; + private string TypeName => Type switch { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs index 7b88e04f29c969..d183a7eea6e453 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Globalization; using System.Runtime.CompilerServices; +using System.Threading; namespace System.Text.RegularExpressions.Symbolic { @@ -201,11 +202,9 @@ BDD MapCategoryCodeToCondition(int code) => public SymbolicRegexNode Convert(RegexNode node, bool topLevel) { // Guard against stack overflow due to deep recursion - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - RegexNode localNode = node; - bool localTopLevel = topLevel; - return StackHelper.CallOnEmptyStack(() => Convert(localNode, localTopLevel)); + return StackHelper.CallOnEmptyStack(Convert, node, topLevel); } switch (node.Type) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs deleted file mode 100644 index 254c0d5e28dfff..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs +++ 
/dev/null @@ -1,31 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Threading; -using System.Threading.Tasks; - -namespace System.Text.RegularExpressions.Symbolic -{ - /// Provides tools for avoiding stack overflows. - internal static class StackHelper - { - // Queues the supplied delegate to the thread pool, then block waiting for it to complete. - // It does so in a way that prevents task inlining (which would defeat the purpose) but that - // also plays nicely with the thread pool's sync-over-async aggressive thread injection policies. - - /// Calls the provided function on the stack of a different thread pool thread. - /// The return type of the function. - /// The function to invoke. - public static T CallOnEmptyStack(Func func) => - Task.Run(func) - .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) - .GetAwaiter().GetResult(); - - /// Calls the provided action on the stack of a different thread pool thread. - /// The action to invoke. 
- public static void CallOnEmptyStack(Action action) => - Task.Run(action) - .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) - .GetAwaiter().GetResult(); - } -} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index fbd1cbda6ee753..3ae62fa919439a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; +using System.Threading; namespace System.Text.RegularExpressions.Symbolic { @@ -618,11 +619,10 @@ public SymbolicRegexNode Restrict(S pred) /// public int GetFixedLength() { - // Guard against stack overflow due to deep recursion. - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - SymbolicRegexNode thisRef = this; - return StackHelper.CallOnEmptyStack(() => thisRef.GetFixedLength()); + // If we can't recur further, assume no fixed length. 
+ return -1; } switch (_kind) @@ -690,11 +690,9 @@ public int GetFixedLength() internal SymbolicRegexNode MkDerivative(S elem, uint context) { // Guard against stack overflow due to deep recursion - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - S localElem = elem; - uint localContext = context; - return StackHelper.CallOnEmptyStack(() => MkDerivative(localElem, localContext)); + return StackHelper.CallOnEmptyStack(MkDerivative, elem, context); } if (this == _builder._anyStar || this == _builder._nothing) @@ -1100,10 +1098,9 @@ public override string ToString() internal void ToString(StringBuilder sb) { // Guard against stack overflow due to deep recursion - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - StringBuilder localSb = sb; - StackHelper.CallOnEmptyStack(() => ToString(localSb)); + StackHelper.CallOnEmptyStack(ToString, sb); return; } @@ -1665,12 +1662,9 @@ private S ComputeStartSet() internal SymbolicRegexNode PruneAnchors(uint prevKind, bool contWithWL, bool contWithNWL) { // Guard against stack overflow due to deep recursion - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - uint localPrevKind = prevKind; - bool localContWithWL = contWithWL; - bool localContWithNWL = contWithNWL; - return StackHelper.CallOnEmptyStack(() => PruneAnchors(localPrevKind, localContWithWL, localContWithNWL)); + return StackHelper.CallOnEmptyStack(PruneAnchors, prevKind, contWithWL, contWithNWL); } if (!_info.StartsWithSomeAnchor) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs new file mode 100644 index 00000000000000..1ec05eb7d3d76d --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs @@ -0,0 +1,82 
@@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Runtime.CompilerServices; +using System.Threading.Tasks; + +namespace System.Threading +{ + /// Provides tools for avoiding stack overflows. + internal static class StackHelper + { + /// Tries to ensure there is sufficient stack to execute the average .NET function. + public static bool TryEnsureSufficientExecutionStack() + { +#if REGEXGENERATOR + try + { + RuntimeHelpers.EnsureSufficientExecutionStack(); + return true; + } + catch + { + return false; + } +#else + return RuntimeHelpers.TryEnsureSufficientExecutionStack(); +#endif + } + + // Queues the supplied delegate to the thread pool, then blocks waiting for it to complete. + // It does so in a way that prevents task inlining (which would defeat the purpose) but that + // also plays nicely with the thread pool's sync-over-async aggressive thread injection policies. + + /// Calls the provided action on the stack of a different thread pool thread. + /// The type of the first argument to pass to the action. + /// The action to invoke. + /// The first argument to pass to the action. + public static void CallOnEmptyStack(Action action, TArg1 arg1) => + Task.Run(() => action(arg1)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + + /// Calls the provided action on the stack of a different thread pool thread. + /// The type of the first argument to pass to the action. + /// The type of the second argument to pass to the action. + /// The type of the third argument to pass to the action. + /// The action to invoke. + /// The first argument to pass to the action. + /// The second argument to pass to the action. + /// The third argument to pass to the action.
+ public static void CallOnEmptyStack(Action action, TArg1 arg1, TArg2 arg2, TArg3 arg3) => + Task.Run(() => action(arg1, arg2, arg3)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The return type of the function. + /// The function to invoke. + /// The first argument to pass to the function. + /// The second argument to pass to the function. + public static TResult CallOnEmptyStack(Func func, TArg1 arg1, TArg2 arg2) => + Task.Run(() => func(arg1, arg2)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. + /// The return type of the function. + /// The function to invoke. + /// The first argument to pass to the function. + /// The second argument to pass to the function. + /// The third argument to pass to the function.
+ public static TResult CallOnEmptyStack(Func func, TArg1 arg1, TArg2 arg2, TArg3 arg3) => + Task.Run(() => func(arg1, arg2, arg3)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + } +} diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 129b2c4bc217f9..ee71a6df32c946 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -91,6 +91,10 @@ public static IEnumerable Match_MemberData() // for it to be a success. For a correct match, remove the last character, '3' from the pattern yield return ("[^0-9]+(?>[0-9]+)3", "abc123", RegexOptions.None, 0, 6, false, string.Empty); yield return ("[^0-9]+(?>[0-9]+)", "abc123", RegexOptions.None, 0, 6, true, "abc123"); + + yield return (@"(?!.*a)\w*g", "bcaefg", RegexOptions.None, 0, 6, true, "efg"); + yield return (@"(?!.*a)\w*g", "aaaaag", RegexOptions.None, 0, 6, true, "g"); + yield return (@"(?!.*a)\w*g", "aaaaaa", RegexOptions.None, 0, 6, false, string.Empty); } // More nonbacktracking expressions @@ -175,6 +179,12 @@ public static IEnumerable Match_MemberData() yield return (@".*", "abc", lineOption, 2, 1, true, "c"); } + // Nested loops + if (!RegexHelpers.IsNonBacktracking(engine)) + { + yield return new object[] { engine, "a*(?:a[ab]*)*", "aaaababbbbbbabababababaaabbb", RegexOptions.None, 0, 28, true, "aaaa" }; + } + // Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z" yield return (@"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzz", RegexOptions.IgnoreCase, 0, 17, true, "aaaasdfajsdlfjzzz"); yield return (@"\Aaaaaa\w+zzz\Z", "aaaa", RegexOptions.IgnoreCase, 0, 4, false, string.Empty); @@ -344,7 +354,13 @@ public static IEnumerable Match_MemberData() yield return
("(?(cat)dog1|dog2)", "catdog1", RegexOptions.None, 0, 7, false, string.Empty); yield return ("(?(cat)dog1|dog2)", "catdog2", RegexOptions.None, 0, 7, true, "dog2"); yield return ("(?(cat)dog1|dog2)", "catdog1dog2", RegexOptions.None, 0, 11, true, "dog2"); + yield return (@"(\w+|\d+)a+[ab]+", "123123aa", RegexOptions.None, 0, 8, true, "123123aa"); + yield return ("(a|ab|abc|abcd)d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd"); + yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.None, 0, 4, false, string.Empty); + yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd"); } + yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest"); + yield return (@"a\w*a|def", "aaaaa", RegexOptions.None, 0, 5, true, "aaaaa"); // No Negation yield return ("[abcd-[abcd]]+", "abcxyzABCXYZ`!@#$%^&*()_-+= \t\n", RegexOptions.None, 0, 30, false, string.Empty); @@ -1584,7 +1600,7 @@ public static IEnumerable AllMatches_TestData() }; // Case insensitive cases by using ?i and some non-ASCII characters like Kelvin sign and applying ?i over negated character classes - yield return new object[] { engine, "(?i:[a-dÕ]+k*)", RegexOptions.None, "xyxaBõc\u212AKAyy", new (int, int, string)[] { (3, 6, "aBõc\u212AK"), (9, 1, "A") } }; + yield return new object[] { engine, "(?i:[a-d\u00D5]+k*)", RegexOptions.None, "xyxaB\u00F5c\u212AKAyy", new (int, int, string)[] { (3, 6, "aB\u00F5c\u212AK"), (9, 1, "A") } }; yield return new object[] { engine, "(?i:[a-d]+)", RegexOptions.None, "xyxaBcyy", new (int, int, string)[] { (3, 3, "aBc") } }; yield return new object[] { engine, "(?i:[\0-@B-\uFFFF]+)", RegexOptions.None, "xaAaAy", new (int, int, string)[] { (0, 6, "xaAaAy") } }; // this is the same as .+ yield return new object[] { engine, "(?i:[\0-ac-\uFFFF])", RegexOptions.None, "b", new (int, int, string)[] { (0, 1, "b") } }; diff --git 
a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs index 49876011b281c1..ad5ca8d0754d98 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs @@ -500,15 +500,26 @@ public void PatternsReduceDifferently(string pattern1, string pattern2) [InlineData(@"abcd(? Date: Wed, 20 Oct 2021 17:27:04 -0400 Subject: [PATCH 2/3] Address PR feedback --- .../gen/RegexGenerator.Emitter.cs | 21 +++++++------------ .../Text/RegularExpressions/RegexNode.cs | 6 ------ .../Text/RegularExpressions/RegexParser.cs | 10 +-------- .../tests/Regex.Match.Tests.cs | 2 +- 4 files changed, 10 insertions(+), 29 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index ec5b2ad5a09a11..c2342a27be41ef 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -1296,23 +1296,18 @@ static HashSet NodesWithCrossScopeLabels(RegexNode node) static bool NodesWithCrossScopeLabels(RegexNode node, HashSet results) { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + return StackHelper.CallOnEmptyStack(NodesWithCrossScopeLabels, node, results); + } + // Nodes that trigger backtracking and thus may emit labels that need to be reached by non-descendants. bool contains = node.InstigatesBacktracking; - if (!contains) + int childcount = node.ChildCount(); + for (int i = 0; i < childcount; i++) { - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - // Rather than forking to another thread, just say this has cross-scope labels. - // The effect is simply more faux scopes output. 
- return true; - } - - int childcount = node.ChildCount(); - for (int i = 0; i < childcount; i++) - { - contains |= NodesWithCrossScopeLabels(node.Child(i), results); - } + contains |= NodesWithCrossScopeLabels(node.Child(i), results); } if (contains) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index ef7b16700d0c0b..c23bb50720a05c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -1589,12 +1589,6 @@ static void ProcessNode(RegexNode node, RegexNode subsequent) /// The found node that should be explored further for auto-atomicity; null if it doesn't exist. private RegexNode? FindLastExpressionInLoopForAutoAtomic() { - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - // If we can't recur further, just stop optimizing. 
- return null; - } - RegexNode node = this; Debug.Assert(node.Type == Loop); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 48f610caf303ba..1f4a05afa47c12 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -1659,12 +1659,6 @@ private char ScanControl() throw MakeException(RegexParseError.UnrecognizedControlCharacter, SR.UnrecognizedControlCharacter); } - /// Returns true for options allowed only at the top level - private bool IsOnlyTopOption(RegexOptions options) => - options == RegexOptions.RightToLeft || - options == RegexOptions.CultureInvariant || - options == RegexOptions.ECMAScript; - /// Scans cimsx-cimsx option string, stops at the first unrecognized char. private void ScanOptions() { @@ -1683,7 +1677,7 @@ private void ScanOptions() else { RegexOptions options = OptionFromCode(ch); - if (options == 0 || IsOnlyTopOption(options)) + if (options == 0) { return; } @@ -1804,7 +1798,6 @@ private static RegexOptions OptionFromCode(char ch) return ch switch { 'i' => RegexOptions.IgnoreCase, - 'r' => RegexOptions.RightToLeft, 'm' => RegexOptions.Multiline, 'n' => RegexOptions.ExplicitCapture, 's' => RegexOptions.Singleline, @@ -1812,7 +1805,6 @@ private static RegexOptions OptionFromCode(char ch) #if DEBUG 'd' => RegexOptions.Debug, #endif - 'e' => RegexOptions.ECMAScript, _ => 0, }; } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index ee71a6df32c946..6dd56fc109e9b3 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ 
-182,7 +182,7 @@ public static IEnumerable Match_MemberData() // Nested loops if (!RegexHelpers.IsNonBacktracking(engine)) { - yield return new object[] { engine, "a*(?:a[ab]*)*", "aaaababbbbbbabababababaaabbb", RegexOptions.None, 0, 28, true, "aaaa" }; + yield return ("a*(?:a[ab]*)*", "aaaababbbbbbabababababaaabbb", RegexOptions.None, 0, 28, true, "aaaa"); } // Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z" From e8bb072123f1727f4038d188f0a925195049ff96 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 21 Oct 2021 17:13:59 -0400 Subject: [PATCH 3/3] Clean up partial classes in SourceGenRegexAsync test helper --- .../tests/RegexGeneratorHelper.netcoreapp.cs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs index 3b6bfcc0afe8f3..47ece73defcc76 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs @@ -61,33 +61,29 @@ internal static async Task SourceGenRegexAsync( var code = new StringBuilder(); code.AppendLine("using System.Text.RegularExpressions;"); + code.AppendLine("public partial class C {"); // Build up the code for all of the regexes int count = 0; foreach (var regex in regexes) { Assert.True(regex.options is not null || regex.matchTimeout is null); - string attr = $"[RegexGenerator({SymbolDisplay.FormatLiteral(regex.pattern, quote: true)}"; + code.Append($" [RegexGenerator({SymbolDisplay.FormatLiteral(regex.pattern, quote: true)}"); if (regex.options is not null) { - attr += $", {string.Join(" | ", regex.options.ToString().Split(',').Select(o => $"RegexOptions.{o.Trim()}"))}"; + code.Append($", {string.Join(" | ", regex.options.ToString().Split(',').Select(o => 
$"RegexOptions.{o.Trim()}"))}"); if (regex.matchTimeout is not null) { - attr += string.Create(CultureInfo.InvariantCulture, $", {(int)regex.matchTimeout.Value.TotalMilliseconds}"); + code.Append(string.Create(CultureInfo.InvariantCulture, $", {(int)regex.matchTimeout.Value.TotalMilliseconds}")); } } - attr += ")]"; - - // Create the source boilerplate for the pattern - code.AppendLine($@"public partial class C - {{ - {attr} - public static partial Regex Get{count}(); - }}"); + code.AppendLine($")] public static partial Regex Get{count}();"); count++; } + code.AppendLine("}"); + // Use a cached compilation to save a little time. Rather than creating an entirely new workspace // for each test, just create a single compilation, cache it, and then replace its syntax tree // on each test.