From 384cc6c4326b22ab031b59e69c024e69657371ef Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Tue, 19 Jun 2018 18:33:34 +0200 Subject: [PATCH 01/17] RegexParser & optionsstack ref --- .../Collections/Generic/ValueListBuilder.cs | 11 +- .../System/Text/RegularExpressions/Regex.cs | 10 +- .../Text/RegularExpressions/RegexParser.cs | 330 +++++++++--------- .../RegularExpressions/RegexReplacement.cs | 2 +- 4 files changed, 174 insertions(+), 179 deletions(-) diff --git a/src/Common/src/CoreLib/System/Collections/Generic/ValueListBuilder.cs b/src/Common/src/CoreLib/System/Collections/Generic/ValueListBuilder.cs index 72da4a9e197f..aea6052f030a 100644 --- a/src/Common/src/CoreLib/System/Collections/Generic/ValueListBuilder.cs +++ b/src/Common/src/CoreLib/System/Collections/Generic/ValueListBuilder.cs @@ -21,7 +21,16 @@ public ValueListBuilder(Span initialSpan) _pos = 0; } - public int Length => _pos; + public int Length + { + get => _pos; + set + { + Debug.Assert(value >= 0); + Debug.Assert(value <= _span.Length); + _pos = value; + } + } public ref T this[int index] { diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 77158bf5ca1a..4e392d8ab235 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -112,16 +112,16 @@ private Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, bool internalMatchTimeout = matchTimeout; // Cache handling. Try to look up this regex in the cache. - string cultureKey = (options & RegexOptions.CultureInvariant) != 0 ? - CultureInfo.InvariantCulture.ToString() : - CultureInfo.CurrentCulture.ToString(); - var key = new CachedCodeEntryKey(options, cultureKey, pattern); + CultureInfo culture = (options & RegexOptions.CultureInvariant) != 0 ? 
+ CultureInfo.InvariantCulture : + CultureInfo.CurrentCulture; + var key = new CachedCodeEntryKey(options, culture.ToString(), pattern); CachedCodeEntry cached = GetCachedCode(key, false); if (cached == null) { // Parse the input - RegexTree tree = RegexParser.Parse(pattern, roptions); + RegexTree tree = RegexParser.Parse(pattern, roptions, culture); // Extract the relevant information capnames = tree.CapNames; diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 547660f06419..43a6e96998d5 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -12,15 +12,15 @@ using System.Collections; using System.Collections.Generic; +using System.Diagnostics; using System.Globalization; -using System.IO; -using System.Text; namespace System.Text.RegularExpressions { - internal sealed class RegexParser + internal ref struct RegexParser { private const int EscapeMaxBufferSize = 256; + private const int OptionStackDefaultSize = 32; private const int MaxValueDiv10 = int.MaxValue / 10; private const int MaxValueMod10 = int.MaxValue % 10; @@ -45,63 +45,77 @@ internal sealed class RegexParser private int[] _capnumlist; private List _capnamelist; - private RegexOptions _options; - private List _optionsStack; + private RegexOptions _option; + private ValueListBuilder _optionsStack; - private bool _ignoreNextParen = false; + private bool _ignoreNextParen; - /* - * This static call constructs a RegexTree from a regular expression - * pattern string and an option string. - * - * The method creates, drives, and drops a parser instance. 
- */ - public static RegexTree Parse(string re, RegexOptions op) + private RegexParser(string pattern, RegexOptions option, CultureInfo culture, Hashtable caps, int capsize, Hashtable capnames, Span optionSpan) { - RegexParser p; - RegexNode root; - string[] capnamelist; - - p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture); + Debug.Assert(pattern != null, "Pattern must be set"); + Debug.Assert(culture != null, "Culture must be set"); - p._options = op; - - p.SetPattern(re); - p.CountCaptures(); - p.Reset(op); - root = p.ScanRegex(); + _pattern = pattern; + _option = option; + _culture = culture; + _caps = caps; + _capsize = capsize; + _capnames = capnames; - if (p._capnamelist == null) - capnamelist = null; - else - capnamelist = p._capnamelist.ToArray(); + _optionsStack = new ValueListBuilder(optionSpan); + _stack = default; + _group = default; + _alternation = default; + _concatenation = default; + _unit = default; + _currentPos = 0; + _autocap = default; + _capcount = default; + _captop = default; + _capnumlist = default; + _capnamelist = default; + _ignoreNextParen = false; + } - return new RegexTree(root, p._caps, p._capnumlist, p._captop, p._capnames, capnamelist, op); + private RegexParser(string pattern, RegexOptions option, CultureInfo culture, Span optionSpan) + : this(pattern, option, culture, new Hashtable(), default, null, optionSpan) + { } - /* - * This static call constructs a flat concatenation node given - * a replacement pattern. 
- */ - public static RegexReplacement ParseReplacement(string rep, Hashtable caps, int capsize, Hashtable capnames, RegexOptions op) + public static RegexTree Parse(string pattern, RegexOptions option, CultureInfo culture) { - RegexParser p; - RegexNode root; + Span optionSpan = stackalloc RegexOptions[OptionStackDefaultSize]; + var parser = new RegexParser(pattern, option, culture, optionSpan); + + parser.CountCaptures(); + parser.Reset(option); + RegexNode root = parser.ScanRegex(); + string[] capnamelist = parser._capnamelist?.ToArray(); + var tree = new RegexTree(root, parser._caps, parser._capnumlist, parser._captop, parser._capnames, capnamelist, option); + parser.Dispose(); - p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture); + return tree; + } - p._options = op; + /// + /// This static call constructs a flat concatenation node given a replacement pattern. + /// + public static RegexReplacement ParseReplacement(string pattern, RegexOptions option, Hashtable caps, int capsize, Hashtable capnames) + { + CultureInfo culture = (option & RegexOptions.CultureInvariant) != 0 ? 
CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; + Span optionSpan = stackalloc RegexOptions[OptionStackDefaultSize]; + var parser = new RegexParser(pattern, option, culture, caps, capsize, capnames, optionSpan); - p.NoteCaptures(caps, capsize, capnames); - p.SetPattern(rep); - root = p.ScanReplacement(); + RegexNode root = parser.ScanReplacement(); + var regexReplacement = new RegexReplacement(pattern, root, caps); + parser.Dispose(); - return new RegexReplacement(rep, root, caps); + return regexReplacement; } - /* - * Escapes all metacharacters (including |,(,),[,{,|,^,$,*,+,?,\, spaces and #) - */ + /// + /// Escapes all metacharacters (including |,(,),[,{,|,^,$,*,+,?,\, spaces and #) + /// public static string Escape(string input) { for (int i = 0; i < input.Length; i++) @@ -128,9 +142,8 @@ private static string EscapeImpl(string input, int i) new ValueStringBuilder(input.Length + 200); char ch = input[i]; - int lastpos; - vsb.Append(input.AsSpan(0, i)); + do { vsb.Append('\\'); @@ -151,7 +164,7 @@ private static string EscapeImpl(string input, int i) } vsb.Append(ch); i++; - lastpos = i; + int lastpos = i; while (i < input.Length) { @@ -168,9 +181,9 @@ private static string EscapeImpl(string input, int i) return vsb.ToString(); } - /* - * Unescapes all metacharacters (including (,),[,],{,},|,^,$,*,+,?,\, spaces and #) - */ + /// + /// Unescapes all metacharacters (including (,),[,],{,},|,^,$,*,+,?,\, spaces and #) + /// public static string Unescape(string input) { for (int i = 0; i < input.Length; i++) @@ -186,9 +199,8 @@ public static string Unescape(string input) private static string UnescapeImpl(string input, int i) { - RegexParser p = new RegexParser(CultureInfo.InvariantCulture); - int lastpos; - p.SetPattern(input); + Span optionSpan = stackalloc RegexOptions[OptionStackDefaultSize]; + var parser = new RegexParser(input, RegexOptions.None, CultureInfo.InvariantCulture, optionSpan); // In the worst case the escaped string has the same length. 
// For small inputs we use stack allocation. @@ -201,65 +213,49 @@ private static string UnescapeImpl(string input, int i) do { i++; - p.Textto(i); + parser.Textto(i); if (i < input.Length) - vsb.Append(p.ScanCharEscape()); - i = p.Textpos(); - lastpos = i; + vsb.Append(parser.ScanCharEscape()); + i = parser.Textpos(); + int lastpos = i; while (i < input.Length && input[i] != '\\') i++; vsb.Append(input.AsSpan(lastpos, i - lastpos)); } while (i < input.Length); - return vsb.ToString(); - } + parser.Dispose(); - /* - * Private constructor. - */ - private RegexParser(CultureInfo culture) - { - _culture = culture; - _optionsStack = new List(); - _caps = new Hashtable(); - } - - /* - * Drops a string into the pattern buffer. - */ - private void SetPattern(string Re) - { - if (Re == null) - Re = string.Empty; - _pattern = Re; - _currentPos = 0; + return vsb.ToString(); } - - /* - * Resets parsing to the beginning of the pattern. - */ - private void Reset(RegexOptions topopts) + + /// + /// Resets parsing to the beginning of the pattern. + /// + private void Reset(RegexOptions option) { _currentPos = 0; _autocap = 1; _ignoreNextParen = false; - - if (_optionsStack.Count > 0) - _optionsStack.RemoveRange(0, _optionsStack.Count - 1); - - _options = topopts; + _optionsStack.Length = 0; + _option = option; _stack = null; } + public void Dispose() + { + _optionsStack.Dispose(); + } + /* * The main parsing function. 
*/ + private RegexNode ScanRegex() { char ch = '@'; // nonspecial ch, means at beginning bool isQuantifier = false; - StartGroup(new RegexNode(RegexNode.Capture, _options, 0, -1)); + StartGroup(new RegexNode(RegexNode.Capture, _option, 0, -1)); while (CharsRight() > 0) { @@ -489,18 +485,15 @@ private RegexNode ScanRegex() */ private RegexNode ScanReplacement() { - int c; - int startpos; - - _concatenation = new RegexNode(RegexNode.Concatenate, _options); + _concatenation = new RegexNode(RegexNode.Concatenate, _option); for (; ;) { - c = CharsRight(); + int c = CharsRight(); if (c == 0) break; - startpos = Textpos(); + int startpos = Textpos(); while (c > 0 && RightChar() != '$') { @@ -622,18 +615,14 @@ private RegexCharClass ScanCharClass(bool caseInsensitive, bool scanOnly) // It currently doesn't do anything other than skip the whole thing! if (CharsRight() > 0 && RightChar() == ':' && !inRange) { - string name; int savePos = Textpos(); MoveRight(); - name = ScanCapname(); if (CharsRight() < 2 || RightCharMoveRight() != ':' || RightCharMoveRight() != ']') Textto(savePos); - // else lookup name (nyi) } } - if (inRange) { inRange = false; @@ -721,10 +710,10 @@ private RegexNode ScanGroupOpen() if (UseOptionN() || _ignoreNextParen) { _ignoreNextParen = false; - return new RegexNode(RegexNode.Group, _options); + return new RegexNode(RegexNode.Group, _option); } else - return new RegexNode(RegexNode.Capture, _options, _autocap++, -1); + return new RegexNode(RegexNode.Capture, _option, _autocap++, -1); } MoveRight(); @@ -743,13 +732,13 @@ private RegexNode ScanGroupOpen() case '=': // lookahead assertion - _options &= ~(RegexOptions.RightToLeft); + _option &= ~(RegexOptions.RightToLeft); NodeType = RegexNode.Require; break; case '!': // negative lookahead assertion - _options &= ~(RegexOptions.RightToLeft); + _option &= ~(RegexOptions.RightToLeft); NodeType = RegexNode.Prevent; break; @@ -774,7 +763,7 @@ private RegexNode ScanGroupOpen() goto BreakRecognize; // 
lookbehind assertion - _options |= RegexOptions.RightToLeft; + _option |= RegexOptions.RightToLeft; NodeType = RegexNode.Require; break; @@ -783,7 +772,7 @@ private RegexNode ScanGroupOpen() goto BreakRecognize; // negative lookbehind assertion - _options |= RegexOptions.RightToLeft; + _option |= RegexOptions.RightToLeft; NodeType = RegexNode.Prevent; break; @@ -871,7 +860,7 @@ private RegexNode ScanGroupOpen() if ((capnum != -1 || uncapnum != -1) && CharsRight() > 0 && RightCharMoveRight() == close) { - return new RegexNode(RegexNode.Capture, _options, capnum, uncapnum); + return new RegexNode(RegexNode.Capture, _option, capnum, uncapnum); } goto BreakRecognize; } @@ -892,7 +881,7 @@ private RegexNode ScanGroupOpen() if (CharsRight() > 0 && RightCharMoveRight() == ')') { if (IsCaptureSlot(capnum)) - return new RegexNode(RegexNode.Testref, _options, capnum); + return new RegexNode(RegexNode.Testref, _option, capnum); else throw MakeException(RegexParseError.UndefinedReference, SR.Format(SR.UndefinedReference, capnum.ToString(CultureInfo.CurrentCulture))); } @@ -904,7 +893,7 @@ private RegexNode ScanGroupOpen() string capname = ScanCapname(); if (IsCaptureName(capname) && CharsRight() > 0 && RightCharMoveRight() == ')') - return new RegexNode(RegexNode.Testref, _options, CaptureSlotFromName(capname)); + return new RegexNode(RegexNode.Testref, _option, CaptureSlotFromName(capname)); } } // not a backref @@ -951,7 +940,7 @@ private RegexNode ScanGroupOpen() break; } - return new RegexNode(NodeType, _options); + return new RegexNode(NodeType, _option); } BreakRecognize: @@ -1017,12 +1006,10 @@ private void ScanBlank() */ private RegexNode ScanBackslash(bool scanOnly) { - char ch; - RegexCharClass cc; - if (CharsRight() == 0) throw MakeException(RegexParseError.IllegalEndEscape, SR.IllegalEndEscape); + char ch; switch (ch = RightChar()) { case 'b': @@ -1034,67 +1021,67 @@ private RegexNode ScanBackslash(bool scanOnly) MoveRight(); if (scanOnly) return null; - return new 
RegexNode(TypeFromCode(ch), _options); + return new RegexNode(TypeFromCode(ch), _option); case 'w': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMAWordClass); - return new RegexNode(RegexNode.Set, _options, RegexCharClass.WordClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.ECMAWordClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.WordClass); case 'W': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMAWordClass); - return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotWordClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotECMAWordClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotWordClass); case 's': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMASpaceClass); - return new RegexNode(RegexNode.Set, _options, RegexCharClass.SpaceClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.ECMASpaceClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.SpaceClass); case 'S': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMASpaceClass); - return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotSpaceClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotECMASpaceClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotSpaceClass); case 'd': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMADigitClass); - return new RegexNode(RegexNode.Set, _options, RegexCharClass.DigitClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.ECMADigitClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.DigitClass); 
case 'D': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMADigitClass); - return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotDigitClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotECMADigitClass); + return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotDigitClass); case 'p': case 'P': MoveRight(); if (scanOnly) return null; - cc = new RegexCharClass(); + var cc = new RegexCharClass(); cc.AddCategoryFromName(ParseProperty(), (ch != 'p'), UseOptionI(), _pattern, _currentPos); if (UseOptionI()) cc.AddLowercase(_culture); - return new RegexNode(RegexNode.Set, _options, cc.ToStringClass()); + return new RegexNode(RegexNode.Set, _option, cc.ToStringClass()); default: return ScanBasicBackslash(scanOnly); @@ -1161,7 +1148,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) if (scanOnly) return null; if (IsCaptureSlot(capnum)) - return new RegexNode(RegexNode.Ref, _options, capnum); + return new RegexNode(RegexNode.Ref, _option, capnum); else throw MakeException(RegexParseError.UndefinedBackref, SR.Format(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture))); } @@ -1186,7 +1173,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) newcapnum = newcapnum * 10 + (int)(ch - '0'); } if (capnum >= 0) - return scanOnly ? null : new RegexNode(RegexNode.Ref, _options, capnum); + return scanOnly ? 
null : new RegexNode(RegexNode.Ref, _option, capnum); } else { @@ -1194,7 +1181,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) if (scanOnly) return null; if (IsCaptureSlot(capnum)) - return new RegexNode(RegexNode.Ref, _options, capnum); + return new RegexNode(RegexNode.Ref, _option, capnum); else if (capnum <= 9) throw MakeException(RegexParseError.UndefinedBackref, SR.Format(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture))); } @@ -1212,7 +1199,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) if (scanOnly) return null; if (IsCaptureName(capname)) - return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname)); + return new RegexNode(RegexNode.Ref, _option, CaptureSlotFromName(capname)); else throw MakeException(RegexParseError.UndefinedNameRef, SR.Format(SR.UndefinedNameRef, capname)); } @@ -1226,7 +1213,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) if (UseOptionI()) ch = _culture.TextInfo.ToLower(ch); - return scanOnly ? null : new RegexNode(RegexNode.One, _options, ch); + return scanOnly ? 
null : new RegexNode(RegexNode.One, _option, ch); } /* @@ -1235,7 +1222,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) private RegexNode ScanDollar() { if (CharsRight() == 0) - return new RegexNode(RegexNode.One, _options, '$'); + return new RegexNode(RegexNode.One, _option, '$'); char ch = RightChar(); bool angled; @@ -1287,7 +1274,7 @@ private RegexNode ScanDollar() } Textto(lastEndPos); if (capnum >= 0) - return new RegexNode(RegexNode.Ref, _options, capnum); + return new RegexNode(RegexNode.Ref, _option, capnum); } else { @@ -1295,7 +1282,7 @@ private RegexNode ScanDollar() if (!angled || CharsRight() > 0 && RightCharMoveRight() == '}') { if (IsCaptureSlot(capnum)) - return new RegexNode(RegexNode.Ref, _options, capnum); + return new RegexNode(RegexNode.Ref, _option, capnum); } } } @@ -1306,7 +1293,7 @@ private RegexNode ScanDollar() if (CharsRight() > 0 && RightCharMoveRight() == '}') { if (IsCaptureName(capname)) - return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname)); + return new RegexNode(RegexNode.Ref, _option, CaptureSlotFromName(capname)); } } else if (!angled) @@ -1317,7 +1304,7 @@ private RegexNode ScanDollar() { case '$': MoveRight(); - return new RegexNode(RegexNode.One, _options, '$'); + return new RegexNode(RegexNode.One, _option, '$'); case '&': capnum = 0; @@ -1343,14 +1330,14 @@ private RegexNode ScanDollar() if (capnum != 1) { MoveRight(); - return new RegexNode(RegexNode.Ref, _options, capnum); + return new RegexNode(RegexNode.Ref, _option, capnum); } } // unrecognized $: literalize Textto(backpos); - return new RegexNode(RegexNode.One, _options, '$'); + return new RegexNode(RegexNode.One, _option, '$'); } /* @@ -1533,9 +1520,9 @@ private void ScanOptions() return; if (off) - _options &= ~option; + _option &= ~option; else - _options |= option; + _option |= option; } } } @@ -1545,9 +1532,7 @@ private void ScanOptions() */ private char ScanCharEscape() { - char ch; - - ch = RightCharMoveRight(); + char ch = 
RightCharMoveRight(); if (ch >= '0' && ch <= '7') { @@ -1683,8 +1668,6 @@ private static RegexOptions OptionFromCode(char ch) */ private void CountCaptures() { - char ch; - NoteCaptureSlot(0, 0); _autocap = 1; @@ -1692,7 +1675,7 @@ private void CountCaptures() while (CharsRight() > 0) { int pos = Textpos(); - ch = RightCharMoveRight(); + char ch = RightCharMoveRight(); switch (ch) { case '\\': @@ -1804,10 +1787,7 @@ private void NoteCaptureSlot(int i, int pos) if (_captop <= i) { - if (i == int.MaxValue) - _captop = i; - else - _captop = i + 1; + _captop = i == int.MaxValue ? i : i + 1; } } } @@ -1953,7 +1933,7 @@ private bool IsCaptureName(string capname) */ private bool UseOptionN() { - return (_options & RegexOptions.ExplicitCapture) != 0; + return (_option & RegexOptions.ExplicitCapture) != 0; } /* @@ -1961,7 +1941,7 @@ private bool UseOptionN() */ private bool UseOptionI() { - return (_options & RegexOptions.IgnoreCase) != 0; + return (_option & RegexOptions.IgnoreCase) != 0; } /* @@ -1969,7 +1949,7 @@ private bool UseOptionI() */ private bool UseOptionM() { - return (_options & RegexOptions.Multiline) != 0; + return (_option & RegexOptions.Multiline) != 0; } /* @@ -1977,7 +1957,7 @@ private bool UseOptionM() */ private bool UseOptionS() { - return (_options & RegexOptions.Singleline) != 0; + return (_option & RegexOptions.Singleline) != 0; } /* @@ -1985,7 +1965,7 @@ private bool UseOptionS() */ private bool UseOptionX() { - return (_options & RegexOptions.IgnorePatternWhitespace) != 0; + return (_option & RegexOptions.IgnorePatternWhitespace) != 0; } /* @@ -1993,7 +1973,7 @@ private bool UseOptionX() */ private bool UseOptionE() { - return (_options & RegexOptions.ECMAScript) != 0; + return (_option & RegexOptions.ECMAScript) != 0; } private const byte Q = 5; // quantifier @@ -2044,19 +2024,26 @@ private bool IsTrueQuantifier() int nChars = CharsRight(); if (nChars == 0) return false; + int startpos = Textpos(); char ch = CharAt(startpos); if (ch != '{') 
return ch <= '{' && s_category[ch] >= Q; + int pos = startpos; while (--nChars > 0 && (ch = CharAt(++pos)) >= '0' && ch <= '9') ; + if (nChars == 0 || pos - startpos == 1) return false; + if (ch == '}') return true; + if (ch != ',') return false; + while (--nChars > 0 && (ch = CharAt(++pos)) >= '0' && ch <= '9') ; + return nChars > 0 && ch == '}'; } @@ -2110,7 +2097,7 @@ private void AddConcatenate(int pos, int cch, bool isReplacement) str = _pattern.Substring(pos, cch); } - node = new RegexNode(RegexNode.Multi, _options, str); + node = new RegexNode(RegexNode.Multi, _option, str); } else { @@ -2119,7 +2106,7 @@ private void AddConcatenate(int pos, int cch, bool isReplacement) if (UseOptionI() && !isReplacement) ch = _culture.TextInfo.ToLower(ch); - node = new RegexNode(RegexNode.One, _options, ch); + node = new RegexNode(RegexNode.One, _option, ch); } _concatenation.AddChild(node); @@ -2171,8 +2158,8 @@ private bool EmptyStack() private void StartGroup(RegexNode openGroup) { _group = openGroup; - _alternation = new RegexNode(RegexNode.Alternate, _options); - _concatenation = new RegexNode(RegexNode.Concatenate, _options); + _alternation = new RegexNode(RegexNode.Alternate, _option); + _concatenation = new RegexNode(RegexNode.Concatenate, _option); } /* @@ -2191,7 +2178,7 @@ private void AddAlternate() _alternation.AddChild(_concatenation.ReverseLeft()); } - _concatenation = new RegexNode(RegexNode.Concatenate, _options); + _concatenation = new RegexNode(RegexNode.Concatenate, _option); } /* @@ -2230,7 +2217,7 @@ private void AddUnitOne(char ch) if (UseOptionI()) ch = _culture.TextInfo.ToLower(ch); - _unit = new RegexNode(RegexNode.One, _options, ch); + _unit = new RegexNode(RegexNode.One, _option, ch); } /* @@ -2241,7 +2228,7 @@ private void AddUnitNotone(char ch) if (UseOptionI()) ch = _culture.TextInfo.ToLower(ch); - _unit = new RegexNode(RegexNode.Notone, _options, ch); + _unit = new RegexNode(RegexNode.Notone, _option, ch); } /* @@ -2249,7 +2236,7 @@ private 
void AddUnitNotone(char ch) */ private void AddUnitSet(string cc) { - _unit = new RegexNode(RegexNode.Set, _options, cc); + _unit = new RegexNode(RegexNode.Set, _option, cc); } /* @@ -2265,7 +2252,7 @@ private void AddUnitNode(RegexNode node) */ private void AddUnitType(int type) { - _unit = new RegexNode(type, _options); + _unit = new RegexNode(type, _option); } /* @@ -2294,7 +2281,7 @@ private void AddGroup() */ private void PushOptions() { - _optionsStack.Add(_options); + _optionsStack.Append(_option); } /* @@ -2302,8 +2289,7 @@ private void PushOptions() */ private void PopOptions() { - _options = _optionsStack[_optionsStack.Count - 1]; - _optionsStack.RemoveAt(_optionsStack.Count - 1); + _option = _optionsStack.Pop(); } /* @@ -2311,7 +2297,7 @@ private void PopOptions() */ private bool EmptyOptionsStack() { - return (_optionsStack.Count == 0); + return _optionsStack.Length == 0; } /* @@ -2319,7 +2305,7 @@ private bool EmptyOptionsStack() */ private void PopKeepOptions() { - _optionsStack.RemoveAt(_optionsStack.Count - 1); + _optionsStack.Length--; } /* diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs index 5200341df9b8..1c6f126a6390 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs @@ -94,7 +94,7 @@ public static RegexReplacement GetOrCreate(WeakReference replR if (!replRef.TryGetTarget(out repl) || !repl.Pattern.Equals(replacement)) { - repl = RegexParser.ParseReplacement(replacement, caps, capsize, capnames, roptions); + repl = RegexParser.ParseReplacement(replacement, roptions, caps, capsize, capnames); replRef.SetTarget(repl); } From fbfcda6ed4d5c9fd97adedd12cf0fa50bbc20a90 Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 13:41:51 +0200 Subject: 
[PATCH 02/17] Add test coverage for Group.Synchronized --- .../tests/Regex.Groups.Tests.cs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs index c1ade0a8c3cb..faab74c3eba5 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs @@ -759,5 +759,22 @@ public void GroupsBasic() return SuccessExitCode; }).Dispose(); } + + [Fact] + public void Synchronized_NullGroup_Throws() + { + AssertExtensions.Throws("inner", () => Group.Synchronized(null)); + } + + [Theory] + [InlineData(@"(cat)([\v]*)(dog)", "cat\v\v\vdog")] + [InlineData("abc", "def")] // no match + public void Synchronized_ValidGroup_Success(string pattern, string input) + { + Match match = Regex.Match(input, pattern); + + Group synchronizedGroup = Group.Synchronized(match.Groups[0]); + Assert.NotNull(synchronizedGroup); + } } } From a52d8f7d9f5d91834aea23556ac53c48cafedff8 Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 15:08:07 +0200 Subject: [PATCH 03/17] Adjust options mode test case --- src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index edad94f24f6b..4b2c247b2e5b 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -112,7 +112,7 @@ public static IEnumerable Match_Basic_TestData() yield return new object[] { "aaa(?-i:match this)bbb", "AaAmatch thisBBb", RegexOptions.IgnoreCase, 0, 16, true, "AaAmatch thisBBb" }; // Turning on/off all the options at once : Actual - "aaa(?imnsx-imnsx:match this)bbb", "i" - yield return new object[] { "aaa(?-i:match this)bbb", 
"AaAmatcH thisBBb", RegexOptions.IgnoreCase, 0, 16, false, string.Empty }; + yield return new object[] { "aaa(?imnsx-imnsx:match this)bbb", "AaAmatcH thisBBb", RegexOptions.IgnoreCase, 0, 16, false, string.Empty }; // Actual - "aaa(?#ignore this completely)bbb" yield return new object[] { "aaa(?#ignore this completely)bbb", "aaabbb", RegexOptions.None, 0, 6, true, "aaabbb" }; From 948217e93054ba186937d94698de1d7ba1c8730f Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 15:24:31 +0200 Subject: [PATCH 04/17] Add inline comment '#' test branch --- src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 4b2c247b2e5b..09b8a2dc291f 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -97,7 +97,7 @@ public static IEnumerable Match_Basic_TestData() yield return new object[] { "(abbc)(?(1)111|222)", "abbc222", RegexOptions.None, 0, 7, false, string.Empty }; // "x" option. Removes unescaped whitespace from the pattern: Actual - " ([^/]+) ","x" - yield return new object[] { " ((.)+) ", "abc", RegexOptions.IgnorePatternWhitespace, 0, 3, true, "abc" }; + yield return new object[] { " ((.)+) #comment ", "abc", RegexOptions.IgnorePatternWhitespace, 0, 3, true, "abc" }; // "x" option. Removes unescaped whitespace from the pattern. 
: Actual - "\x20([^/]+)\x20","x" yield return new object[] { "\x20([^/]+)\x20\x20\x20\x20\x20\x20\x20", " abc ", RegexOptions.IgnorePatternWhitespace, 0, 10, true, " abc " }; From 1765741dd6e8a4de735a9f8d43727f836e4248ce Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 15:34:16 +0200 Subject: [PATCH 05/17] Add comments --- .../Text/RegularExpressions/RegexParser.cs | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 43a6e96998d5..a6598370fb2d 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -48,7 +48,7 @@ internal ref struct RegexParser private RegexOptions _option; private ValueListBuilder _optionsStack; - private bool _ignoreNextParen; + private bool _ignoreNextParen; // flag to skip capturing a parentheses group private RegexParser(string pattern, RegexOptions option, CultureInfo culture, Hashtable caps, int capsize, Hashtable capnames, Span optionSpan) { @@ -269,10 +269,10 @@ private RegexNode ScanRegex() // move past all of the normal characters. We'll stop when we hit some kind of control character, // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace. 
if (UseOptionX()) - while (CharsRight() > 0 && (!IsStopperX(ch = RightChar()) || ch == '{' && !IsTrueQuantifier())) + while (CharsRight() > 0 && (!IsStopperX(ch = RightChar()) || (ch == '{' && !IsTrueQuantifier()))) MoveRight(); else - while (CharsRight() > 0 && (!IsSpecial(ch = RightChar()) || ch == '{' && !IsTrueQuantifier())) + while (CharsRight() > 0 && (!IsSpecial(ch = RightChar()) || (ch == '{' && !IsTrueQuantifier()))) MoveRight(); int endpos = Textpos(); @@ -991,6 +991,7 @@ private void ScanBlank() RightChar(1) != '?' || RightChar() != '(') return; + // skip comment (?# ...) while (CharsRight() > 0 && RightChar() != ')') MoveRight(); if (CharsRight() == 0) @@ -1497,13 +1498,9 @@ private bool IsOnlyTopOption(RegexOptions option) */ private void ScanOptions() { - char ch; - bool off; - RegexOptions option; - - for (off = false; CharsRight() > 0; MoveRight()) + for (bool off = false; CharsRight() > 0; MoveRight()) { - ch = RightChar(); + char ch = RightChar(); if (ch == '-') { @@ -1515,7 +1512,7 @@ private void ScanOptions() } else { - option = OptionFromCode(ch); + RegexOptions option = OptionFromCode(ch); if (option == 0 || IsOnlyTopOption(option)) return; @@ -1703,6 +1700,7 @@ private void CountCaptures() case '(': if (CharsRight() >= 2 && RightChar(1) == '#' && RightChar() == '?') { + // we have a comment (?# MoveLeft(); ScanBlank(); } @@ -1760,6 +1758,9 @@ private void CountCaptures() } else { + // Simple (unnamed) capture group. + // Add unnamed parentheses if ExplicitCapture is not set + // and the next parentheses is not ignored. if (!UseOptionN() && !_ignoreNextParen) NoteCaptureSlot(_autocap++, pos); } @@ -1881,7 +1882,7 @@ private void AssignNameSlots() for (int i = 0; i < _capcount; i++) { - int j = (_capnumlist == null) ? i : (int)_capnumlist[i]; + int j = (_capnumlist == null) ? 
i : _capnumlist[i]; if (next == j) { From 6cc6da62768de6014250c22569b141d3242854ba Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 15:35:05 +0200 Subject: [PATCH 06/17] Change runtime check to Assert in Parser --- .../src/System/Text/RegularExpressions/RegexParser.cs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index a6598370fb2d..a1920a882be5 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -2022,9 +2022,7 @@ private static bool IsQuantifier(char ch) private bool IsTrueQuantifier() { - int nChars = CharsRight(); - if (nChars == 0) - return false; + Debug.Assert(CharsRight() > 0, "IsTrueQuantifier requires characters to be read"); int startpos = Textpos(); char ch = CharAt(startpos); @@ -2032,6 +2030,7 @@ private bool IsTrueQuantifier() return ch <= '{' && s_category[ch] >= Q; int pos = startpos; + int nChars = CharsRight(); while (--nChars > 0 && (ch = CharAt(++pos)) >= '0' && ch <= '9') ; if (nChars == 0 || pos - startpos == 1) From ea7161b582d50696684037f4eca180fbe9758a80 Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 16:14:44 +0200 Subject: [PATCH 07/17] Replace manual ToLower calls by Span.ToLower --- .../Text/RegularExpressions/RegexBoyerMoore.cs | 10 +--------- .../System/Text/RegularExpressions/RegexParser.cs | 12 +----------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs index e44cb58986c3..38c28518e345 100644 --- 
a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs @@ -43,15 +43,7 @@ public RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, C if (caseInsensitive) { pattern = string.Create(pattern.Length, (pattern, culture), (span, state) => - { - // We do the ToLower character by character for consistency. With surrogate chars, doing - // a ToLower on the entire string could actually change the surrogate pair. This is more correct - // linguistically, but since Regex doesn't support surrogates, it's more important to be - // consistent. - TextInfo textInfo = state.culture.TextInfo; - for (int i = 0; i < state.pattern.Length; i++) - span[i] = textInfo.ToLower(state.pattern[i]); - }); + state.pattern.AsSpan().ToLower(span, state.culture)); } Pattern = pattern; diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index a1920a882be5..96fb8e6b2ce8 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -2080,17 +2080,7 @@ private void AddConcatenate(int pos, int cch, bool isReplacement) if (UseOptionI() && !isReplacement) { str = string.Create(cch, (_pattern, _culture, pos, cch), (span, state) => - { - ReadOnlySpan input = state._pattern.AsSpan(pos, cch); - - // We do the ToLower character by character for consistency. With surrogate chars, doing - // a ToLower on the entire string could actually change the surrogate pair. This is more correct - // linguistically, but since Regex doesn't support surrogates, it's more important to be - // consistent. 
- TextInfo textInfo = state._culture.TextInfo; - for (int i = 0; i < input.Length; i++) - span[i] = textInfo.ToLower(input[i]); - }); + state._pattern.AsSpan(pos, cch).ToLower(span, state._culture)); } else { From 1059f3ff39dc0640a43a03910947e7ac48c19307 Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 16:19:57 +0200 Subject: [PATCH 08/17] Make applicable fields readonly in parser --- .../src/System/Text/RegularExpressions/RegexParser.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 96fb8e6b2ce8..2a646a566bb1 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -30,9 +30,9 @@ internal ref struct RegexParser private RegexNode _concatenation; private RegexNode _unit; - private string _pattern; + private readonly string _pattern; private int _currentPos; - private CultureInfo _culture; + private readonly CultureInfo _culture; private int _autocap; private int _capcount; From c1fa82b43a155a9f9415b8616e191e186d576776 Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 16:37:14 +0200 Subject: [PATCH 09/17] Change to Assert to reduce an if check in one branch --- .../Text/RegularExpressions/RegexParser.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 2a646a566bb1..092a4dc0f136 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -349,6 +349,9 @@ private 
RegexNode ScanRegex() break; case '\\': + if (CharsRight() == 0) + throw MakeException(RegexParseError.IllegalEndEscape, SR.IllegalEndEscape); + AddUnitNode(ScanBackslash(scanOnly: false)); break; @@ -696,11 +699,6 @@ private RegexCharClass ScanCharClass(bool caseInsensitive, bool scanOnly) */ private RegexNode ScanGroupOpen() { - char ch = '\0'; - int NodeType; - char close = '>'; - - // just return a RegexNode if we have: // 1. "(" followed by nothing // 2. "(x" where x != ? @@ -723,6 +721,9 @@ private RegexNode ScanGroupOpen() if (CharsRight() == 0) break; + int NodeType; + char close = '>'; + char ch; switch (ch = RightCharMoveRight()) { case ':': @@ -1007,8 +1008,7 @@ private void ScanBlank() */ private RegexNode ScanBackslash(bool scanOnly) { - if (CharsRight() == 0) - throw MakeException(RegexParseError.IllegalEndEscape, SR.IllegalEndEscape); + Debug.Assert(CharsRight() > 0, "The current reading position must not be at the end of the pattern"); char ch; switch (ch = RightChar()) @@ -2022,7 +2022,7 @@ private static bool IsQuantifier(char ch) private bool IsTrueQuantifier() { - Debug.Assert(CharsRight() > 0, "IsTrueQuantifier requires characters to be read"); + Debug.Assert(CharsRight() > 0, "The current reading position must not be at the end of the pattern"); int startpos = Textpos(); char ch = CharAt(startpos); From 298c6039534da3108881edad93382a1999af3902 Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 16:50:17 +0200 Subject: [PATCH 10/17] Code formatting --- .../Text/RegularExpressions/RegexParser.cs | 47 +++++-------------- 1 file changed, 13 insertions(+), 34 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 092a4dc0f136..bc37a772f269 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ 
b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -1097,13 +1097,10 @@ private RegexNode ScanBasicBackslash(bool scanOnly) if (CharsRight() == 0) throw MakeException(RegexParseError.IllegalEndEscape, SR.IllegalEndEscape); - char ch; - bool angled = false; + int backpos = Textpos(); char close = '\0'; - int backpos; - - backpos = Textpos(); - ch = RightChar(); + bool angled = false; + char ch = RightChar(); // allow \k instead of \, which is now deprecated @@ -1366,13 +1363,10 @@ private string ScanCapname() */ private char ScanOctal() { + // Consume octal chars only up to 3 digits and value 0377 + int c = 3; int d; int i; - int c; - - // Consume octal chars only up to 3 digits and value 0377 - - c = 3; if (c > CharsRight()) c = CharsRight(); @@ -1420,11 +1414,9 @@ private int ScanDecimal() */ private char ScanHex(int c) { - int i; + int i = 0; int d; - i = 0; - if (CharsRight() >= c) { for (; c > 0 && ((d = HexDigit(RightCharMoveRight())) >= 0); c -= 1) @@ -1464,12 +1456,10 @@ private static int HexDigit(char ch) */ private char ScanControl() { - char ch; - - if (CharsRight() <= 0) + if (CharsRight() == 0) throw MakeException(RegexParseError.MissingControl, SR.MissingControl); - ch = RightCharMoveRight(); + char ch = RightCharMoveRight(); // \ca interpreted as \cA @@ -1487,10 +1477,9 @@ private char ScanControl() */ private bool IsOnlyTopOption(RegexOptions option) { - return (option == RegexOptions.RightToLeft - || option == RegexOptions.CultureInvariant - || option == RegexOptions.ECMAScript - ); + return option == RegexOptions.RightToLeft || + option == RegexOptions.CultureInvariant || + option == RegexOptions.ECMAScript; } /* @@ -1577,6 +1566,7 @@ private string ParseProperty() { throw MakeException(RegexParseError.IncompleteSlashP, SR.IncompleteSlashP); } + char ch = RightCharMoveRight(); if (ch != '{') { @@ -1811,16 +1801,6 @@ private void NoteCaptureName(string name, int pos) } } - /* - * For when all the used captures 
are known: note them all at once - */ - private void NoteCaptures(Hashtable caps, int capsize, Hashtable capnames) - { - _caps = caps; - _capsize = capsize; - _capnames = capnames; - } - /* * Assigns unused slot numbers to the capture names */ @@ -2069,11 +2049,10 @@ private static bool IsMetachar(char ch) */ private void AddConcatenate(int pos, int cch, bool isReplacement) { - RegexNode node; - if (cch == 0) return; + RegexNode node; if (cch > 1) { string str; From 67d8a2414f2db2e424e0cb340d091c4911077b7e Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 20:01:11 +0200 Subject: [PATCH 11/17] Revert RegexOptions rename in parser --- .../Text/RegularExpressions/RegexParser.cs | 158 +++++++++--------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index bc37a772f269..3864920ec59d 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -45,18 +45,18 @@ internal ref struct RegexParser private int[] _capnumlist; private List _capnamelist; - private RegexOptions _option; + private RegexOptions _options; private ValueListBuilder _optionsStack; private bool _ignoreNextParen; // flag to skip capturing a parentheses group - private RegexParser(string pattern, RegexOptions option, CultureInfo culture, Hashtable caps, int capsize, Hashtable capnames, Span optionSpan) + private RegexParser(string pattern, RegexOptions options, CultureInfo culture, Hashtable caps, int capsize, Hashtable capnames, Span optionSpan) { Debug.Assert(pattern != null, "Pattern must be set"); Debug.Assert(culture != null, "Culture must be set"); _pattern = pattern; - _option = option; + _options = options; _culture = culture; _caps = caps; _capsize = capsize; @@ -77,21 
+77,21 @@ private RegexParser(string pattern, RegexOptions option, CultureInfo culture, Ha _ignoreNextParen = false; } - private RegexParser(string pattern, RegexOptions option, CultureInfo culture, Span optionSpan) - : this(pattern, option, culture, new Hashtable(), default, null, optionSpan) + private RegexParser(string pattern, RegexOptions options, CultureInfo culture, Span optionSpan) + : this(pattern, options, culture, new Hashtable(), default, null, optionSpan) { } - public static RegexTree Parse(string pattern, RegexOptions option, CultureInfo culture) + public static RegexTree Parse(string pattern, RegexOptions options, CultureInfo culture) { Span optionSpan = stackalloc RegexOptions[OptionStackDefaultSize]; - var parser = new RegexParser(pattern, option, culture, optionSpan); + var parser = new RegexParser(pattern, options, culture, optionSpan); parser.CountCaptures(); - parser.Reset(option); + parser.Reset(options); RegexNode root = parser.ScanRegex(); string[] capnamelist = parser._capnamelist?.ToArray(); - var tree = new RegexTree(root, parser._caps, parser._capnumlist, parser._captop, parser._capnames, capnamelist, option); + var tree = new RegexTree(root, parser._caps, parser._capnumlist, parser._captop, parser._capnames, capnamelist, options); parser.Dispose(); return tree; @@ -100,11 +100,11 @@ public static RegexTree Parse(string pattern, RegexOptions option, CultureInfo c /// /// This static call constructs a flat concatenation node given a replacement pattern. /// - public static RegexReplacement ParseReplacement(string pattern, RegexOptions option, Hashtable caps, int capsize, Hashtable capnames) + public static RegexReplacement ParseReplacement(string pattern, RegexOptions options, Hashtable caps, int capsize, Hashtable capnames) { - CultureInfo culture = (option & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; + CultureInfo culture = (options & RegexOptions.CultureInvariant) != 0 ? 
CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; Span optionSpan = stackalloc RegexOptions[OptionStackDefaultSize]; - var parser = new RegexParser(pattern, option, culture, caps, capsize, capnames, optionSpan); + var parser = new RegexParser(pattern, options, culture, caps, capsize, capnames, optionSpan); RegexNode root = parser.ScanReplacement(); var regexReplacement = new RegexReplacement(pattern, root, caps); @@ -227,17 +227,17 @@ private static string UnescapeImpl(string input, int i) return vsb.ToString(); } - + /// /// Resets parsing to the beginning of the pattern. /// - private void Reset(RegexOptions option) + private void Reset(RegexOptions options) { _currentPos = 0; _autocap = 1; _ignoreNextParen = false; _optionsStack.Length = 0; - _option = option; + _options = options; _stack = null; } @@ -255,7 +255,7 @@ private RegexNode ScanRegex() char ch = '@'; // nonspecial ch, means at beginning bool isQuantifier = false; - StartGroup(new RegexNode(RegexNode.Capture, _option, 0, -1)); + StartGroup(new RegexNode(RegexNode.Capture, _options, 0, -1)); while (CharsRight() > 0) { @@ -488,7 +488,7 @@ private RegexNode ScanRegex() */ private RegexNode ScanReplacement() { - _concatenation = new RegexNode(RegexNode.Concatenate, _option); + _concatenation = new RegexNode(RegexNode.Concatenate, _options); for (; ;) { @@ -708,10 +708,10 @@ private RegexNode ScanGroupOpen() if (UseOptionN() || _ignoreNextParen) { _ignoreNextParen = false; - return new RegexNode(RegexNode.Group, _option); + return new RegexNode(RegexNode.Group, _options); } else - return new RegexNode(RegexNode.Capture, _option, _autocap++, -1); + return new RegexNode(RegexNode.Capture, _options, _autocap++, -1); } MoveRight(); @@ -733,13 +733,13 @@ private RegexNode ScanGroupOpen() case '=': // lookahead assertion - _option &= ~(RegexOptions.RightToLeft); + _options &= ~(RegexOptions.RightToLeft); NodeType = RegexNode.Require; break; case '!': // negative lookahead assertion - _option &= 
~(RegexOptions.RightToLeft); + _options &= ~(RegexOptions.RightToLeft); NodeType = RegexNode.Prevent; break; @@ -764,7 +764,7 @@ private RegexNode ScanGroupOpen() goto BreakRecognize; // lookbehind assertion - _option |= RegexOptions.RightToLeft; + _options |= RegexOptions.RightToLeft; NodeType = RegexNode.Require; break; @@ -773,7 +773,7 @@ private RegexNode ScanGroupOpen() goto BreakRecognize; // negative lookbehind assertion - _option |= RegexOptions.RightToLeft; + _options |= RegexOptions.RightToLeft; NodeType = RegexNode.Prevent; break; @@ -861,7 +861,7 @@ private RegexNode ScanGroupOpen() if ((capnum != -1 || uncapnum != -1) && CharsRight() > 0 && RightCharMoveRight() == close) { - return new RegexNode(RegexNode.Capture, _option, capnum, uncapnum); + return new RegexNode(RegexNode.Capture, _options, capnum, uncapnum); } goto BreakRecognize; } @@ -882,7 +882,7 @@ private RegexNode ScanGroupOpen() if (CharsRight() > 0 && RightCharMoveRight() == ')') { if (IsCaptureSlot(capnum)) - return new RegexNode(RegexNode.Testref, _option, capnum); + return new RegexNode(RegexNode.Testref, _options, capnum); else throw MakeException(RegexParseError.UndefinedReference, SR.Format(SR.UndefinedReference, capnum.ToString(CultureInfo.CurrentCulture))); } @@ -894,7 +894,7 @@ private RegexNode ScanGroupOpen() string capname = ScanCapname(); if (IsCaptureName(capname) && CharsRight() > 0 && RightCharMoveRight() == ')') - return new RegexNode(RegexNode.Testref, _option, CaptureSlotFromName(capname)); + return new RegexNode(RegexNode.Testref, _options, CaptureSlotFromName(capname)); } } // not a backref @@ -941,7 +941,7 @@ private RegexNode ScanGroupOpen() break; } - return new RegexNode(NodeType, _option); + return new RegexNode(NodeType, _options); } BreakRecognize: @@ -1022,55 +1022,55 @@ private RegexNode ScanBackslash(bool scanOnly) MoveRight(); if (scanOnly) return null; - return new RegexNode(TypeFromCode(ch), _option); + return new RegexNode(TypeFromCode(ch), _options); case 
'w': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _option, RegexCharClass.ECMAWordClass); - return new RegexNode(RegexNode.Set, _option, RegexCharClass.WordClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMAWordClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.WordClass); case 'W': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotECMAWordClass); - return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotWordClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMAWordClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotWordClass); case 's': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _option, RegexCharClass.ECMASpaceClass); - return new RegexNode(RegexNode.Set, _option, RegexCharClass.SpaceClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMASpaceClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.SpaceClass); case 'S': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotECMASpaceClass); - return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotSpaceClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMASpaceClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotSpaceClass); case 'd': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new RegexNode(RegexNode.Set, _option, RegexCharClass.ECMADigitClass); - return new RegexNode(RegexNode.Set, _option, RegexCharClass.DigitClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMADigitClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.DigitClass); case 'D': MoveRight(); if (scanOnly) return null; if (UseOptionE()) - return new 
RegexNode(RegexNode.Set, _option, RegexCharClass.NotECMADigitClass); - return new RegexNode(RegexNode.Set, _option, RegexCharClass.NotDigitClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMADigitClass); + return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotDigitClass); case 'p': case 'P': @@ -1082,7 +1082,7 @@ private RegexNode ScanBackslash(bool scanOnly) if (UseOptionI()) cc.AddLowercase(_culture); - return new RegexNode(RegexNode.Set, _option, cc.ToStringClass()); + return new RegexNode(RegexNode.Set, _options, cc.ToStringClass()); default: return ScanBasicBackslash(scanOnly); @@ -1146,7 +1146,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) if (scanOnly) return null; if (IsCaptureSlot(capnum)) - return new RegexNode(RegexNode.Ref, _option, capnum); + return new RegexNode(RegexNode.Ref, _options, capnum); else throw MakeException(RegexParseError.UndefinedBackref, SR.Format(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture))); } @@ -1171,7 +1171,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) newcapnum = newcapnum * 10 + (int)(ch - '0'); } if (capnum >= 0) - return scanOnly ? null : new RegexNode(RegexNode.Ref, _option, capnum); + return scanOnly ? 
null : new RegexNode(RegexNode.Ref, _options, capnum); } else { @@ -1179,7 +1179,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) if (scanOnly) return null; if (IsCaptureSlot(capnum)) - return new RegexNode(RegexNode.Ref, _option, capnum); + return new RegexNode(RegexNode.Ref, _options, capnum); else if (capnum <= 9) throw MakeException(RegexParseError.UndefinedBackref, SR.Format(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture))); } @@ -1197,7 +1197,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) if (scanOnly) return null; if (IsCaptureName(capname)) - return new RegexNode(RegexNode.Ref, _option, CaptureSlotFromName(capname)); + return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname)); else throw MakeException(RegexParseError.UndefinedNameRef, SR.Format(SR.UndefinedNameRef, capname)); } @@ -1211,7 +1211,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) if (UseOptionI()) ch = _culture.TextInfo.ToLower(ch); - return scanOnly ? null : new RegexNode(RegexNode.One, _option, ch); + return scanOnly ? 
null : new RegexNode(RegexNode.One, _options, ch); } /* @@ -1220,7 +1220,7 @@ private RegexNode ScanBasicBackslash(bool scanOnly) private RegexNode ScanDollar() { if (CharsRight() == 0) - return new RegexNode(RegexNode.One, _option, '$'); + return new RegexNode(RegexNode.One, _options, '$'); char ch = RightChar(); bool angled; @@ -1272,7 +1272,7 @@ private RegexNode ScanDollar() } Textto(lastEndPos); if (capnum >= 0) - return new RegexNode(RegexNode.Ref, _option, capnum); + return new RegexNode(RegexNode.Ref, _options, capnum); } else { @@ -1280,7 +1280,7 @@ private RegexNode ScanDollar() if (!angled || CharsRight() > 0 && RightCharMoveRight() == '}') { if (IsCaptureSlot(capnum)) - return new RegexNode(RegexNode.Ref, _option, capnum); + return new RegexNode(RegexNode.Ref, _options, capnum); } } } @@ -1291,7 +1291,7 @@ private RegexNode ScanDollar() if (CharsRight() > 0 && RightCharMoveRight() == '}') { if (IsCaptureName(capname)) - return new RegexNode(RegexNode.Ref, _option, CaptureSlotFromName(capname)); + return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname)); } } else if (!angled) @@ -1302,7 +1302,7 @@ private RegexNode ScanDollar() { case '$': MoveRight(); - return new RegexNode(RegexNode.One, _option, '$'); + return new RegexNode(RegexNode.One, _options, '$'); case '&': capnum = 0; @@ -1328,14 +1328,14 @@ private RegexNode ScanDollar() if (capnum != 1) { MoveRight(); - return new RegexNode(RegexNode.Ref, _option, capnum); + return new RegexNode(RegexNode.Ref, _options, capnum); } } // unrecognized $: literalize Textto(backpos); - return new RegexNode(RegexNode.One, _option, '$'); + return new RegexNode(RegexNode.One, _options, '$'); } /* @@ -1475,11 +1475,11 @@ private char ScanControl() /* * Returns true for options allowed only at the top level */ - private bool IsOnlyTopOption(RegexOptions option) + private bool IsOnlyTopOption(RegexOptions options) { - return option == RegexOptions.RightToLeft || - option == 
RegexOptions.CultureInvariant || - option == RegexOptions.ECMAScript; + return options == RegexOptions.RightToLeft || + options == RegexOptions.CultureInvariant || + options == RegexOptions.ECMAScript; } /* @@ -1501,14 +1501,14 @@ private void ScanOptions() } else { - RegexOptions option = OptionFromCode(ch); - if (option == 0 || IsOnlyTopOption(option)) + RegexOptions options = OptionFromCode(ch); + if (options == 0 || IsOnlyTopOption(options)) return; if (off) - _option &= ~option; + _options &= ~options; else - _option |= option; + _options |= options; } } } @@ -1914,7 +1914,7 @@ private bool IsCaptureName(string capname) */ private bool UseOptionN() { - return (_option & RegexOptions.ExplicitCapture) != 0; + return (_options & RegexOptions.ExplicitCapture) != 0; } /* @@ -1922,7 +1922,7 @@ private bool UseOptionN() */ private bool UseOptionI() { - return (_option & RegexOptions.IgnoreCase) != 0; + return (_options & RegexOptions.IgnoreCase) != 0; } /* @@ -1930,7 +1930,7 @@ private bool UseOptionI() */ private bool UseOptionM() { - return (_option & RegexOptions.Multiline) != 0; + return (_options & RegexOptions.Multiline) != 0; } /* @@ -1938,7 +1938,7 @@ private bool UseOptionM() */ private bool UseOptionS() { - return (_option & RegexOptions.Singleline) != 0; + return (_options & RegexOptions.Singleline) != 0; } /* @@ -1946,7 +1946,7 @@ private bool UseOptionS() */ private bool UseOptionX() { - return (_option & RegexOptions.IgnorePatternWhitespace) != 0; + return (_options & RegexOptions.IgnorePatternWhitespace) != 0; } /* @@ -1954,7 +1954,7 @@ private bool UseOptionX() */ private bool UseOptionE() { - return (_option & RegexOptions.ECMAScript) != 0; + return (_options & RegexOptions.ECMAScript) != 0; } private const byte Q = 5; // quantifier @@ -2066,7 +2066,7 @@ private void AddConcatenate(int pos, int cch, bool isReplacement) str = _pattern.Substring(pos, cch); } - node = new RegexNode(RegexNode.Multi, _option, str); + node = new RegexNode(RegexNode.Multi, 
_options, str); } else { @@ -2075,7 +2075,7 @@ private void AddConcatenate(int pos, int cch, bool isReplacement) if (UseOptionI() && !isReplacement) ch = _culture.TextInfo.ToLower(ch); - node = new RegexNode(RegexNode.One, _option, ch); + node = new RegexNode(RegexNode.One, _options, ch); } _concatenation.AddChild(node); @@ -2127,8 +2127,8 @@ private bool EmptyStack() private void StartGroup(RegexNode openGroup) { _group = openGroup; - _alternation = new RegexNode(RegexNode.Alternate, _option); - _concatenation = new RegexNode(RegexNode.Concatenate, _option); + _alternation = new RegexNode(RegexNode.Alternate, _options); + _concatenation = new RegexNode(RegexNode.Concatenate, _options); } /* @@ -2147,7 +2147,7 @@ private void AddAlternate() _alternation.AddChild(_concatenation.ReverseLeft()); } - _concatenation = new RegexNode(RegexNode.Concatenate, _option); + _concatenation = new RegexNode(RegexNode.Concatenate, _options); } /* @@ -2186,7 +2186,7 @@ private void AddUnitOne(char ch) if (UseOptionI()) ch = _culture.TextInfo.ToLower(ch); - _unit = new RegexNode(RegexNode.One, _option, ch); + _unit = new RegexNode(RegexNode.One, _options, ch); } /* @@ -2197,7 +2197,7 @@ private void AddUnitNotone(char ch) if (UseOptionI()) ch = _culture.TextInfo.ToLower(ch); - _unit = new RegexNode(RegexNode.Notone, _option, ch); + _unit = new RegexNode(RegexNode.Notone, _options, ch); } /* @@ -2205,7 +2205,7 @@ private void AddUnitNotone(char ch) */ private void AddUnitSet(string cc) { - _unit = new RegexNode(RegexNode.Set, _option, cc); + _unit = new RegexNode(RegexNode.Set, _options, cc); } /* @@ -2221,7 +2221,7 @@ private void AddUnitNode(RegexNode node) */ private void AddUnitType(int type) { - _unit = new RegexNode(type, _option); + _unit = new RegexNode(type, _options); } /* @@ -2250,7 +2250,7 @@ private void AddGroup() */ private void PushOptions() { - _optionsStack.Append(_option); + _optionsStack.Append(_options); } /* @@ -2258,7 +2258,7 @@ private void PushOptions() */ 
private void PopOptions() { - _option = _optionsStack.Pop(); + _options = _optionsStack.Pop(); } /* @@ -2270,7 +2270,7 @@ private bool EmptyOptionsStack() } /* - * Pops the option stack, but keeps the current options unchanged. + * Pops the options stack, but keeps the current options unchanged. */ private void PopKeepOptions() { From f84c8f3fe5124563f332c95d4391bcb00c921bad Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 20:01:42 +0200 Subject: [PATCH 12/17] Avoid string allocation when IgnoreCase set Prefix patterns which are passed to RegexBoyerMoore are already lowercased by the parser. Remove the redundant ToLower() call and assert the pattern's lowercase state --- .../RegularExpressions/RegexBoyerMoore.cs | 21 ++----------------- .../tests/Regex.Match.Tests.cs | 4 ++-- .../tests/Regex.Replace.Tests.cs | 2 ++ 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs index 38c28518e345..a2daac6ce453 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs @@ -13,8 +13,6 @@ using System.Diagnostics; using System.Globalization; -using System.IO; -using System.Text; namespace System.Text.RegularExpressions { @@ -39,12 +37,7 @@ public RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, C // Sorry, you just can't use Boyer-Moore to find an empty pattern. // We're doing this for your own protection. (Really, for speed.) Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. 
This is bad for perf"); - - if (caseInsensitive) - { - pattern = string.Create(pattern.Length, (pattern, culture), (span, state) => - state.pattern.AsSpan().ToLower(span, state.culture)); - } + Debug.Assert(!caseInsensitive || pattern.ToLower(culture) == pattern, "RegexBoyerMoore called with a pattern which is not lowercased with caseInsensitive true."); Pattern = pattern; RightToLeft = rightToLeft; @@ -221,17 +214,7 @@ private bool MatchPattern(string text, int index) return false; } - TextInfo textinfo = _culture.TextInfo; - for (int i = 0; i < Pattern.Length; i++) - { - Debug.Assert(textinfo.ToLower(Pattern[i]) == Pattern[i], "pattern should be converted to lower case in constructor!"); - if (textinfo.ToLower(text[index + i]) != Pattern[i]) - { - return false; - } - } - - return true; + return (0 == string.Compare(Pattern, 0, text, index, Pattern.Length, CaseInsensitive, _culture)); } else { diff --git a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 09b8a2dc291f..62d2e6af0e72 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -109,13 +109,13 @@ public static IEnumerable Match_Basic_TestData() } // Turning off case insensitive option in mid-pattern : Actual - "aaa(?-i:match this)bbb", "i" - yield return new object[] { "aaa(?-i:match this)bbb", "AaAmatch thisBBb", RegexOptions.IgnoreCase, 0, 16, true, "AaAmatch thisBBb" }; + yield return new object[] { "aAa(?-i:match this)bbb", "AaAmatch thisBBb", RegexOptions.IgnoreCase, 0, 16, true, "AaAmatch thisBBb" }; // Turning on/off all the options at once : Actual - "aaa(?imnsx-imnsx:match this)bbb", "i" yield return new object[] { "aaa(?imnsx-imnsx:match this)bbb", "AaAmatcH thisBBb", RegexOptions.IgnoreCase, 0, 16, false, string.Empty }; // Actual - "aaa(?#ignore this completely)bbb" - yield return new object[] { "aaa(?#ignore this 
completely)bbb", "aaabbb", RegexOptions.None, 0, 6, true, "aaabbb" }; + yield return new object[] { "aAa(?#ignore this completely)bbb", "aAabbb", RegexOptions.None, 0, 6, true, "aAabbb" }; // Trying empty string: Actual "[a-z0-9]+", "" yield return new object[] { "[a-z0-9]+", "", RegexOptions.None, 0, 0, false, string.Empty }; diff --git a/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs index 0cbf545bd7a1..8f5ed15291c6 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs @@ -60,6 +60,8 @@ public static IEnumerable Replace_String_TestData() yield return new object[] { @"(hello)\s+(world)", "START hello world END", "$2 $1 $1 $2 $3$4", RegexOptions.None, 24, 0, "START world hello hello world $3$4 END" }; yield return new object[] { @"(hello)\s+(world)", "START hello world END", "$2 $1 $1 $2 $123$234", RegexOptions.None, 24, 0, "START world hello hello world $123$234 END" }; + yield return new object[] { "aAa( CcC )bbb", "AaA CcC BBb", "$`blub$'", RegexOptions.IgnoreCase, 0, 10, "AaAblubBBb" }; + yield return new object[] { @"aaa", "My dog cat has fleas.", "$01$02$03$04$05$06$07$08$09$10$11", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Multiline, 21, 0, "My dog cat has fleas." }; yield return new object[] { @"(d)(o)(g)(\s)(c)(a)(t)(\s)(h)(a)(s)", "My dog cat has fleas.", "$01$02$03$04$05$06$07$08$09$10$11", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Multiline, 21, 0, "My dog cat has fleas." }; yield return new object[] { @"(d)(o)(g)(\s)(c)(a)(t)(\s)(h)(a)(s)", "My dog cat has fleas.", "$05$06$07$04$01$02$03$08$09$10$11", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Multiline, 21, 0, "My cat dog has fleas." 
}; From 0c7947a44628c6c4a2981c969623af3252632e0e Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 20:39:04 +0200 Subject: [PATCH 13/17] Remove wrong test case --- src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs index 8f5ed15291c6..d132a9e0fd14 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs @@ -60,7 +60,6 @@ public static IEnumerable Replace_String_TestData() yield return new object[] { @"(hello)\s+(world)", "START hello world END", "$2 $1 $1 $2 $3$4", RegexOptions.None, 24, 0, "START world hello hello world $3$4 END" }; yield return new object[] { @"(hello)\s+(world)", "START hello world END", "$2 $1 $1 $2 $123$234", RegexOptions.None, 24, 0, "START world hello hello world $123$234 END" }; - yield return new object[] { "aAa( CcC )bbb", "AaA CcC BBb", "$`blub$'", RegexOptions.IgnoreCase, 0, 10, "AaAblubBBb" }; yield return new object[] { @"aaa", "My dog cat has fleas.", "$01$02$03$04$05$06$07$08$09$10$11", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Multiline, 21, 0, "My dog cat has fleas." }; yield return new object[] { @"(d)(o)(g)(\s)(c)(a)(t)(\s)(h)(a)(s)", "My dog cat has fleas.", "$01$02$03$04$05$06$07$08$09$10$11", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Multiline, 21, 0, "My dog cat has fleas." }; yield return new object[] { @"(d)(o)(g)(\s)(c)(a)(t)(\s)(h)(a)(s)", "My dog cat has fleas.", "$05$06$07$04$01$02$03$08$09$10$11", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Multiline, 21, 0, "My cat dog has fleas." 
}; From 7ab140e0d38bc870560029e2d814944babe86d49 Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Sun, 24 Jun 2018 22:56:43 +0200 Subject: [PATCH 14/17] Add surrogate pair positive & negative tests --- src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs | 3 +++ src/System.Text.RegularExpressions/tests/RegexParserTests.cs | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 62d2e6af0e72..dcb33f95f15c 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -291,6 +291,9 @@ public static IEnumerable Match_Basic_TestData() // \c if (!PlatformDetection.IsFullFramework) // missing fix for #26501 yield return new object[] { @"(cat)(\c[*)(dog)", "asdlkcat\u00FFdogiwod", RegexOptions.None, 0, 15, false, string.Empty }; + + // Surrogate pairs split up into UTF-16 code units. + yield return new object[] { @"(\uD82F[\uDCA0-\uDCA3])", "\uD82F\uDCA2", RegexOptions.CultureInvariant, 0, 2, true, "\uD82F\uDCA2" }; } [Theory] diff --git a/src/System.Text.RegularExpressions/tests/RegexParserTests.cs b/src/System.Text.RegularExpressions/tests/RegexParserTests.cs index de3a50015eef..15ee898d399b 100644 --- a/src/System.Text.RegularExpressions/tests/RegexParserTests.cs +++ b/src/System.Text.RegularExpressions/tests/RegexParserTests.cs @@ -862,6 +862,8 @@ public void Parse(string pattern, RegexOptions options, object errorObj) [InlineData("a{2147483648,}", RegexOptions.None, RegexParseError.CaptureGroupOutOfRange)] [InlineData("a{0,2147483647}", RegexOptions.None, null)] [InlineData("a{0,2147483648}", RegexOptions.None, RegexParseError.CaptureGroupOutOfRange)] + // Surrogate pair which is parsed as [char,char-char,char] as we operate on UTF-16 code units. 
+ [InlineData("[\uD82F\uDCA0-\uD82F\uDCA3]", RegexOptions.IgnoreCase, RegexParseError.ReversedCharRange)] public void Parse_NotNetFramework(string pattern, RegexOptions options, object error) { Parse(pattern, options, error); From d15bac9ce11dc17d174e4b1142d3f08a564d757f Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Mon, 25 Jun 2018 16:03:22 +0200 Subject: [PATCH 15/17] Remove wrong test case --- src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs index d132a9e0fd14..0cbf545bd7a1 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Replace.Tests.cs @@ -60,7 +60,6 @@ public static IEnumerable Replace_String_TestData() yield return new object[] { @"(hello)\s+(world)", "START hello world END", "$2 $1 $1 $2 $3$4", RegexOptions.None, 24, 0, "START world hello hello world $3$4 END" }; yield return new object[] { @"(hello)\s+(world)", "START hello world END", "$2 $1 $1 $2 $123$234", RegexOptions.None, 24, 0, "START world hello hello world $123$234 END" }; - yield return new object[] { @"aaa", "My dog cat has fleas.", "$01$02$03$04$05$06$07$08$09$10$11", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Multiline, 21, 0, "My dog cat has fleas." }; yield return new object[] { @"(d)(o)(g)(\s)(c)(a)(t)(\s)(h)(a)(s)", "My dog cat has fleas.", "$01$02$03$04$05$06$07$08$09$10$11", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Multiline, 21, 0, "My dog cat has fleas." }; yield return new object[] { @"(d)(o)(g)(\s)(c)(a)(t)(\s)(h)(a)(s)", "My dog cat has fleas.", "$05$06$07$04$01$02$03$08$09$10$11", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Multiline, 21, 0, "My cat dog has fleas." 
}; From 4f31b011065f1ad14d1f8304f9112919da1c8d0f Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Mon, 25 Jun 2018 16:28:43 +0200 Subject: [PATCH 16/17] Add test cases for rtl anchor Add tests for rtl anchored patterns. --- .../tests/Regex.Match.Tests.cs | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index dcb33f95f15c..e9ae205347b5 100644 --- a/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -76,7 +76,11 @@ public static IEnumerable Match_Basic_TestData() yield return new object[] { "[^0-9]+(?>[0-9]+)3", "abc123", RegexOptions.None, 0, 6, false, string.Empty }; // Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z" - yield return new object[] { @"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzz", RegexOptions.None, 0, 17, true, "aaaasdfajsdlfjzzz" }; + yield return new object[] { @"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzz", RegexOptions.IgnoreCase, 0, 17, true, "aaaasdfajsdlfjzzz" }; + yield return new object[] { @"\Aaaaaa\w+zzz\Z", "aaaa", RegexOptions.IgnoreCase, 0, 4, false, string.Empty }; + yield return new object[] { @"\Aaaaaa\w+zzz\Z", "aaaa", RegexOptions.RightToLeft, 0, 4, false, string.Empty }; + yield return new object[] { @"\Aaaaaa\w+zzzzz\Z", "aaaa", RegexOptions.RightToLeft, 0, 4, false, string.Empty }; + yield return new object[] { @"\Aaaaaa\w+zzz\Z", "aaaa", RegexOptions.RightToLeft | RegexOptions.IgnoreCase, 0, 4, false, string.Empty }; // Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z" yield return new object[] { @"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzza", RegexOptions.None, 0, 18, false, string.Empty }; @@ -634,6 +638,24 @@ public static IEnumerable Match_Advanced_TestData() new CaptureData("aaa", 0, 3) } }; + + // RightToLeft with anchor + yield return new object[] + { + "^aaa", 
"aaabbb", RegexOptions.RightToLeft, 3, 3, + new CaptureData[] + { + new CaptureData("aaa", 0, 3) + } + }; + yield return new object[] + { + "bbb$", "aaabbb", RegexOptions.RightToLeft, 0, 3, + new CaptureData[] + { + new CaptureData("bbb", 0, 3) + } + }; } [Theory] From 0e114e6bd516ccd96ce4039c1217fbd383bb9a85 Mon Sep 17 00:00:00 2001 From: Viktor Hofer Date: Fri, 29 Jun 2018 16:11:41 +0200 Subject: [PATCH 17/17] Use state locals in RegexParser --- .../src/System/Text/RegularExpressions/RegexParser.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 3864920ec59d..d9c7500c045d 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -2059,7 +2059,7 @@ private void AddConcatenate(int pos, int cch, bool isReplacement) if (UseOptionI() && !isReplacement) { str = string.Create(cch, (_pattern, _culture, pos, cch), (span, state) => - state._pattern.AsSpan(pos, cch).ToLower(span, state._culture)); + state._pattern.AsSpan(state.pos, state.cch).ToLower(span, state._culture)); } else {