Skip to content

Commit 4b0a26b

Browse files
committed
feat(core): add support for COMMENTS and UNICODE_CASE global flags
- Introduced SiftGlobalFlag.COMMENTS (?x) to allow verbose regex validation.
- Introduced SiftGlobalFlag.UNICODE_CASE (?u) for Unicode-aware case-insensitive matching.
- Validated correct flag concatenation and engine propagation via unit tests.
1 parent 099ea96 commit 4b0a26b

File tree

2 files changed

+64
-6
lines changed

2 files changed

+64
-6
lines changed

sift-core/src/main/java/com/mirkoddd/sift/core/SiftGlobalFlag.java

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,33 +19,54 @@
1919
* Inline flags to modify the regular expression engine behavior globally.
2020
* <p>
2121
* These flags are applied at the very beginning of the pattern
22-
* (e.g., {@code (?im)}) to instruct the Regex engine.
22+
* (e.g., {@code (?imx)}) to instruct the Regex engine.
2323
*/
2424
public enum SiftGlobalFlag {
2525

2626
/**
2727
* Enables case-insensitive matching ({@code (?i)}).
2828
* <p>
2929
* By default, case-insensitive matching assumes that only characters in the US-ASCII charset are being matched.
30+
* To enable Unicode-aware case-insensitive matching, combine this with {@link #UNICODE_CASE}.
3031
*/
3132
CASE_INSENSITIVE("i"),
3233

3334
/**
3435
* Enables multiline mode ({@code (?m)}).
3536
* <p>
36-
* In multiline mode, the expressions {@code ^} (fromStart) and {@code $} (andNothingElse)
37-
* match just after or just before a line terminator, rather than only at the
37+
* In multiline mode, the expressions {@code ^} (fromStart) and {@code $} (andNothingElse)
38+
* match just after or just before a line terminator, rather than only at the
3839
* beginning or end of the entire input string.
3940
*/
4041
MULTILINE("m"),
4142

4243
/**
4344
* Enables dotall mode ({@code (?s)}).
4445
* <p>
45-
* In dotall mode, the {@code .any()} expression matches any character,
46+
* In dotall mode, the {@code .any()} expression matches any character,
4647
* including a line terminator (which it does not match by default).
4748
*/
48-
DOTALL("s");
49+
DOTALL("s"),
50+
51+
/**
52+
* Enables verbose/comments mode ({@code (?x)}).
53+
* <p>
54+
* In this mode, whitespace is ignored, and embedded comments starting with
55+
* {@code #} are ignored until the end of a line.
56+
* <p>
57+
* <i>Note: Sift's internal {@code literal()} generator automatically escapes spaces
58+
* and {@code #} characters to ensure your literals remain strictly matched even when
59+
* this flag is active.</i>
60+
*/
61+
COMMENTS("x"),
62+
63+
/**
64+
* Enables Unicode-aware case folding ({@code (?u)}).
65+
* <p>
66+
* When this flag is specified alongside {@link #CASE_INSENSITIVE}, case-insensitive matching
67+
* is done in a manner consistent with the Unicode Standard.
68+
*/
69+
UNICODE_CASE("u");
4970

5071
private final String symbol;
5172

@@ -56,7 +77,7 @@ public enum SiftGlobalFlag {
5677
/**
5778
* Retrieves the regex symbol associated with this flag.
5879
*
59-
* @return The literal character used by the regex engine (e.g., 'i', 'm', 's').
80+
* @return The literal character used by the regex engine (e.g., 'i', 'm', 's', 'x', 'u').
6081
*/
6182
public String getSymbol() {
6283
return symbol;

sift-core/src/test/java/com/mirkoddd/sift/core/SiftTest.java

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1187,6 +1187,43 @@ void flagsFromWordBoundary() {
11871187
assertRegexMatches(regex, "fatal ERROR occurred");
11881188
assertRegexMatches(regex, "fatal error occurred");
11891189
}
1190+
1191+
@Test
1192+
@DisplayName("Should handle COMMENTS (verbose) mode correctly")
1193+
void commentsFlag() {
1194+
String regex = filteringWith(SiftGlobalFlag.COMMENTS)
1195+
.fromStart()
1196+
.exactly(3).letters()
1197+
.andNothingElse()
1198+
.shake();
1199+
1200+
// Ensure the 'x' flag is correctly injected
1201+
assertEquals("(?x)^[a-zA-Z]{3}$", regex);
1202+
1203+
// Ensure standard matching still works perfectly
1204+
assertRegexMatches(regex, "abc");
1205+
assertRegexDoesNotMatch(regex, "a c"); // Space is not matched unless explicitly requested
1206+
}
1207+
1208+
@Test
1209+
@DisplayName("Should handle UNICODE_CASE mode correctly alongside CASE_INSENSITIVE")
1210+
void unicodeCaseFlag() {
1211+
// Unicode Case is usually combined with Case Insensitive
1212+
String regex = filteringWith(CASE_INSENSITIVE, SiftGlobalFlag.UNICODE_CASE)
1213+
.fromStart()
1214+
.oneOrMore().lettersUnicode() // Using the \p{L} class
1215+
.andNothingElse()
1216+
.shake();
1217+
1218+
// Ensure 'i' and 'u' are concatenated correctly inside the flag group
1219+
assertEquals("(?iu)^[\\p{L}]+$", regex);
1220+
1221+
// Lowercase and uppercase Unicode letters must both match (note: \p{L} covers both cases even without the flags)
1222+
assertRegexMatches(regex, "è");
1223+
assertRegexMatches(regex, "È");
1224+
assertRegexMatches(regex, "ω"); // Greek lowercase omega
1225+
assertRegexMatches(regex, "Ω"); // Greek uppercase omega
1226+
}
11901227
}
11911228

11921229
@Test

0 commit comments

Comments
 (0)