From 2c7af0f65321c2d0c50a86d3887530c501aa03e1 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Fri, 7 Oct 2016 17:24:22 -0700 Subject: [PATCH 1/9] Fix backslash escaping in 'LIKE' patterns. --- .../spark/sql/catalyst/util/StringUtils.scala | 36 +++++++++---------- .../expressions/RegexpExpressionsSuite.scala | 2 ++ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index cde8bd5b9614c..5ff87b6e5f611 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -25,26 +25,24 @@ object StringUtils { // replace the _ with .{1} exactly match 1 time of any character // replace the % with .*, match 0 or more times with any character - def escapeLikeRegex(v: String): String = { - if (!v.isEmpty) { - "(?s)" + (' ' +: v.init).zip(v).flatMap { - case (prev, '\\') => "" - case ('\\', c) => - c match { - case '_' => "_" - case '%' => "%" - case _ => Pattern.quote("\\" + c) - } - case (prev, c) => - c match { - case '_' => "." - case '%' => ".*" - case _ => Pattern.quote(Character.toString(c)) - } - }.mkString - } else { - v + def escapeLikeRegex(str: String): String = { + val builder = new StringBuilder() + str.foldLeft(false) { case (escaping, next) => + if (escaping) { + builder ++= Pattern.quote(Character.toString(next)) + false + } else if (next == '\\') { + true + } else { + builder ++= (next match { + case '_' => "." + case '%' => ".*" + case _ => Pattern.quote(Character.toString(next)) + }) + false + } } + "(?s)" + builder.result() // (?s) enables dotall mode, causing "." to match new lines } private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 5299549e7b4da..9b42489cb29eb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -53,6 +53,8 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\nb" like "a_b", true) checkEvaluation("ab" like "a%b", true) checkEvaluation("a\nb" like "a%b", true) + + checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping } test("LIKE Non-literal Regular Expression") { From 6d39df3c352d4af7e071adb099ddecd07e60fbf5 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Fri, 7 Oct 2016 18:10:20 -0700 Subject: [PATCH 2/9] Add more tests for various combinations of backslashes and SQL pattern characters --- .../expressions/RegexpExpressionsSuite.scala | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 9b42489cb29eb..9316f761904df 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -55,6 +55,20 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\nb" like "a%b", true) checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping + checkEvaluation("""\_%""" like """%\\__""", true) + checkEvaluation("""\_%""" like "%\\\\__", true) + checkEvaluation("""\_%""" like """%\\_%""", true) + checkEvaluation("""\\\\%%""" like """\\%""", true) + checkEvaluation("""\%\""" like """%\%%""", true) + checkEvaluation("\\\n\n%\\" like "\\\\___\\\\", true) + checkEvaluation("""%%""" like """%%""", true) + checkEvaluation("""\__""" like """\\\__""", true) + + checkEvaluation("""\\\__""" like """%\\%\%""", false) + checkEvaluation("""\_""" like """\_\_""", false) + checkEvaluation("""_\\\%""" like """%\\""", false) + checkEvaluation("""_\\__""" like """_\___""", false) + } test("LIKE Non-literal Regular Expression") { From 25f31bbb73276798fce3e4d0360cd5710c947ea0 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Fri, 7 Oct 2016 18:16:45 -0700 Subject: [PATCH 3/9] Factor out foldLeft to a for loop and mutable state --- .../org/apache/spark/sql/catalyst/util/StringUtils.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index 5ff87b6e5f611..c03cc4c718755 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -27,19 +27,20 @@ object StringUtils { // replace the % with .*, match 0 or more times with any character def escapeLikeRegex(str: String): String = { val builder = new StringBuilder() - str.foldLeft(false) { case (escaping, next) => + var escaping = false + for (next <- str) { if (escaping) { builder ++= Pattern.quote(Character.toString(next)) - false + escaping = false } else if (next == '\\') { - true + escaping = true } else { builder ++= (next match { case '_' => "." case '%' => ".*" case _ => Pattern.quote(Character.toString(next)) }) - false + escaping = false } } "(?s)" + builder.result() // (?s) enables dotall mode, causing "." to match new lines From cc3d9be1e584fd831b35e4aa4dac9ffe5588e515 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Fri, 7 Oct 2016 19:57:02 -0700 Subject: [PATCH 4/9] Fix tests --- .../org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala index 2ffc18a8d14fb..78fee5135c3ae 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala @@ -24,9 +24,9 @@ class StringUtilsSuite extends SparkFunSuite { test("escapeLikeRegex") { assert(escapeLikeRegex("abdef") === "(?s)\\Qa\\E\\Qb\\E\\Qd\\E\\Qe\\E\\Qf\\E") - assert(escapeLikeRegex("a\\__b") === "(?s)\\Qa\\E_.\\Qb\\E") + assert(escapeLikeRegex("a\\__b") === "(?s)\\Qa\\E\\Q_\\E.\\Qb\\E") assert(escapeLikeRegex("a_%b") === "(?s)\\Qa\\E..*\\Qb\\E") - assert(escapeLikeRegex("a%\\%b") === "(?s)\\Qa\\E.*%\\Qb\\E") + assert(escapeLikeRegex("a%\\%b") === "(?s)\\Qa\\E.*\\Q%\\E\\Qb\\E") assert(escapeLikeRegex("a%") === "(?s)\\Qa\\E.*") assert(escapeLikeRegex("**") === "(?s)\\Q*\\E\\Q*\\E") assert(escapeLikeRegex("a_b") === "(?s)\\Qa\\E.\\Qb\\E") From 70aaeac52765b111b98ab2a6ab1f3138e9956266 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Thu, 13 Oct 2016 11:36:43 -0700 Subject: [PATCH 5/9] Add more documentation --- .../expressions/regexpExpressions.scala | 17 +++++- .../expressions/RegexpExpressionsSuite.scala | 57 +++++++++++++++---- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 4896a6225aa80..1504c7c62e104 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -68,7 +68,22 @@ trait StringRegexExpression extends ImplicitCastInputTypes { * Simple RegEx pattern matching function */ @ExpressionDescription( - usage = "str _FUNC_ pattern - Returns true if `str` matches `pattern`, or false otherwise.") + usage = "str _FUNC_ pattern - Returns true if str matches pattern, " + + "null if any arguments are null, false otherwise.", + extended = """ + |The pattern is a string which is matched literally, with exception to the + |following symbols: + | + | _ matches any one character in the input (similar to . in posix + | regular expressions) + | + | % matches zero ore more characters in the input (similar to .* in + | posix regular expressions) + | + |The default escape character is '\\'. Any character after the escape + |character will be matched against literally. + | + |Use RLIKE to match with standard regular expressions.""") case class Like(left: Expression, right: Expression) extends BinaryExpression with StringRegexExpression { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 9316f761904df..4f41d7d5de92e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.types.StringType +import org.apache.spark.sql.types.{IntegerType, StringType} /** * Unit tests for regular expression (regexp) related SQL expressions. @@ -27,6 +27,8 @@ import org.apache.spark.sql.types.StringType class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("LIKE literal Regular Expression") { + + // null handling checkEvaluation(Literal.create(null, StringType).like("a"), null) checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null) checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null) @@ -39,6 +41,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null) + // simple patterns checkEvaluation("abdef" like "abdef", true) checkEvaluation("a_%b" like "a\\__b", true) checkEvaluation("addb" like "a_%b", true) @@ -54,20 +57,32 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("ab" like "a%b", true) checkEvaluation("a\nb" like "a%b", true) + //empty input + checkEvaluation("" like "", true) + checkEvaluation("a" like "", false) + checkEvaluation("" like "a", false) + + + // SI-17647 double-escaping backslash checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping - checkEvaluation("""\_%""" like """%\\__""", true) - checkEvaluation("""\_%""" like "%\\\\__", true) - checkEvaluation("""\_%""" like """%\\_%""", true) - checkEvaluation("""\\\\%%""" like """\\%""", true) - checkEvaluation("""\%\""" like """%\%%""", true) - checkEvaluation("\\\n\n%\\" like "\\\\___\\\\", true) checkEvaluation("""%%""" like """%%""", true) checkEvaluation("""\__""" like """\\\__""", true) - checkEvaluation("""\\\__""" like """%\\%\%""", false) - checkEvaluation("""\_""" like """\_\_""", false) checkEvaluation("""_\\\%""" like """%\\""", false) - checkEvaluation("""_\\__""" like """_\___""", false) + + // unicode + checkEvaluation("a\u20ACa" like "_\u20AC_", true) + checkEvaluation("a€a" like "_€_", true) + checkEvaluation("a€a" like "_\u20AC_", true) + checkEvaluation("a\u20ACa" like "_€_", true) + + // escaping at end position + checkEvaluation("""a\""" like """a\""", false) // TODO: should throw an exception? + + // case + checkEvaluation("A" like "a%", false) + checkEvaluation("a" like "A%", false) + checkEvaluation("AaA" like "_a_", true) } @@ -90,6 +105,28 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\nb" like regEx, true, create_row("a%b")) checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%")) + + checkEvaluation("" like regEx, true, create_row("")) + checkEvaluation("a" like regEx, false, create_row("")) + checkEvaluation("" like regEx, false, create_row("a")) + + checkEvaluation("""\\\\""" like regEx, true, create_row("""%\\%""")) + checkEvaluation("""%%""" like regEx, true, create_row("""%%""")) + checkEvaluation("""\__""" like regEx, true, create_row("""\\\__""")) + checkEvaluation("""\\\__""" like regEx, false, create_row("""%\\%\%""")) + checkEvaluation("""_\\\%""" like regEx, false, create_row("""%\\""")) + + checkEvaluation("a\u20ACa" like regEx, true, create_row("_\u20AC_")) + checkEvaluation("a€a" like regEx, true, create_row("_€_")) + checkEvaluation("a€a" like regEx, true, create_row("_\u20AC_")) + checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_")) + + checkEvaluation("""a\""" like regEx, false, create_row("""a\""")) // TODO: should throw an exception? + + checkEvaluation("A" like regEx, false, create_row("a%")) + checkEvaluation("a" like regEx, false, create_row("A%")) + checkEvaluation("AaA" like regEx, true, create_row("_a_")) + } test("RLIKE literal Regular Expression") { From 8a04504a13257f5a4f00908c1a0dd486df60808d Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Thu, 13 Oct 2016 16:46:35 -0700 Subject: [PATCH 6/9] fix style issues --- .../catalyst/expressions/RegexpExpressionsSuite.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 4f41d7d5de92e..65c08d90eacb4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -57,7 +57,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("ab" like "a%b", true) checkEvaluation("a\nb" like "a%b", true) - //empty input + // empty input checkEvaluation("" like "", true) checkEvaluation("a" like "", false) checkEvaluation("" like "a", false) @@ -71,10 +71,12 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("""_\\\%""" like """%\\""", false) // unicode + // scalastyle:off nonascii checkEvaluation("a\u20ACa" like "_\u20AC_", true) checkEvaluation("a€a" like "_€_", true) checkEvaluation("a€a" like "_\u20AC_", true) checkEvaluation("a\u20ACa" like "_€_", true) + // scalastyle:on nonascii // escaping at end position checkEvaluation("""a\""" like """a\""", false) // TODO: should throw an exception? @@ -116,12 +118,15 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("""\\\__""" like regEx, false, create_row("""%\\%\%""")) checkEvaluation("""_\\\%""" like regEx, false, create_row("""%\\""")) + // scalastyle:off nonascii checkEvaluation("a\u20ACa" like regEx, true, create_row("_\u20AC_")) checkEvaluation("a€a" like regEx, true, create_row("_€_")) checkEvaluation("a€a" like regEx, true, create_row("_\u20AC_")) checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_")) + // scalastyle:on nonascii - checkEvaluation("""a\""" like regEx, false, create_row("""a\""")) // TODO: should throw an exception? + // TODO: should throw an exception? + checkEvaluation("""a\""" like regEx, false, create_row("""a\""")) checkEvaluation("A" like regEx, false, create_row("a%")) checkEvaluation("a" like regEx, false, create_row("A%")) From 8bca7a5e0ec60957c0a847e18f5ed40b763c752e Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Tue, 18 Oct 2016 15:07:55 -0700 Subject: [PATCH 7/9] Implement Hive's 'like' behaviour --- .../expressions/regexpExpressions.scala | 26 +++++++------- .../spark/sql/catalyst/util/StringUtils.scala | 36 +++++++++---------- .../expressions/RegexpExpressionsSuite.scala | 11 +++--- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 1504c7c62e104..8ee987e5ba8c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -70,20 +70,18 @@ trait StringRegexExpression extends ImplicitCastInputTypes { @ExpressionDescription( usage = "str _FUNC_ pattern - Returns true if str matches pattern, " + "null if any arguments are null, false otherwise.", - extended = """ - |The pattern is a string which is matched literally, with exception to the - |following symbols: - | - | _ matches any one character in the input (similar to . in posix - | regular expressions) - | - | % matches zero ore more characters in the input (similar to .* in - | posix regular expressions) - | - |The default escape character is '\\'. Any character after the escape - |character will be matched against literally. - | - |Use RLIKE to match with standard regular expressions.""") + extended = + "The pattern is a string which is matched literally, with exception to the " + + "following special symbols:\n\n" + + " _ matches any one character in the input (similar to . in posix " + + "regular expressions)\n\n" + + " % matches zero ore more characters in the input (similar to .* in " + + "posix regular expressions\n\n" + + "The default escape character is '\\'. If an escape character precedes a special symbol or " + + "another escape character, the following character is matched literally, otherwise the " + + "escape character is treated literally. I.e. '\\%' would match '%', whereas '\\a' matches " + + "'\\a'.\n\n" + + "Use RLIKE to match with standard regular expressions.") case class Like(left: Expression, right: Expression) extends BinaryExpression with StringRegexExpression { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index c03cc4c718755..79c61023b480e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -23,27 +23,27 @@ import org.apache.spark.unsafe.types.UTF8String object StringUtils { - // replace the _ with .{1} exactly match 1 time of any character - // replace the % with .*, match 0 or more times with any character + /** Convert 'like' pattern to Java regex. */ def escapeLikeRegex(str: String): String = { - val builder = new StringBuilder() - var escaping = false - for (next <- str) { - if (escaping) { - builder ++= Pattern.quote(Character.toString(next)) - escaping = false - } else if (next == '\\') { - escaping = true - } else { - builder ++= (next match { - case '_' => "." - case '%' => ".*" - case _ => Pattern.quote(Character.toString(next)) - }) - escaping = false + val in = str.toIterator + val out = new StringBuilder() + + while (in.hasNext) { + in.next match { + case '\\' if in.hasNext => + in.next match { + case '\\' => out ++= Pattern.quote("\\") + case '_' => out ++= Pattern.quote("_") + case '%' => out ++= Pattern.quote("%") + // escape before non-escapable character treated literally + case c => out ++= Pattern.quote("\\" + c) + } + case '_' => out ++= "." + case '%' => out ++= ".*" + case c => out ++= Pattern.quote(Character.toString(c)) } } - "(?s)" + builder.result() // (?s) enables dotall mode, causing "." to match new lines + "(?s)" + out.result() // (?s) enables dotall mode, causing "." to match new lines } private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 65c08d90eacb4..71292dd7013ed 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -62,7 +62,6 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a" like "", false) checkEvaluation("" like "a", false) - // SI-17647 double-escaping backslash checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping checkEvaluation("""%%""" like """%%""", true) @@ -78,8 +77,9 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like "_€_", true) // scalastyle:on nonascii - // escaping at end position - checkEvaluation("""a\""" like """a\""", false) // TODO: should throw an exception? + // escaping non-escapable should match literally + checkEvaluation("""\a""" like """\a""", true) + checkEvaluation("""a\""" like """a\""", true) // case checkEvaluation("A" like "a%", false) @@ -125,8 +125,9 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_")) // scalastyle:on nonascii - // TODO: should throw an exception? - checkEvaluation("""a\""" like regEx, false, create_row("""a\""")) + // escaping non-escapable should match literally + checkEvaluation("""\a""" like regEx, true, create_row("""\a""")) + checkEvaluation("""a\""" like regEx, true, create_row("""a\""")) checkEvaluation("A" like regEx, false, create_row("a%")) checkEvaluation("a" like regEx, false, create_row("A%")) From ba6d0584e4a6417cd9e130d831219cccc194f791 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Fri, 21 Oct 2016 18:19:04 -0700 Subject: [PATCH 8/9] Throw error on invalid escape pattern --- .../expressions/regexpExpressions.scala | 14 ++++++------- .../spark/sql/catalyst/util/StringUtils.scala | 15 +++++++------ .../expressions/RegexpExpressionsSuite.scala | 21 +++++++++++++------ 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 8ee987e5ba8c7..3df6effb6f7d6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -71,16 +71,16 @@ trait StringRegexExpression extends ImplicitCastInputTypes { usage = "str _FUNC_ pattern - Returns true if str matches pattern, " + "null if any arguments are null, false otherwise.", extended = - "The pattern is a string which is matched literally, with exception to the " + - "following special symbols:\n\n" + + "The pattern is a string which is matched literally, with exception to the following " + + "special symbols:\n\n" + " _ matches any one character in the input (similar to . in posix " + "regular expressions)\n\n" + " % matches zero ore more characters in the input (similar to .* in " + - "posix regular expressions\n\n" + - "The default escape character is '\\'. If an escape character precedes a special symbol or " + - "another escape character, the following character is matched literally, otherwise the " + - "escape character is treated literally. I.e. '\\%' would match '%', whereas '\\a' matches " + - "'\\a'.\n\n" + + "posix regular expressions)\n\n" + + "The escape character is '\\'. If an escape character precedes a special symbol or " + + "another escape character, the following character is matched literally, For example, " + + "the expression ` like \\%SystemDrive\\%\\\\Users%` will match any `` that " + + "starts with '%SystemDrive%\\Users'. It is invalid to escape any other character.\n\n" + "Use RLIKE to match with standard regular expressions.") case class Like(left: Expression, right: Expression) extends BinaryExpression with StringRegexExpression { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index 79c61023b480e..b760b994f2ff2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.util import java.util.regex.{Pattern, PatternSyntaxException} +import org.apache.spark.sql.AnalysisException import org.apache.spark.unsafe.types.UTF8String object StringUtils { @@ -28,16 +29,18 @@ object StringUtils { val in = str.toIterator val out = new StringBuilder() + def fail(message: String) = throw new AnalysisException( + s"the pattern '$str' is invalid, $message") + while (in.hasNext) { in.next match { case '\\' if in.hasNext => - in.next match { - case '\\' => out ++= Pattern.quote("\\") - case '_' => out ++= Pattern.quote("_") - case '%' => out ++= Pattern.quote("%") - // escape before non-escapable character treated literally - case c => out ++= Pattern.quote("\\" + c) + val c = in.next + c match { + case '_' | '%' | '\\' => out ++= Pattern.quote(Character.toString(c)) + case _ => fail(s"the escape character is not allowed to precede '$c'") } + case '\\' => fail("it is not allowed to end with the escape character") case '_' => out ++= "." case '%' => out ++= ".*" case c => out ++= Pattern.quote(Character.toString(c)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 71292dd7013ed..6de2ba2131cd0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.types.{IntegerType, StringType} @@ -77,9 +78,13 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like "_€_", true) // scalastyle:on nonascii - // escaping non-escapable should match literally - checkEvaluation("""\a""" like """\a""", true) - checkEvaluation("""a\""" like """a\""", true) + // invalid escaping + intercept[AnalysisException] { + evaluate("""a""" like """\a""") + } + intercept[AnalysisException] { + evaluate("""a""" like """a\""") + } // case checkEvaluation("A" like "a%", false) @@ -125,9 +130,13 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_")) // scalastyle:on nonascii - // escaping non-escapable should match literally - checkEvaluation("""\a""" like regEx, true, create_row("""\a""")) - checkEvaluation("""a\""" like regEx, true, create_row("""a\""")) + // invalid escaping + intercept[AnalysisException] { + evaluate("""a""" like regEx, create_row("""\a""")) + } + intercept[AnalysisException] { + evaluate("""a""" like regEx, create_row("""a\""")) + } checkEvaluation("A" like regEx, false, create_row("a%")) checkEvaluation("a" like regEx, false, create_row("A%")) From dbd305c195ba1c1661f82e3e19085778b8783794 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Mon, 24 Oct 2016 17:07:14 -0700 Subject: [PATCH 9/9] Refactor tests and add documentation --- .../expressions/regexpExpressions.scala | 34 ++-- .../spark/sql/catalyst/util/StringUtils.scala | 18 +- .../expressions/RegexpExpressionsSuite.scala | 187 +++++++----------- 3 files changed, 111 insertions(+), 128 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 3df6effb6f7d6..0325d0e8370f4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -70,18 +70,28 @@ trait StringRegexExpression extends ImplicitCastInputTypes { @ExpressionDescription( usage = "str _FUNC_ pattern - Returns true if str matches pattern, " + "null if any arguments are null, false otherwise.", - extended = - "The pattern is a string which is matched literally, with exception to the following " + - "special symbols:\n\n" + - " _ matches any one character in the input (similar to . in posix " + - "regular expressions)\n\n" + - " % matches zero ore more characters in the input (similar to .* in " + - "posix regular expressions)\n\n" + - "The escape character is '\\'. If an escape character precedes a special symbol or " + - "another escape character, the following character is matched literally, For example, " + - "the expression ` like \\%SystemDrive\\%\\\\Users%` will match any `` that " + - "starts with '%SystemDrive%\\Users'. It is invalid to escape any other character.\n\n" + - "Use RLIKE to match with standard regular expressions.") + extended = """ + Arguments: + str - a string expression + pattern - a string expression. The pattern is a string which is matched literally, with + exception to the following special symbols: + + _ matches any one character in the input (similar to . in posix regular expressions) + + % matches zero ore more characters in the input (similar to .* in posix regular + expressions) + + The escape character is '\'. If an escape character precedes a special symbol or another + escape character, the following character is matched literally. It is invalid to escape + any other character. + + Examples: + > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%' + true + + See also: + Use RLIKE to match with standard regular expressions. +""") case class Like(left: Expression, right: Expression) extends BinaryExpression with StringRegexExpression { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index b760b994f2ff2..ca22ea24207e1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -24,13 +24,23 @@ import org.apache.spark.unsafe.types.UTF8String object StringUtils { - /** Convert 'like' pattern to Java regex. */ - def escapeLikeRegex(str: String): String = { - val in = str.toIterator + /** + * Validate and convert SQL 'like' pattern to a Java regular expression. + * + * Underscores (_) are converted to '.' and percent signs (%) are converted to '.*', other + * characters are quoted literally. Escaping is done according to the rules specified in + * [[org.apache.spark.sql.catalyst.expressions.Like]] usage documentation. An invalid pattern will + * throw an [[AnalysisException]]. + * + * @param pattern the SQL pattern to convert + * @return the equivalent Java regular expression of the pattern + */ + def escapeLikeRegex(pattern: String): String = { + val in = pattern.toIterator val out = new StringBuilder() def fail(message: String) = throw new AnalysisException( - s"the pattern '$str' is invalid, $message") + s"the pattern '$pattern' is invalid, $message") while (in.hasNext) { in.next match { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 6de2ba2131cd0..1ce150e091981 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -27,10 +27,29 @@ import org.apache.spark.sql.types.{IntegerType, StringType} */ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { - test("LIKE literal Regular Expression") { + /** + * Check if a given expression evaluates to an expected output, in case the input is + * a literal and in case the input is in the form of a row. + * @tparam A type of input + * @param mkExpr the expression to test for a given input + * @param input value that will be used to create the expression, as literal and in the form + * of a row + * @param expected the expected output of the expression + * @param inputToExpression an implicit conversion from the input type to its corresponding + * sql expression + */ + def checkLiteralRow[A](mkExpr: Expression => Expression, input: A, expected: Any) + (implicit inputToExpression: A => Expression): Unit = { + checkEvaluation(mkExpr(input), expected) // check literal input + + val regex = 'a.string.at(0) + checkEvaluation(mkExpr(regex), expected, create_row(input)) // check row input + } + + test("LIKE Pattern") { // null handling - checkEvaluation(Literal.create(null, StringType).like("a"), null) + checkLiteralRow(Literal.create(null, StringType).like(_), "a", null) checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null) checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null) checkEvaluation( @@ -43,109 +62,63 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null) // simple patterns - checkEvaluation("abdef" like "abdef", true) - checkEvaluation("a_%b" like "a\\__b", true) - checkEvaluation("addb" like "a_%b", true) - checkEvaluation("addb" like "a\\__b", false) - checkEvaluation("addb" like "a%\\%b", false) - checkEvaluation("a_%b" like "a%\\%b", true) - checkEvaluation("addb" like "a%", true) - checkEvaluation("addb" like "**", false) - checkEvaluation("abc" like "a%", true) - checkEvaluation("abc" like "b%", false) - checkEvaluation("abc" like "bc%", false) - checkEvaluation("a\nb" like "a_b", true) - checkEvaluation("ab" like "a%b", true) - checkEvaluation("a\nb" like "a%b", true) + checkLiteralRow("abdef" like _, "abdef", true) + checkLiteralRow("a_%b" like _, "a\\__b", true) + checkLiteralRow("addb" like _, "a_%b", true) + checkLiteralRow("addb" like _, "a\\__b", false) + checkLiteralRow("addb" like _, "a%\\%b", false) + checkLiteralRow("a_%b" like _, "a%\\%b", true) + checkLiteralRow("addb" like _, "a%", true) + checkLiteralRow("addb" like _, "**", false) + checkLiteralRow("abc" like _, "a%", true) + checkLiteralRow("abc" like _, "b%", false) + checkLiteralRow("abc" like _, "bc%", false) + checkLiteralRow("a\nb" like _, "a_b", true) + checkLiteralRow("ab" like _, "a%b", true) + checkLiteralRow("a\nb" like _, "a%b", true) // empty input - checkEvaluation("" like "", true) - checkEvaluation("a" like "", false) - checkEvaluation("" like "a", false) + checkLiteralRow("" like _, "", true) + checkLiteralRow("a" like _, "", false) + checkLiteralRow("" like _, "a", false) // SI-17647 double-escaping backslash - checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping - checkEvaluation("""%%""" like """%%""", true) - checkEvaluation("""\__""" like """\\\__""", true) - checkEvaluation("""\\\__""" like """%\\%\%""", false) - checkEvaluation("""_\\\%""" like """%\\""", false) + checkLiteralRow("""\\\\""" like _, """%\\%""", true) + checkLiteralRow("""%%""" like _, """%%""", true) + checkLiteralRow("""\__""" like _, """\\\__""", true) + checkLiteralRow("""\\\__""" like _, """%\\%\%""", false) + checkLiteralRow("""_\\\%""" like _, """%\\""", false) // unicode // scalastyle:off nonascii - checkEvaluation("a\u20ACa" like "_\u20AC_", true) - checkEvaluation("a€a" like "_€_", true) - checkEvaluation("a€a" like "_\u20AC_", true) - checkEvaluation("a\u20ACa" like "_€_", true) + checkLiteralRow("a\u20ACa" like _, "_\u20AC_", true) + checkLiteralRow("a€a" like _, "_€_", true) + checkLiteralRow("a€a" like _, "_\u20AC_", true) + checkLiteralRow("a\u20ACa" like _, "_€_", true) // scalastyle:on nonascii // invalid escaping - intercept[AnalysisException] { + val invalidEscape = intercept[AnalysisException] { evaluate("""a""" like """\a""") } - intercept[AnalysisException] { + assert(invalidEscape.getMessage.contains("pattern")) + + val endEscape = intercept[AnalysisException] { evaluate("""a""" like """a\""") } + assert(endEscape.getMessage.contains("pattern")) // case - checkEvaluation("A" like "a%", false) - checkEvaluation("a" like "A%", false) - checkEvaluation("AaA" like "_a_", true) - - } - - test("LIKE Non-literal Regular Expression") { - val regEx = 'a.string.at(0) - checkEvaluation("abcd" like regEx, null, create_row(null)) - checkEvaluation("abdef" like regEx, true, create_row("abdef")) - checkEvaluation("a_%b" like regEx, true, create_row("a\\__b")) - checkEvaluation("addb" like regEx, true, create_row("a_%b")) - checkEvaluation("addb" like regEx, false, create_row("a\\__b")) - checkEvaluation("addb" like regEx, false, create_row("a%\\%b")) - checkEvaluation("a_%b" like regEx, true, create_row("a%\\%b")) - checkEvaluation("addb" like regEx, true, create_row("a%")) - checkEvaluation("addb" like regEx, false, create_row("**")) - checkEvaluation("abc" like regEx, true, create_row("a%")) - checkEvaluation("abc" like regEx, false, create_row("b%")) - checkEvaluation("abc" like regEx, false, create_row("bc%")) - checkEvaluation("a\nb" like regEx, true, create_row("a_b")) - checkEvaluation("ab" like regEx, true, create_row("a%b")) - checkEvaluation("a\nb" like regEx, true, create_row("a%b")) - - checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%")) - - checkEvaluation("" like regEx, true, create_row("")) - checkEvaluation("a" like regEx, false, create_row("")) - checkEvaluation("" like regEx, false, create_row("a")) - - checkEvaluation("""\\\\""" like regEx, true, create_row("""%\\%""")) - checkEvaluation("""%%""" like regEx, true, create_row("""%%""")) - checkEvaluation("""\__""" like regEx, true, create_row("""\\\__""")) - checkEvaluation("""\\\__""" like regEx, false, create_row("""%\\%\%""")) - checkEvaluation("""_\\\%""" like regEx, false, create_row("""%\\""")) - - // scalastyle:off nonascii - checkEvaluation("a\u20ACa" like regEx, true, create_row("_\u20AC_")) - checkEvaluation("a€a" like regEx, true, create_row("_€_")) - checkEvaluation("a€a" like regEx, true, create_row("_\u20AC_")) - checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_")) - // scalastyle:on nonascii - - // invalid escaping - intercept[AnalysisException] { - evaluate("""a""" like regEx, create_row("""\a""")) - } - intercept[AnalysisException] { - evaluate("""a""" like regEx, create_row("""a\""")) - } - - checkEvaluation("A" like regEx, false, create_row("a%")) - checkEvaluation("a" like regEx, false, create_row("A%")) - checkEvaluation("AaA" like regEx, true, create_row("_a_")) + checkLiteralRow("A" like _, "a%", false) + checkLiteralRow("a" like _, "A%", false) + checkLiteralRow("AaA" like _, "_a_", true) + // example + checkLiteralRow("""%SystemDrive%\Users\John""" like _, """\%SystemDrive\%\\Users%""", true) } - test("RLIKE literal Regular Expression") { - checkEvaluation(Literal.create(null, StringType) rlike "abdef", null) + test("RLIKE Regular Expression") { + checkLiteralRow(Literal.create(null, StringType) rlike _, "abdef", null) checkEvaluation("abdef" rlike Literal.create(null, StringType), null) checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null) checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true) @@ -155,42 +128,32 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null) - checkEvaluation("abdef" rlike "abdef", true) - checkEvaluation("abbbbc" rlike "a.*c", true) + checkLiteralRow("abdef" rlike _, "abdef", true) + checkLiteralRow("abbbbc" rlike _, "a.*c", true) - checkEvaluation("fofo" rlike "^fo", true) - checkEvaluation("fo\no" rlike "^fo\no$", true) - checkEvaluation("Bn" rlike "^Ba*n", true) - checkEvaluation("afofo" rlike "fo", true) - checkEvaluation("afofo" rlike "^fo", false) - checkEvaluation("Baan" rlike "^Ba?n", false) - checkEvaluation("axe" rlike "pi|apa", false) - checkEvaluation("pip" rlike "^(pi)*$", false) + checkLiteralRow("fofo" rlike _, "^fo", true) + checkLiteralRow("fo\no" rlike _, "^fo\no$", true) + checkLiteralRow("Bn" rlike _, "^Ba*n", true) + checkLiteralRow("afofo" rlike _, "fo", true) + checkLiteralRow("afofo" rlike _, "^fo", false) + checkLiteralRow("Baan" rlike _, "^Ba?n", false) + checkLiteralRow("axe" rlike _, "pi|apa", false) + checkLiteralRow("pip" rlike _, "^(pi)*$", false) - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) + checkLiteralRow("abc" rlike _, "^ab", true) + checkLiteralRow("abc" rlike _, "^bc", false) + checkLiteralRow("abc" rlike _, "^ab", true) + checkLiteralRow("abc" rlike _, "^bc", false) intercept[java.util.regex.PatternSyntaxException] { evaluate("abbbbc" rlike "**") } - } - - test("RLIKE Non-literal Regular Expression") { - val regEx = 'a.string.at(0) - checkEvaluation("abdef" rlike regEx, true, create_row("abdef")) - checkEvaluation("abbbbc" rlike regEx, true, create_row("a.*c")) - checkEvaluation("fofo" rlike regEx, true, create_row("^fo")) - checkEvaluation("fo\no" rlike regEx, true, create_row("^fo\no$")) - checkEvaluation("Bn" rlike regEx, true, create_row("^Ba*n")) - intercept[java.util.regex.PatternSyntaxException] { - evaluate("abbbbc" rlike regEx, create_row("**")) + val regex = 'a.string.at(0) + evaluate("abbbbc" rlike regex, create_row("**")) } } - test("RegexReplace") { val row1 = create_row("100-200", "(\\d+)", "num") val row2 = create_row("100-200", "(\\d+)", "###")