From fe93873d83f2cc2575d36e0236f979bd26f0ff4a Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Thu, 18 Jun 2015 14:21:27 -0700
Subject: [PATCH 1/3] Implement n-gram feature transformer

---
 .../org/apache/spark/ml/feature/NGram.scala   | 67 ++++++++++++++++++
 .../apache/spark/ml/feature/NGramSuite.scala  | 68 +++++++++++++++++++
 2 files changed, 135 insertions(+)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
new file mode 100644
index 0000000000000..caa8f7bb55094
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.UnaryTransformer
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
+
+/**
+ * :: Experimental ::
+ * A feature transformer that converts the input array of strings into an array of n-grams. Null
+ * values in the input array are ignored.
+ * It returns an array of n-grams where each n-gram is represented by a space-separated string of
+ * words.
+ */
+@Experimental
+class NGram(override val uid: String)
+  extends UnaryTransformer[Seq[String], Seq[String], NGram] {
+
+  def this() = this(Identifiable.randomUID("ngram"))
+
+  /**
+   * Minimum n-gram length, >= 1.
+   * Defauult: 2, bigram features
+   * @group param
+   */
+  val NGramLength: IntParam = new IntParam(this, "NGramLength", "number elements per n-gram (>=1)",
+    ParamValidators.gtEq(1))
+
+  /** @group setParam */
+  def setNGramLength(value: Int): this.type = set(NGramLength, value)
+
+  /** @group getParam */
+  def getNGramLength: Int = $(NGramLength)
+
+  setDefault(NGramLength -> 2)
+
+  override protected def createTransformFunc: Seq[String] => Seq[String] = {
+    val minLength = $(NGramLength)
+    _.sliding(minLength).map(_.mkString(" ")).toSeq
+  }
+
+  override protected def validateInputType(inputType: DataType): Unit = {
+    require(
+      inputType.sameType(ArrayType(StringType)),
+      s"Input type must be ArrayType(StringType) but got $inputType.")
+  }
+
+  override protected def outputDataType: DataType = ArrayType(StringType, containsNull = false)
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala
new file mode 100644
index 0000000000000..a90d967fb48fa
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import scala.beans.BeanInfo
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.sql.{DataFrame, Row}
+
+@BeanInfo
+case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])
+
+class NGramSuite extends SparkFunSuite with MLlibTestSparkContext {
+  import org.apache.spark.ml.feature.NGramSuite._
+
+  test("default behavior yields bigram features") {
+    val tokenizer = new NGram()
+      .setInputCol("inputTokens")
+      .setOutputCol("NGrams")
+    val dataset = sqlContext.createDataFrame(Seq(
+      NGramTestData(
+        Array("Test", "for", "ngram", "."),
+        Array("Test for", "for ngram", "ngram .")
+    )))
+    testNGram(tokenizer, dataset)
+  }
+
+  test("NGramLength=4 yields length 4 n-grams") {
+    val tokenizer = new NGram()
+      .setInputCol("inputTokens")
+      .setOutputCol("NGrams")
+      .setNGramLength(4)
+    val dataset = sqlContext.createDataFrame(Seq(
+      NGramTestData(
+        Array("a", "b", "c", "d", "e"),
+        Array("a b c d", "b c d e")
+      )))
+    testNGram(tokenizer, dataset)
+  }
+}
+
+object NGramSuite extends SparkFunSuite {
+
+  def testNGram(t: NGram, dataset: DataFrame): Unit = {
+    t.transform(dataset)
+      .select("NGrams", "wantedNGrams")
+      .collect()
+      .foreach { case Row(actualNGrams, wantedNGrams) =>
+      assert(actualNGrams === wantedNGrams)
+    }
+  }
+}

From 9fadd36417388ba25af86ea5d66be92b1f83338a Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Thu, 18 Jun 2015 19:52:02 -0700
Subject: [PATCH 2/3] Add empty and corner test cases, fix names and spaces

---
 .../org/apache/spark/ml/feature/NGram.scala   | 19 +++++----
 .../apache/spark/ml/feature/NGramSuite.scala  | 39 +++++++++++++++----
 2 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
index caa8f7bb55094..68d919da579fb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
@@ -29,6 +29,10 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
  * values in the input array are ignored.
  * It returns an array of n-grams where each n-gram is represented by a space-separated string of
  * words.
+ *
+ * When the input is empty, an empty array is returned.
+ * When the input array length is less than n (number of elements per n-gram), a single n-gram
+ * consisting of the input array is returned.
  */
 @Experimental
 class NGram(override val uid: String)
@@ -38,28 +42,27 @@ class NGram(override val uid: String)
 
   /**
    * Minimum n-gram length, >= 1.
-   * Defauult: 2, bigram features
+   * Default: 2, bigram features
    * @group param
    */
-  val NGramLength: IntParam = new IntParam(this, "NGramLength", "number elements per n-gram (>=1)",
+  val n: IntParam = new IntParam(this, "n", "number elements per n-gram (>=1)",
     ParamValidators.gtEq(1))
 
   /** @group setParam */
-  def setNGramLength(value: Int): this.type = set(NGramLength, value)
+  def setN(value: Int): this.type = set(n, value)
 
   /** @group getParam */
-  def getNGramLength: Int = $(NGramLength)
+  def getN: Int = $(n)
 
-  setDefault(NGramLength -> 2)
+  setDefault(n -> 2)
 
   override protected def createTransformFunc: Seq[String] => Seq[String] = {
-    val minLength = $(NGramLength)
+    val minLength = $(n)
     _.sliding(minLength).map(_.mkString(" ")).toSeq
   }
 
   override protected def validateInputType(inputType: DataType): Unit = {
-    require(
-      inputType.sameType(ArrayType(StringType)),
+    require(inputType.sameType(ArrayType(StringType)),
       s"Input type must be ArrayType(StringType) but got $inputType.")
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala
index a90d967fb48fa..034056905699f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala
@@ -30,7 +30,7 @@ class NGramSuite extends SparkFunSuite with MLlibTestSparkContext {
   import org.apache.spark.ml.feature.NGramSuite._
 
   test("default behavior yields bigram features") {
-    val tokenizer = new NGram()
+    val NGramTransformer = new NGram()
       .setInputCol("inputTokens")
       .setOutputCol("NGrams")
     val dataset = sqlContext.createDataFrame(Seq(
@@ -38,20 +38,45 @@ class NGramSuite extends SparkFunSuite with MLlibTestSparkContext {
         Array("Test", "for", "ngram", "."),
         Array("Test for", "for ngram", "ngram .")
     )))
-    testNGram(tokenizer, dataset)
+    testNGram(NGramTransformer, dataset)
   }
 
   test("NGramLength=4 yields length 4 n-grams") {
-    val tokenizer = new NGram()
+    val NGramTransformer = new NGram()
       .setInputCol("inputTokens")
       .setOutputCol("NGrams")
-      .setNGramLength(4)
+      .setN(4)
     val dataset = sqlContext.createDataFrame(Seq(
       NGramTestData(
         Array("a", "b", "c", "d", "e"),
         Array("a b c d", "b c d e")
       )))
-    testNGram(tokenizer, dataset)
+    testNGram(NGramTransformer, dataset)
+  }
+
+  test("empty input yields empty output") {
+    val NGramTransformer = new NGram()
+      .setInputCol("inputTokens")
+      .setOutputCol("NGrams")
+      .setN(4)
+    val dataset = sqlContext.createDataFrame(Seq(
+      NGramTestData(
+        Array(),
+        Array()
+      )))
+    testNGram(NGramTransformer, dataset)
+  }
+  test("input array < n yields a single n-gram consisting of input array") {
+    val NGramTransformer = new NGram()
+      .setInputCol("inputTokens")
+      .setOutputCol("NGrams")
+      .setN(6)
+    val dataset = sqlContext.createDataFrame(Seq(
+      NGramTestData(
+        Array("a", "b", "c", "d", "e"),
+        Array("a b c d e")
+      )))
+    testNGram(NGramTransformer, dataset)
   }
 }
 
@@ -62,7 +87,7 @@ object NGramSuite extends SparkFunSuite {
       .select("NGrams", "wantedNGrams")
       .collect()
       .foreach { case Row(actualNGrams, wantedNGrams) =>
-      assert(actualNGrams === wantedNGrams)
-    }
+        assert(actualNGrams === wantedNGrams)
+      }
   }
 }

From d2c839faf9bdc24fb51f060a5567140fce032bbc Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Fri, 19 Jun 2015 10:15:48 -0700
Subject: [PATCH 3/3] Make n > input length yield empty output

---
 .../org/apache/spark/ml/feature/NGram.scala   |  7 ++---
 .../apache/spark/ml/feature/NGramSuite.scala  | 31 ++++++++++---------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
index 68d919da579fb..8de10eb51f923 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala
@@ -31,8 +31,8 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
  * words.
  *
  * When the input is empty, an empty array is returned.
- * When the input array length is less than n (number of elements per n-gram), a single n-gram
- * consisting of the input array is returned.
+ * When the input array length is less than n (number of elements per n-gram), no n-grams are
+ * returned.
  */
 @Experimental
 class NGram(override val uid: String)
@@ -57,8 +57,7 @@ class NGram(override val uid: String)
   setDefault(n -> 2)
 
   override protected def createTransformFunc: Seq[String] => Seq[String] = {
-    val minLength = $(n)
-    _.sliding(minLength).map(_.mkString(" ")).toSeq
+    tokens => tokens.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq
   }
 
   override protected def validateInputType(inputType: DataType): Unit = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala
index 034056905699f..ab97e3dbc6ee0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala
@@ -30,53 +30,54 @@ class NGramSuite extends SparkFunSuite with MLlibTestSparkContext {
   import org.apache.spark.ml.feature.NGramSuite._
 
   test("default behavior yields bigram features") {
-    val NGramTransformer = new NGram()
+    val nGram = new NGram()
       .setInputCol("inputTokens")
-      .setOutputCol("NGrams")
+      .setOutputCol("nGrams")
     val dataset = sqlContext.createDataFrame(Seq(
       NGramTestData(
         Array("Test", "for", "ngram", "."),
         Array("Test for", "for ngram", "ngram .")
     )))
-    testNGram(NGramTransformer, dataset)
+    testNGram(nGram, dataset)
   }
 
   test("NGramLength=4 yields length 4 n-grams") {
-    val NGramTransformer = new NGram()
+    val nGram = new NGram()
       .setInputCol("inputTokens")
-      .setOutputCol("NGrams")
+      .setOutputCol("nGrams")
       .setN(4)
     val dataset = sqlContext.createDataFrame(Seq(
       NGramTestData(
         Array("a", "b", "c", "d", "e"),
         Array("a b c d", "b c d e")
       )))
-    testNGram(NGramTransformer, dataset)
+    testNGram(nGram, dataset)
   }
 
   test("empty input yields empty output") {
-    val NGramTransformer = new NGram()
+    val nGram = new NGram()
       .setInputCol("inputTokens")
-      .setOutputCol("NGrams")
+      .setOutputCol("nGrams")
       .setN(4)
     val dataset = sqlContext.createDataFrame(Seq(
       NGramTestData(
         Array(),
         Array()
       )))
-    testNGram(NGramTransformer, dataset)
+    testNGram(nGram, dataset)
   }
-  test("input array < n yields a single n-gram consisting of input array") {
-    val NGramTransformer = new NGram()
+
+  test("input array < n yields empty output") {
+    val nGram = new NGram()
       .setInputCol("inputTokens")
-      .setOutputCol("NGrams")
+      .setOutputCol("nGrams")
       .setN(6)
     val dataset = sqlContext.createDataFrame(Seq(
       NGramTestData(
         Array("a", "b", "c", "d", "e"),
-        Array("a b c d e")
+        Array()
       )))
-    testNGram(NGramTransformer, dataset)
+    testNGram(nGram, dataset)
   }
 }
 
@@ -84,7 +85,7 @@ object NGramSuite extends SparkFunSuite {
 
   def testNGram(t: NGram, dataset: DataFrame): Unit = {
     t.transform(dataset)
-      .select("NGrams", "wantedNGrams")
+      .select("nGrams", "wantedNGrams")
       .collect()
       .foreach { case Row(actualNGrams, wantedNGrams) =>
         assert(actualNGrams === wantedNGrams)