From fe93873d83f2cc2575d36e0236f979bd26f0ff4a Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Thu, 18 Jun 2015 14:21:27 -0700 Subject: [PATCH 1/3] Implement n-gram feature transformer --- .../org/apache/spark/ml/feature/NGram.scala | 67 ++++++++++++++++++ .../apache/spark/ml/feature/NGramSuite.scala | 68 +++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala new file mode 100644 index 0000000000000..caa8f7bb55094 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml.UnaryTransformer +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.types.{ArrayType, DataType, StringType} + +/** + * :: Experimental :: + * A feature transformer that converts the input array of strings into an array of n-grams. Null + * values in the input array are ignored. + * It returns an array of n-grams where each n-gram is represented by a space-separated string of + * words. + */ +@Experimental +class NGram(override val uid: String) + extends UnaryTransformer[Seq[String], Seq[String], NGram] { + + def this() = this(Identifiable.randomUID("ngram")) + + /** + * Minimum n-gram length, >= 1. + * Defauult: 2, bigram features + * @group param + */ + val NGramLength: IntParam = new IntParam(this, "NGramLength", "number elements per n-gram (>=1)", + ParamValidators.gtEq(1)) + + /** @group setParam */ + def setNGramLength(value: Int): this.type = set(NGramLength, value) + + /** @group getParam */ + def getNGramLength: Int = $(NGramLength) + + setDefault(NGramLength -> 2) + + override protected def createTransformFunc: Seq[String] => Seq[String] = { + val minLength = $(NGramLength) + _.sliding(minLength).map(_.mkString(" ")).toSeq + } + + override protected def validateInputType(inputType: DataType): Unit = { + require( + inputType.sameType(ArrayType(StringType)), + s"Input type must be ArrayType(StringType) but got $inputType.") + } + + override protected def outputDataType: DataType = new ArrayType(StringType, false) +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala new file mode 100644 index 0000000000000..a90d967fb48fa --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import scala.beans.BeanInfo + +import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{DataFrame, Row} + +@BeanInfo +case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String]) + +class NGramSuite extends SparkFunSuite with MLlibTestSparkContext { + import org.apache.spark.ml.feature.NGramSuite._ + + test("default behavior yields bigram features") { + val tokenizer = new NGram() + .setInputCol("inputTokens") + .setOutputCol("NGrams") + val dataset = sqlContext.createDataFrame(Seq( + NGramTestData( + Array("Test", "for", "ngram", "."), + Array("Test for", "for ngram", "ngram .") + ))) + testNGram(tokenizer, dataset) + } + + test("NGramLength=4 yields length 4 n-grams") { + val tokenizer = new NGram() + .setInputCol("inputTokens") + .setOutputCol("NGrams") + .setNGramLength(4) + val dataset = sqlContext.createDataFrame(Seq( + NGramTestData( + Array("a", "b", "c", "d", "e"), + Array("a b c d", "b c d e") + ))) + testNGram(tokenizer, dataset) + } +} + +object NGramSuite extends SparkFunSuite { + + def testNGram(t: NGram, dataset: DataFrame): Unit = { + t.transform(dataset) + .select("NGrams", "wantedNGrams") + .collect() + .foreach { case Row(actualNGrams, wantedNGrams) => + assert(actualNGrams === wantedNGrams) + } + } +} From 9fadd36417388ba25af86ea5d66be92b1f83338a Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Thu, 18 Jun 2015 19:52:02 -0700 Subject: [PATCH 2/3] Add empty and corner test cases, fix names and spaces --- .../org/apache/spark/ml/feature/NGram.scala | 19 +++++---- .../apache/spark/ml/feature/NGramSuite.scala | 39 +++++++++++++++---- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala index caa8f7bb55094..68d919da579fb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala @@ -29,6 +29,10 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} * values in the input array are ignored. * It returns an array of n-grams where each n-gram is represented by a space-separated string of * words. + * + * When the input is empty, an empty array is returned. + * When the input array length is less than n (number of elements per n-gram), a single n-gram + * consisting of the input array is returned. */ @Experimental class NGram(override val uid: String) @@ -38,28 +42,27 @@ class NGram(override val uid: String) /** * Minimum n-gram length, >= 1. - * Defauult: 2, bigram features + * Default: 2, bigram features * @group param */ - val NGramLength: IntParam = new IntParam(this, "NGramLength", "number elements per n-gram (>=1)", + val n: IntParam = new IntParam(this, "n", "number elements per n-gram (>=1)", ParamValidators.gtEq(1)) /** @group setParam */ - def setNGramLength(value: Int): this.type = set(NGramLength, value) + def setN(value: Int): this.type = set(n, value) /** @group getParam */ - def getNGramLength: Int = $(NGramLength) + def getN: Int = $(n) - setDefault(NGramLength -> 2) + setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { - val minLength = $(NGramLength) + val minLength = $(n) _.sliding(minLength).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { - require( - inputType.sameType(ArrayType(StringType)), + require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala index a90d967fb48fa..034056905699f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala @@ -30,7 +30,7 @@ class NGramSuite extends SparkFunSuite with MLlibTestSparkContext { import org.apache.spark.ml.feature.NGramSuite._ test("default behavior yields bigram features") { - val tokenizer = new NGram() + val NGramTransformer = new NGram() .setInputCol("inputTokens") .setOutputCol("NGrams") val dataset = sqlContext.createDataFrame(Seq( @@ -38,20 +38,45 @@ class NGramSuite extends SparkFunSuite with MLlibTestSparkContext { Array("Test", "for", "ngram", "."), Array("Test for", "for ngram", "ngram .") ))) - testNGram(tokenizer, dataset) + testNGram(NGramTransformer, dataset) } test("NGramLength=4 yields length 4 n-grams") { - val tokenizer = new NGram() + val NGramTransformer = new NGram() .setInputCol("inputTokens") .setOutputCol("NGrams") - .setNGramLength(4) + .setN(4) val dataset = sqlContext.createDataFrame(Seq( NGramTestData( Array("a", "b", "c", "d", "e"), Array("a b c d", "b c d e") ))) - testNGram(tokenizer, dataset) + testNGram(NGramTransformer, dataset) + } + + test("empty input yields empty output") { + val NGramTransformer = new NGram() + .setInputCol("inputTokens") + .setOutputCol("NGrams") + .setN(4) + val dataset = sqlContext.createDataFrame(Seq( + NGramTestData( + Array(), + Array() + ))) + testNGram(NGramTransformer, dataset) + } + test("input array < n yields a single n-gram consisting of input array") { + val NGramTransformer = new NGram() + .setInputCol("inputTokens") + .setOutputCol("NGrams") + .setN(6) + val dataset = sqlContext.createDataFrame(Seq( + NGramTestData( + Array("a", "b", "c", "d", "e"), + Array("a b c d e") + ))) + testNGram(NGramTransformer, dataset) } } @@ -62,7 +87,7 @@ object NGramSuite extends SparkFunSuite { .select("NGrams", "wantedNGrams") .collect() .foreach { case Row(actualNGrams, wantedNGrams) => - assert(actualNGrams === wantedNGrams) - } + assert(actualNGrams === wantedNGrams) + } } } From d2c839faf9bdc24fb51f060a5567140fce032bbc Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Fri, 19 Jun 2015 10:15:48 -0700 Subject: [PATCH 3/3] Make n > input length yield empty output --- .../org/apache/spark/ml/feature/NGram.scala | 7 ++--- .../apache/spark/ml/feature/NGramSuite.scala | 31 ++++++++++--------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala index 68d919da579fb..8de10eb51f923 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/NGram.scala @@ -31,8 +31,8 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} * words. * * When the input is empty, an empty array is returned. - * When the input array length is less than n (number of elements per n-gram), a single n-gram - * consisting of the input array is returned. + * When the input array length is less than n (number of elements per n-gram), no n-grams are + * returned. */ @Experimental class NGram(override val uid: String) @@ -57,8 +57,7 @@ class NGram(override val uid: String) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { - val minLength = $(n) - _.sliding(minLength).map(_.mkString(" ")).toSeq + _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala index 034056905699f..ab97e3dbc6ee0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala @@ -30,53 +30,54 @@ class NGramSuite extends SparkFunSuite with MLlibTestSparkContext { import org.apache.spark.ml.feature.NGramSuite._ test("default behavior yields bigram features") { - val NGramTransformer = new NGram() + val nGram = new NGram() .setInputCol("inputTokens") - .setOutputCol("NGrams") + .setOutputCol("nGrams") val dataset = sqlContext.createDataFrame(Seq( NGramTestData( Array("Test", "for", "ngram", "."), Array("Test for", "for ngram", "ngram .") ))) - testNGram(NGramTransformer, dataset) + testNGram(nGram, dataset) } test("NGramLength=4 yields length 4 n-grams") { - val NGramTransformer = new NGram() + val nGram = new NGram() .setInputCol("inputTokens") - .setOutputCol("NGrams") + .setOutputCol("nGrams") .setN(4) val dataset = sqlContext.createDataFrame(Seq( NGramTestData( Array("a", "b", "c", "d", "e"), Array("a b c d", "b c d e") ))) - testNGram(NGramTransformer, dataset) + testNGram(nGram, dataset) } test("empty input yields empty output") { - val NGramTransformer = new NGram() + val nGram = new NGram() .setInputCol("inputTokens") - .setOutputCol("NGrams") + .setOutputCol("nGrams") .setN(4) val dataset = sqlContext.createDataFrame(Seq( NGramTestData( Array(), Array() ))) - testNGram(NGramTransformer, dataset) + testNGram(nGram, dataset) } - test("input array < n yields a single n-gram consisting of input array") { - val NGramTransformer = new NGram() + + test("input array < n yields empty output") { + val nGram = new NGram() .setInputCol("inputTokens") - .setOutputCol("NGrams") + .setOutputCol("nGrams") .setN(6) val dataset = sqlContext.createDataFrame(Seq( NGramTestData( Array("a", "b", "c", "d", "e"), - Array("a b c d e") + Array() ))) - testNGram(NGramTransformer, dataset) + testNGram(nGram, dataset) } } @@ -84,7 +85,7 @@ object NGramSuite extends SparkFunSuite { def testNGram(t: NGram, dataset: DataFrame): Unit = { t.transform(dataset) - .select("NGrams", "wantedNGrams") + .select("nGrams", "wantedNGrams") .collect() .foreach { case Row(actualNGrams, wantedNGrams) => assert(actualNGrams === wantedNGrams)