From d91cbe9d403d8edb26f4efd600d92601e238e9db Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Sun, 8 May 2016 11:01:49 +0800 Subject: [PATCH 1/4] use datafile --- docs/ml-clustering.md | 5 ++ .../spark/examples/ml/JavaKMeansExample.java | 58 +++++-------------- examples/src/main/python/ml/kmeans_example.py | 46 ++++++--------- .../spark/examples/ml/KMeansExample.scala | 28 ++++----- 4 files changed, 46 insertions(+), 91 deletions(-) diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index 440c455cd077c..e23cc956ec231 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -79,6 +79,11 @@ Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/KMeans.html {% include_example java/org/apache/spark/examples/ml/JavaKMeansExample.java %} +
+Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.KMeans) for more details. + +{% include_example python/ml/kmeans_example.py %} +
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index 65e29ade299d1..a1a6be32e8d65 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -17,77 +17,45 @@ package org.apache.spark.examples.ml; -import java.util.regex.Pattern; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.expressions.GenericRow; // $example on$ import org.apache.spark.ml.clustering.KMeansModel; import org.apache.spark.ml.clustering.KMeans; import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.VectorUDT; -import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; // $example off$ +import org.apache.spark.sql.SparkSession; /** * An example demonstrating a k-means clustering. * Run with *
- * bin/run-example ml.JavaKMeansExample  
+ * bin/run-example ml.JavaKMeansExample
  * 
*/ public class JavaKMeansExample { - private static class ParsePoint implements Function { - private static final Pattern separator = Pattern.compile(" "); - - @Override - public Row call(String line) { - String[] tok = separator.split(line); - double[] point = new double[tok.length]; - for (int i = 0; i < tok.length; ++i) { - point[i] = Double.parseDouble(tok[i]); - } - Vector[] points = {Vectors.dense(point)}; - return new GenericRow(points); - } - } - public static void main(String[] args) { - if (args.length != 2) { - System.err.println("Usage: ml.JavaKMeansExample "); - System.exit(1); - } - String inputFile = args[0]; - int k = Integer.parseInt(args[1]); - - // Parses the arguments + // Create a SparkSession. SparkSession spark = SparkSession .builder() .appName("JavaKMeansExample") .getOrCreate(); // $example on$ - // Loads data - JavaRDD points = spark.read().text(inputFile).javaRDD().map(new ParsePoint()); - StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())}; - StructType schema = new StructType(fields); - Dataset dataset = spark.createDataFrame(points, schema); + // Loads data. + Dataset dataset = spark.read().format("libsvm").load("data/mllib/sample_kmeans_data.txt"); - // Trains a k-means model - KMeans kmeans = new KMeans() - .setK(k); + // Trains a k-means model. + KMeans kmeans = new KMeans().setK(2).setSeed(1L); KMeansModel model = kmeans.fit(dataset); - // Shows the result + // Evaluate clustering by computing Within Set Sum of Squared Errors. + double WSSSE = model.computeCost(dataset); + System.out.println("Within Set Sum of Squared Errors = " + WSSSE); + + // Shows the result. 
Vector[] centers = model.clusterCenters(); System.out.println("Cluster Centers: "); for (Vector center: centers) { diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index 73823969554fa..672c16300e5a9 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -17,55 +17,43 @@ from __future__ import print_function -import sys - -import numpy as np +# $example on$ from pyspark.ml.clustering import KMeans, KMeansModel -from pyspark.mllib.linalg import VectorUDT, _convert_to_vector +# $example off$ + from pyspark.sql import SparkSession -from pyspark.sql.types import Row, StructField, StructType """ A simple example demonstrating a k-means clustering. Run with: - bin/spark-submit examples/src/main/python/ml/kmeans_example.py - -This example requires NumPy (http://www.numpy.org/). + bin/spark-submit examples/src/main/python/ml/kmeans_example.py """ -def parseVector(row): - array = np.array([float(x) for x in row.value.split(' ')]) - return _convert_to_vector(array) - - if __name__ == "__main__": - FEATURES_COL = "features" - - if len(sys.argv) != 3: - print("Usage: kmeans_example.py ", file=sys.stderr) - exit(-1) - path = sys.argv[1] - k = sys.argv[2] - spark = SparkSession\ .builder\ .appName("PythonKMeansExample")\ .getOrCreate() - lines = spark.read.text(path).rdd - data = lines.map(parseVector) - row_rdd = data.map(lambda x: Row(x)) - schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)]) - df = spark.createDataFrame(row_rdd, schema) + # $example on$ + # Loads data. + dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") - kmeans = KMeans().setK(2).setSeed(1).setFeaturesCol(FEATURES_COL) - model = kmeans.fit(df) - centers = model.clusterCenters() + # Trains a k-means model. + kmeans = KMeans().setK(2).setSeed(1) + model = kmeans.fit(dataset) + # Evaluate clustering by computing Within Set Sum of Squared Errors. 
+ wssse = model.computeCost(dataset) + print("Within Set Sum of Squared Errors = " + str(wssse)) + + # Shows the result. + centers = model.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) + # $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index 2abd588c6f0e4..d2356e20a9aa2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -35,32 +35,26 @@ import org.apache.spark.sql.{DataFrame, SparkSession} object KMeansExample { def main(args: Array[String]): Unit = { - // Creates a Spark context and a SQL context + // Creates a SparkSession. val spark = SparkSession .builder .appName(s"${this.getClass.getSimpleName}") .getOrCreate() // $example on$ - // Crates a DataFrame - val dataset: DataFrame = spark.createDataFrame(Seq( - (1, Vectors.dense(0.0, 0.0, 0.0)), - (2, Vectors.dense(0.1, 0.1, 0.1)), - (3, Vectors.dense(0.2, 0.2, 0.2)), - (4, Vectors.dense(9.0, 9.0, 9.0)), - (5, Vectors.dense(9.1, 9.1, 9.1)), - (6, Vectors.dense(9.2, 9.2, 9.2)) - )).toDF("id", "features") + // Loads data. + val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") - // Trains a k-means model - val kmeans = new KMeans() - .setK(2) - .setFeaturesCol("features") - .setPredictionCol("prediction") + // Trains a k-means model. + val kmeans = new KMeans().setK(2).setSeed(1L) val model = kmeans.fit(dataset) - // Shows the result - println("Final Centers: ") + // Evaluate clustering by computing Within Set Sum of Squared Errors. + val WSSSE = model.computeCost(dataset) + println(s"Within Set Sum of Squared Errors = ${WSSSE}") + + // Shows the result. 
+ println("Cluster Centers: ") model.clusterCenters.foreach(println) // $example off$ From d5f02c68731c3eca1f5b722ef85ae62f317e42c0 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 9 May 2016 21:16:54 +0800 Subject: [PATCH 2/4] fix one nit --- .../scala/org/apache/spark/examples/ml/KMeansExample.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index d2356e20a9aa2..9adc6ab109945 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -21,9 +21,8 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.clustering.KMeans -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.sql.{DataFrame, SparkSession} // $example off$ +import org.apache.spark.sql.SparkSession /** * An example demonstrating a k-means clustering. @@ -51,7 +50,7 @@ object KMeansExample { // Evaluate clustering by computing Within Set Sum of Squared Errors. val WSSSE = model.computeCost(dataset) - println(s"Within Set Sum of Squared Errors = ${WSSSE}") + println(s"Within Set Sum of Squared Errors = $WSSSE") // Shows the result. 
println("Cluster Centers: ") From 5020773efb0e7b0e103ae298ea9c6434c6989ac2 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 11 May 2016 09:43:41 +0800 Subject: [PATCH 3/4] update comments --- .../org/apache/spark/examples/ml/JavaKMeansExample.java | 2 +- examples/src/main/python/ml/kmeans_example.py | 6 ++++-- .../scala/org/apache/spark/examples/ml/KMeansExample.scala | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index a1a6be32e8d65..2489a9b80b074 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -28,7 +28,7 @@ /** - * An example demonstrating a k-means clustering. + * An example demonstrating k-means clustering. * Run with *
  * bin/run-example ml.JavaKMeansExample
diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py
index 672c16300e5a9..c432f58f3a6e3 100644
--- a/examples/src/main/python/ml/kmeans_example.py
+++ b/examples/src/main/python/ml/kmeans_example.py
@@ -24,9 +24,11 @@
 from pyspark.sql import SparkSession
 
 """
-A simple example demonstrating a k-means clustering.
+An example demonstrating k-means clustering.
 Run with:
-  bin/spark-submit examples/src/main/python/ml/kmeans_example.py
+  bin/spark-submit examples/src/main/python/ml/kmeans_example.py  
+
+This example requires NumPy (http://www.numpy.org/).
 """
 
 
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
index 9adc6ab109945..2341b36db2400 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
@@ -25,7 +25,7 @@ import org.apache.spark.ml.clustering.KMeans
 import org.apache.spark.sql.SparkSession
 
 /**
- * An example demonstrating a k-means clustering.
+ * An example demonstrating k-means clustering.
  * Run with
  * {{{
  * bin/run-example ml.KMeansExample

From f2ff8d6e8f877fa342cb67482b4124272bdaf76c Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng 
Date: Wed, 11 May 2016 09:53:36 +0800
Subject: [PATCH 4/4] update py run command; drop unused KMeansModel import

---
 examples/src/main/python/ml/kmeans_example.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py
index c432f58f3a6e3..4b8b7291f9188 100644
--- a/examples/src/main/python/ml/kmeans_example.py
+++ b/examples/src/main/python/ml/kmeans_example.py
@@ -18,7 +18,7 @@
 from __future__ import print_function
 
 # $example on$
-from pyspark.ml.clustering import KMeans, KMeansModel
+from pyspark.ml.clustering import KMeans
 # $example off$
 
 from pyspark.sql import SparkSession
@@ -26,7 +26,7 @@
 """
 An example demonstrating k-means clustering.
 Run with:
-  bin/spark-submit examples/src/main/python/ml/kmeans_example.py  
+  bin/spark-submit examples/src/main/python/ml/kmeans_example.py
 
 This example requires NumPy (http://www.numpy.org/).
 """