From 9714fdec3d671d46cd58109ccc282e8e609a9af2 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Sat, 30 May 2015 22:16:17 -0700 Subject: [PATCH 1/5] [SPARK-7710] add doc examples for DataFrameStatFunctions --- .../spark/sql/DataFrameStatFunctions.scala | 50 ++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 5d106c1ac2674..c6f6ef5bcdcac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -35,6 +35,12 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param col2 the name of the second column * @return the covariance of the two columns. * + * {{{ + * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), + * (3, 3))).toDF("key", "value") + * df.stat.cov("key", "value") + * }}} + * * @since 1.4.0 */ def cov(col1: String, col2: String): Double = { @@ -43,13 +49,19 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson - * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in + * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in * MLlib's Statistics. * * @param col1 the name of the column * @param col2 the name of the column to calculate the correlation against * @return The Pearson Correlation Coefficient as a Double. * + * {{{ + * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), + * (3, 3))).toDF("key", "value") + * df.stat.corr("key", "value", "pearson") + * }}} + * * @since 1.4.0 */ def corr(col1: String, col2: String, method: String): Double = { @@ -65,6 +77,12 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param col2 the name of the column to calculate the correlation against * @return The Pearson Correlation Coefficient as a Double. * + * {{{ + * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), + * (3, 3))).toDF("key", "value") + * df.stat.corr("key", "value") + * }}} + * * @since 1.4.0 */ def corr(col1: String, col2: String): Double = { @@ -85,6 +103,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * of the DataFrame. * @return A DataFrame containing for the contingency table. * + * {{{ + * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), + * (3, 3))).toDF(“key”, “value”) + * val ct = df.stat.crosstab("key", "value") + * ct.show() + * }}} + * * @since 1.4.0 */ def crosstab(col1: String, col2: String): DataFrame = { @@ -102,6 +127,18 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * than 1e-4. * @return A Local DataFrame with the Array of frequent items for each column. * + * {{{ + * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns + * // "a" and "b" + * val freqSingles = df.stat.freqItems(Array("a", "b"), 0.4) + * freqSingles.show() + * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b" + * val pairDf = df.select(struct("a", "b").as("a-b")) + * val freqPairs = pairDf.stat.freqItems(Array("a-b"), 0.1) + * freqPairs.show() + * }}} + * + * * @since 1.4.0 */ def freqItems(cols: Array[String], support: Double): DataFrame = { @@ -131,6 +168,17 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @param cols the names of the columns to search frequent items in. * @return A Local DataFrame with the Array of frequent items for each column. * + * {{{ + * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns + * // "a" and "b" + * val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4) + * freqSingles.show() + * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b" + * val pairDf = df.select(struct("a", "b").as("a-b")) + * val freqPairs = pairDf.stat.freqItems(Seq("a-b"), 0.1) + * freqPairs.show() + * }}} + * * @since 1.4.0 */ def freqItems(cols: Seq[String], support: Double): DataFrame = { From d5d0ea8b4b55f8c05e630464a8c57092479d7f8b Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Mon, 1 Jun 2015 23:59:33 -0700 Subject: [PATCH 2/5] for some reason, it complains about the } --- .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 673bf94a0a552..6a2de931d851a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -87,7 +87,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { */ def corr(col1: String, col2: String): Double = { corr(col1, col2, "pearson") +// scalastyle:off } +// scalastyle:on /** * Computes a pair-wise frequency table of the given columns. Also known as a contingency table. From 36667068dda6919b300f31dafde07a3598ef7f81 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Thu, 11 Jun 2015 14:58:19 -0700 Subject: [PATCH 3/5] changed examples to blog post examples --- .../spark/sql/DataFrameStatFunctions.scala | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 6a2de931d851a..1109ef2cb7fde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -36,9 +36,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @return the covariance of the two columns. * * {{{ - * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), - * (3, 3))).toDF("key", "value") - * df.stat.cov("key", "value") + * import org.apache.spark.sql.functions._ + * val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.cov("rand1", "rand2") * }}} * * @since 1.4.0 @@ -57,9 +58,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @return The Pearson Correlation Coefficient as a Double. * * {{{ - * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), - * (3, 3))).toDF("key", "value") - * df.stat.corr("key", "value", "pearson") + * import org.apache.spark.sql.functions._ + * val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.corr("rand1", "rand2", "pearson") * }}} * * @since 1.4.0 @@ -78,9 +80,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @return The Pearson Correlation Coefficient as a Double. * * {{{ - * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), - * (3, 3))).toDF("key", "value") - * df.stat.corr("key", "value") + * import org.apache.spark.sql.functions._ + * val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.corr("rand1", "rand2") * }}} * * @since 1.4.0 From e4df3b7af2ed146467bed0911ff7da82079ac2e8 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Tue, 23 Jun 2015 18:00:52 -0700 Subject: [PATCH 4/5] fixed unicode --- .../org/apache/spark/sql/DataFrameStatFunctions.scala | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 1109ef2cb7fde..832ab66a62a66 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -37,7 +37,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * * {{{ * import org.apache.spark.sql.functions._ - * val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10)) + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) * .withColumn("rand2", rand(seed=27)) * df.stat.cov("rand1", "rand2") * }}} @@ -59,7 +59,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * * {{{ * import org.apache.spark.sql.functions._ - * val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10)) + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) * .withColumn("rand2", rand(seed=27)) * df.stat.corr("rand1", "rand2", "pearson") * }}} @@ -81,7 +81,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * * {{{ * import org.apache.spark.sql.functions._ - * val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10)) + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) * .withColumn("rand2", rand(seed=27)) * df.stat.corr("rand1", "rand2") * }}} @@ -90,9 +90,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { */ def corr(col1: String, col2: String): Double = { corr(col1, col2, "pearson") -// scalastyle:off } -// scalastyle:on /** * Computes a pair-wise frequency table of the given columns. Also known as a contingency table. @@ -110,7 +108,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * * {{{ * val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), - * (3, 3))).toDF(“key”, “value”) + * (3, 3))).toDF("key", "value") * val ct = df.stat.crosstab("key", "value") * ct.show() * }}} From 4219e66bf14ac06d81d7431cc497c492ab83b065 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Tue, 23 Jun 2015 18:13:53 -0700 Subject: [PATCH 5/5] add example df to freqItems --- .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 832ab66a62a66..6d397e2601797 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -181,6 +181,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @return A Local DataFrame with the Array of frequent items for each column. * * {{{ + * val rows = Seq.tabulate(100) { i => + * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0) + * } + * val df = sqlContext.createDataFrame(rows).toDF("a", "b") * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns * // "a" and "b" * val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4)