From 9714fdec3d671d46cd58109ccc282e8e609a9af2 Mon Sep 17 00:00:00 2001
From: Burak Yavuz <brkyvz@gmail.com>
Date: Sat, 30 May 2015 22:16:17 -0700
Subject: [PATCH 1/5] [SPARK-7710] add doc examples for DataFrameStatFunctions

---
 .../spark/sql/DataFrameStatFunctions.scala    | 50 ++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 5d106c1ac2674..c6f6ef5bcdcac 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -35,6 +35,12 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @param col2 the name of the second column
    * @return the covariance of the two columns.
    *
+   * {{{
+   *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
+   *      (3, 3))).toDF("key", "value")
+   *    df.stat.cov("key", "value")
+   * }}}
+   *
    * @since 1.4.0
    */
   def cov(col1: String, col2: String): Double = {
@@ -43,13 +49,19 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
 
   /**
    * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson
-   * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in 
+   * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in
    * MLlib's Statistics.
    *
    * @param col1 the name of the column
    * @param col2 the name of the column to calculate the correlation against
    * @return The Pearson Correlation Coefficient as a Double.
    *
+   * {{{
+   *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
+   *      (3, 3))).toDF("key", "value")
+   *    df.stat.corr("key", "value", "pearson")
+   * }}}
+   *
    * @since 1.4.0
    */
   def corr(col1: String, col2: String, method: String): Double = {
@@ -65,6 +77,12 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @param col2 the name of the column to calculate the correlation against
    * @return The Pearson Correlation Coefficient as a Double.
    *
+   * {{{
+   *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
+   *      (3, 3))).toDF("key", "value")
+   *    df.stat.corr("key", "value")
+   * }}}
+   *
    * @since 1.4.0
    */
   def corr(col1: String, col2: String): Double = {
@@ -85,6 +103,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    *             of the DataFrame.
    * @return A DataFrame containing for the contingency table.
    *
+   * {{{
+   *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
+   *      (3, 3))).toDF(“key”, “value”)
+   *    val ct = df.stat.crosstab("key", "value")
+   *    ct.show()
+   * }}}
+   *
    * @since 1.4.0
    */
   def crosstab(col1: String, col2: String): DataFrame = {
@@ -102,6 +127,18 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    *                than 1e-4.
    * @return A Local DataFrame with the Array of frequent items for each column.
    *
+   * {{{
+   *    // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
+   *    // "a" and "b"
+   *    val freqSingles = df.stat.freqItems(Array("a", "b"), 0.4)
+   *    freqSingles.show()
+   *    // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
+   *    val pairDf = df.select(struct("a", "b").as("a-b"))
+   *    val freqPairs = pairDf.stat.freqItems(Array("a-b"), 0.1)
+   *    freqPairs.show()
+   * }}}
+   *
+   *
    * @since 1.4.0
    */
   def freqItems(cols: Array[String], support: Double): DataFrame = {
@@ -131,6 +168,17 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    *
+   * {{{
+   *    // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
+   *    // "a" and "b"
+   *    val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4)
+   *    freqSingles.show()
+   *    // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
+   *    val pairDf = df.select(struct("a", "b").as("a-b"))
+   *    val freqPairs = pairDf.stat.freqItems(Seq("a-b"), 0.1)
+   *    freqPairs.show()
+   * }}}
+   *
    * @since 1.4.0
    */
   def freqItems(cols: Seq[String], support: Double): DataFrame = {

From d5d0ea8b4b55f8c05e630464a8c57092479d7f8b Mon Sep 17 00:00:00 2001
From: Burak Yavuz <brkyvz@gmail.com>
Date: Mon, 1 Jun 2015 23:59:33 -0700
Subject: [PATCH 2/5] wrap closing } of corr in scalastyle:off/on to silence a style complaint

---
 .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 673bf94a0a552..6a2de931d851a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -87,7 +87,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    */
   def corr(col1: String, col2: String): Double = {
     corr(col1, col2, "pearson")
+// scalastyle:off
   }
+// scalastyle:on
 
   /**
    * Computes a pair-wise frequency table of the given columns. Also known as a contingency table.

From 36667068dda6919b300f31dafde07a3598ef7f81 Mon Sep 17 00:00:00 2001
From: Burak Yavuz <brkyvz@gmail.com>
Date: Thu, 11 Jun 2015 14:58:19 -0700
Subject: [PATCH 3/5] changed examples to blog post examples

---
 .../spark/sql/DataFrameStatFunctions.scala    | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 6a2de931d851a..1109ef2cb7fde 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -36,9 +36,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @return the covariance of the two columns.
    *
    * {{{
-   *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
-   *      (3, 3))).toDF("key", "value")
-   *    df.stat.cov("key", "value")
+   *    import org.apache.spark.sql.functions._
+   *    val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10))
+   *      .withColumn("rand2", rand(seed=27))
+   *    df.stat.cov("rand1", "rand2")
    * }}}
    *
    * @since 1.4.0
@@ -57,9 +58,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @return The Pearson Correlation Coefficient as a Double.
    *
    * {{{
-   *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
-   *      (3, 3))).toDF("key", "value")
-   *    df.stat.corr("key", "value", "pearson")
+   *    import org.apache.spark.sql.functions._
+   *    val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10))
+   *      .withColumn("rand2", rand(seed=27))
+   *    df.stat.corr("rand1", "rand2", "pearson")
    * }}}
    *
    * @since 1.4.0
@@ -78,9 +80,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @return The Pearson Correlation Coefficient as a Double.
    *
    * {{{
-   *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
-   *      (3, 3))).toDF("key", "value")
-   *    df.stat.corr("key", "value")
+   *    import org.apache.spark.sql.functions._
+   *    val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10))
+   *      .withColumn("rand2", rand(seed=27))
+   *    df.stat.corr("rand1", "rand2")
    * }}}
    *
    * @since 1.4.0

From e4df3b7af2ed146467bed0911ff7da82079ac2e8 Mon Sep 17 00:00:00 2001
From: Burak Yavuz <brkyvz@gmail.com>
Date: Tue, 23 Jun 2015 18:00:52 -0700
Subject: [PATCH 4/5] fixed unicode quotes in crosstab example, fixed example DataFrame creation, removed scalastyle:off/on

---
 .../org/apache/spark/sql/DataFrameStatFunctions.scala  | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 1109ef2cb7fde..832ab66a62a66 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -37,7 +37,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    *
    * {{{
    *    import org.apache.spark.sql.functions._
-   *    val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10))
+   *    val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
    *      .withColumn("rand2", rand(seed=27))
    *    df.stat.cov("rand1", "rand2")
    * }}}
@@ -59,7 +59,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    *
    * {{{
    *    import org.apache.spark.sql.functions._
-   *    val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10))
+   *    val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
    *      .withColumn("rand2", rand(seed=27))
    *    df.stat.corr("rand1", "rand2", "pearson")
    * }}}
@@ -81,7 +81,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    *
    * {{{
    *    import org.apache.spark.sql.functions._
-   *    val df = sqlContext.createDataFrame(0 until 10).withColumn("rand1", rand(seed=10))
+   *    val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
    *      .withColumn("rand2", rand(seed=27))
    *    df.stat.corr("rand1", "rand2")
    * }}}
@@ -90,9 +90,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    */
   def corr(col1: String, col2: String): Double = {
     corr(col1, col2, "pearson")
-// scalastyle:off
   }
-// scalastyle:on
 
   /**
    * Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
@@ -110,7 +108,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    *
    * {{{
    *    val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
-   *      (3, 3))).toDF(“key”, “value”)
+   *      (3, 3))).toDF("key", "value")
    *    val ct = df.stat.crosstab("key", "value")
    *    ct.show()
    * }}}

From 4219e66bf14ac06d81d7431cc497c492ab83b065 Mon Sep 17 00:00:00 2001
From: Burak Yavuz <brkyvz@gmail.com>
Date: Tue, 23 Jun 2015 18:13:53 -0700
Subject: [PATCH 5/5] add example df to freqItems

---
 .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala   | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 832ab66a62a66..6d397e2601797 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -181,6 +181,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @return A Local DataFrame with the Array of frequent items for each column.
    *
    * {{{
+   *    val rows = Seq.tabulate(100) { i =>
+   *      if (i % 2 == 0) (1, -1.0) else (i, i * -1.0)
+   *    }
+   *    val df = sqlContext.createDataFrame(rows).toDF("a", "b")
    *    // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
    *    // "a" and "b"
    *    val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4)