From 58357ccd85ff7b2107ef8d90539313fc1bafaa92 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrewor14@gmail.com>
Date: Thu, 1 May 2014 00:16:46 -0700
Subject: [PATCH 1/5] Include datanucleus jars in Spark distribution built with
 Hive support

---
 bin/compute-classpath.sh | 32 ++++++++++++++++++++------------
 make-distribution.sh     |  5 ++++-
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index b0218531e9eb8..f2824beccab73 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -29,6 +29,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"
+CLASSPATH=$(echo "$CLASSPATH" | sed s/::/:/g)
 
 ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
 
@@ -45,14 +46,14 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
 
-  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
+  DEPS_ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
   CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
   if [ -f "$FWDIR/RELEASE" ]; then
-    ASSEMBLY_JAR=`ls "$FWDIR"/lib/spark-assembly*hadoop*.jar`
+    ASSEMBLY_JAR=$(ls "$FWDIR"/lib/spark-assembly*hadoop*.jar 2>/dev/null)
   else
-    ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar`
+    ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
   fi
   CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
 fi
@@ -63,14 +64,21 @@ fi
 # built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
 # assembly is built for Hive, before actually populating the CLASSPATH with the jars.
 # Note that this check order is faster (by up to half a second) in the case where Hive is not used.
-num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ 2>/dev/null | grep "datanucleus-.*\\.jar" | wc -l)
-if [ $num_datanucleus_jars -gt 0 ]; then
-  AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
-  num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
-  if [ $num_hive_files -gt 0 ]; then
+if [ -f "$FWDIR/RELEASE" ]; then
+  datanucleus_dir="$FWDIR"/lib
+else
+  datanucleus_dir="$FWDIR"/lib_managed/jars
+fi
+
+datanucleus_jars=$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar")
+datanucleus_jars=$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)
+
+if [ -n "$datanucleus_jars" ]; then
+  an_assembly_jar=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
+  hive_files=$(jar tvf "$an_assembly_jar" org/apache/hadoop/hive/ql/exec)
+  if [ -n "$hive_files" ]; then
     echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
-    DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
-    CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
+    CLASSPATH=$CLASSPATH:$datanucleus_jars
   fi
 fi
 
@@ -90,10 +98,10 @@ fi
 # Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !
 # Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts
 # the configurtion files.
-if [ "x" != "x$HADOOP_CONF_DIR" ]; then
+if [ -n "$HADOOP_CONF_DIR" ]; then
   CLASSPATH="$CLASSPATH:$HADOOP_CONF_DIR"
 fi
-if [ "x" != "x$YARN_CONF_DIR" ]; then
+if [ -n "$YARN_CONF_DIR" ]; then
   CLASSPATH="$CLASSPATH:$YARN_CONF_DIR"
 fi
 
diff --git a/make-distribution.sh b/make-distribution.sh
index c05dcd89d90a7..6b5aa63a5cf89 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -147,6 +147,10 @@ echo "Spark $VERSION built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE
 cp $FWDIR/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
 cp $FWDIR/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"
 
+if [ "$SPARK_HIVE" == "true" ]; then
+  cp $FWDIR/lib_managed/jars/datanucleus*.jar "$DISTDIR/lib/"
+fi
+
 # Copy other things
 mkdir "$DISTDIR"/conf
 cp "$FWDIR"/conf/*.template "$DISTDIR"/conf
@@ -155,7 +159,6 @@ cp -r "$FWDIR/bin" "$DISTDIR"
 cp -r "$FWDIR/python" "$DISTDIR"
 cp -r "$FWDIR/sbin" "$DISTDIR"
 
-
 # Download and copy in tachyon, if requested
 if [ "$SPARK_TACHYON" == "true" ]; then
   TACHYON_VERSION="0.4.1"

From 940a1bb7d804ac5e41777c19a4ccc845ba4e4cc3 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrewor14@gmail.com>
Date: Thu, 1 May 2014 00:33:46 -0700
Subject: [PATCH 2/5] Add back 2>/dev/null

---
 bin/compute-classpath.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index f2824beccab73..0ae06ceb154c4 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -75,7 +75,7 @@ datanucleus_jars=$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)
 
 if [ -n "$datanucleus_jars" ]; then
   an_assembly_jar=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
-  hive_files=$(jar tvf "$an_assembly_jar" org/apache/hadoop/hive/ql/exec)
+  hive_files=$(jar tvf "$an_assembly_jar" org/apache/hadoop/hive/ql/exec 2>/dev/null)
   if [ -n "$hive_files" ]; then
     echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
     CLASSPATH=$CLASSPATH:$datanucleus_jars

From 32f6826ac975210a6eca75b6af2fe19550d189c3 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrewor14@gmail.com>
Date: Thu, 1 May 2014 10:33:09 -0700
Subject: [PATCH 3/5] Leave the double colons

---
 bin/compute-classpath.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index 0ae06ceb154c4..2a98e25dc7381 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -29,7 +29,6 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"
-CLASSPATH=$(echo "$CLASSPATH" | sed s/::/:/g)
 
 ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
 

From 7855f580356e006818e31a2942e523770a0fdaf3 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrewor14@gmail.com>
Date: Mon, 5 May 2014 10:49:56 -0700
Subject: [PATCH 4/5] Have jar command respect JAVA_HOME + check for jar errors
 in both cases

The two cases are building the uber assembly jar and building the deps
assembly jar.
---
 bin/compute-classpath.sh | 35 ++++++++++++++++++-----------------
 make-distribution.sh     |  6 +++---
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index 475407370233b..b7112b38e02b1 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -32,8 +32,8 @@ CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"
 
 ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
 
-if [ -n "${JAVA_HOME}" ]; then
-  JAR_CMD="${JAVA_HOME}/bin/jar"
+if [ -n "$JAVA_HOME" ]; then
+  JAR_CMD="$JAVA_HOME/bin/jar"
 else
   JAR_CMD="jar"
 fi
@@ -52,8 +52,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
 
-  DEPS_ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
-  CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
+  ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
   if [ -f "$FWDIR/RELEASE" ]; then
@@ -61,19 +60,22 @@ else
   else
     ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
   fi
-  jar_error_check=$($JAR_CMD -tf $ASSEMBLY_JAR org/apache/spark/SparkContext 2>&1)
-  if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
-    echo "Loading Spark jar with '$JAR_CMD' failed. "
-    echo "This is likely because Spark was compiled with Java 7 and run "
-    echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark "
-    echo "or build Spark with Java 6."
-    exit 1
-  fi
-  CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
 fi
 
+# Verify that versions of java used to build the jars and run Spark are compatible
+jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" scala/AnyVal 2>&1)
+if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
+  echo "Loading Spark jar with '$JAR_CMD' failed. "
+  echo "This is likely because Spark was compiled with Java 7 and run "
+  echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark "
+  echo "or build Spark with Java 6."
+  exit 1
+fi
+
+CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
+
 # When Hive support is needed, Datanucleus jars must be included on the classpath.
-# Datanucleus jars do not work if only included in the  uber jar as plugin.xml metadata is lost.
+# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
 # Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
 # built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
 # assembly is built for Hive, before actually populating the CLASSPATH with the jars.
@@ -88,11 +90,10 @@ datanucleus_jars=$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.
 datanucleus_jars=$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)
 
 if [ -n "$datanucleus_jars" ]; then
-  an_assembly_jar=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
-  hive_files=$(jar tvf "$an_assembly_jar" org/apache/hadoop/hive/ql/exec 2>/dev/null)
+  hive_files=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null)
   if [ -n "$hive_files" ]; then
     echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
-    CLASSPATH=$CLASSPATH:$datanucleus_jars
+    CLASSPATH="$CLASSPATH:$datanucleus_jars"
   fi
 fi
 
diff --git a/make-distribution.sh b/make-distribution.sh
index b976e66cf6eb4..ff18d01e7a616 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -51,13 +51,13 @@ if [ $? != 0 ]; then
     exit -1;
 fi
 
-if [ -z "${JAVA_HOME}" ]; then
+if [ -z "$JAVA_HOME" ]; then
   echo "Error: JAVA_HOME is not set, cannot proceed."
   exit -1
 fi
 
-JAVA_CMD=$JAVA_HOME/bin/java
-JAVA_VERSION=$($JAVA_CMD -version 2>&1)
+JAVA_CMD="$JAVA_HOME"/bin/java
+JAVA_VERSION=$("$JAVA_CMD" -version 2>&1)
 if ! [[ "$JAVA_VERSION" =~ "1.6" ]]; then
   echo "Error: JAVA_HOME must point to a JDK 6 installation (see SPARK-1703)."
   echo "Output from 'java -version' was:"

From a4bc96f3dc38802d64c3efe9c9f0911be9a55d31 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrewor14@gmail.com>
Date: Mon, 5 May 2014 12:10:40 -0700
Subject: [PATCH 5/5] Rename search path in jar error check

---
 bin/compute-classpath.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index b7112b38e02b1..7df43a555d562 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -63,7 +63,7 @@ else
 fi
 
 # Verify that versions of java used to build the jars and run Spark are compatible
-jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" scala/AnyVal 2>&1)
+jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
 if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
   echo "Loading Spark jar with '$JAR_CMD' failed. "
   echo "This is likely because Spark was compiled with Java 7 and run "