From 007a733434aa39cdb137ab9795434ae2af70fe0b Mon Sep 17 00:00:00 2001
From: Aaron Davidson <aaron@databricks.com>
Date: Mon, 24 Mar 2014 22:24:21 -0700
Subject: [PATCH] SPARK-1286: Make usage of spark-env.sh idempotent

Various spark scripts load spark-env.sh. This can cause unbounded growth of any variables that are appended to (SPARK_CLASSPATH, SPARK_REPL_OPTS), and it makes the precedence order for options specified in spark-env.sh less clear.

One use-case for the latter is that we want to set options from the command-line of spark-shell, but these options will be overridden by subsequent loading of spark-env.sh. If we were to load the spark-env.sh first and then set our command-line options, we could guarantee correct precedence order.

Note that we use SPARK_CONF_DIR if available to support the sbin/ scripts, which always set this variable from sbin/spark-config.sh. Otherwise, we default to the ../conf/ as usual.

Author: Aaron Davidson <aaron@databricks.com>

Closes #184 from aarondav/idem and squashes the following commits:

e291f91 [Aaron Davidson] Use "private" variables in load-spark-env.sh
8da8360 [Aaron Davidson] Add .sh extension to load-spark-env.sh
93a2471 [Aaron Davidson] SPARK-1286: Make usage of spark-env.sh idempotent
---
 bin/compute-classpath.sh |  5 +----
 bin/load-spark-env.sh    | 35 +++++++++++++++++++++++++++++++++++
 bin/pyspark              |  5 +----
 bin/run-example          |  5 +----
 bin/spark-class          |  5 +----
 bin/spark-shell          |  4 +---
 sbin/slaves.sh           |  4 +---
 sbin/spark-daemon.sh     |  4 +---
 sbin/start-master.sh     |  4 +---
 sbin/start-slaves.sh     |  4 +---
 sbin/stop-slaves.sh      |  4 +---
 11 files changed, 45 insertions(+), 34 deletions(-)
 create mode 100644 bin/load-spark-env.sh

diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index 5f54391418..d6f1ff9084 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -25,10 +25,7 @@ SCALA_VERSION=2.10
 # Figure out where Spark is installed
 FWDIR="$(cd `dirname $0`/..; pwd)"
 
-# Load environment variables from conf/spark-env.sh, if it exists
-if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
-  . $FWDIR/conf/spark-env.sh
-fi
+. $FWDIR/bin/load-spark-env.sh
 
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
diff --git a/bin/load-spark-env.sh b/bin/load-spark-env.sh
new file mode 100644
index 0000000000..476dd82655
--- /dev/null
+++ b/bin/load-spark-env.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script loads spark-env.sh if it exists, and ensures it is only loaded once.
+# spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's
+# conf/ subdirectory.
+
+if [ -z "$SPARK_ENV_LOADED" ]; then
+  export SPARK_ENV_LOADED=1
+
+  # Returns the parent of the directory this script lives in.
+  parent_dir="$(cd `dirname $0`/..; pwd)"
+
+  use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}
+
+  if [ -f "${use_conf_dir}/spark-env.sh" ]; then
+    . "${use_conf_dir}/spark-env.sh"
+  fi
+fi
diff --git a/bin/pyspark b/bin/pyspark
index ed6f8da730..67e1f61eeb 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -36,10 +36,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
   fi
 fi
 
-# Load environment variables from conf/spark-env.sh, if it exists
-if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
-  . $FWDIR/conf/spark-env.sh
-fi
+. $FWDIR/bin/load-spark-env.sh
 
 # Figure out which Python executable to use
 if [ -z "$PYSPARK_PYTHON" ] ; then
diff --git a/bin/run-example b/bin/run-example
index adba7dd97a..5af95a08c6 100755
--- a/bin/run-example
+++ b/bin/run-example
@@ -30,10 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 # Export this as SPARK_HOME
 export SPARK_HOME="$FWDIR"
 
-# Load environment variables from conf/spark-env.sh, if it exists
-if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
-  . $FWDIR/conf/spark-env.sh
-fi
+. $FWDIR/bin/load-spark-env.sh
 
 if [ -z "$1" ]; then
   echo "Usage: run-example <example-class> [<args>]" >&2
diff --git a/bin/spark-class b/bin/spark-class
index a3efa2ff98..0dcf0e156c 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -30,10 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 # Export this as SPARK_HOME
 export SPARK_HOME="$FWDIR"
 
-# Load environment variables from conf/spark-env.sh, if it exists
-if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
-  . $FWDIR/conf/spark-env.sh
-fi
+. $FWDIR/bin/load-spark-env.sh
 
 if [ -z "$1" ]; then
   echo "Usage: spark-class <class> [<args>]" >&2
diff --git a/bin/spark-shell b/bin/spark-shell
index 7d3fe3aca7..861ab60654 100755
--- a/bin/spark-shell
+++ b/bin/spark-shell
@@ -81,9 +81,7 @@ done
 # Set MASTER from spark-env if possible
 DEFAULT_SPARK_MASTER_PORT=7077
 if [ -z "$MASTER" ]; then
-  if [ -e "$FWDIR/conf/spark-env.sh" ]; then
-    . "$FWDIR/conf/spark-env.sh"
-  fi
+  . $FWDIR/bin/load-spark-env.sh
   if [ "x" != "x$SPARK_MASTER_IP" ]; then
     if [ "y" != "y$SPARK_MASTER_PORT" ]; then
       SPARK_MASTER_PORT="${SPARK_MASTER_PORT}"
diff --git a/sbin/slaves.sh b/sbin/slaves.sh
index a5bc2183d8..f89547fef9 100755
--- a/sbin/slaves.sh
+++ b/sbin/slaves.sh
@@ -63,9 +63,7 @@ then
   shift
 fi
 
-if [ -f "${SPARK_CONF_DIR}/spark-env.sh" ]; then
-  . "${SPARK_CONF_DIR}/spark-env.sh"
-fi
+. "$SPARK_PREFIX/bin/load-spark-env.sh"
 
 if [ "$HOSTLIST" = "" ]; then
   if [ "$SPARK_SLAVES" = "" ]; then
diff --git a/sbin/spark-daemon.sh b/sbin/spark-daemon.sh
index 2be2b3d7c0..323f675b17 100755
--- a/sbin/spark-daemon.sh
+++ b/sbin/spark-daemon.sh
@@ -86,9 +86,7 @@ spark_rotate_log ()
     fi
 }
 
-if [ -f "${SPARK_CONF_DIR}/spark-env.sh" ]; then
-  . "${SPARK_CONF_DIR}/spark-env.sh"
-fi
+. "$SPARK_PREFIX/bin/load-spark-env.sh"
 
 if [ "$SPARK_IDENT_STRING" = "" ]; then
   export SPARK_IDENT_STRING="$USER"
diff --git a/sbin/start-master.sh b/sbin/start-master.sh
index 03a3428aea..c5c02491f7 100755
--- a/sbin/start-master.sh
+++ b/sbin/start-master.sh
@@ -39,9 +39,7 @@ done
 
 . "$sbin/spark-config.sh"
 
-if [ -f "${SPARK_CONF_DIR}/spark-env.sh" ]; then
-  . "${SPARK_CONF_DIR}/spark-env.sh"
-fi
+. "$SPARK_PREFIX/bin/load-spark-env.sh"
 
 if [ "$SPARK_MASTER_PORT" = "" ]; then
   SPARK_MASTER_PORT=7077
diff --git a/sbin/start-slaves.sh b/sbin/start-slaves.sh
index da641cfe3c..4912d0c0c7 100755
--- a/sbin/start-slaves.sh
+++ b/sbin/start-slaves.sh
@@ -38,9 +38,7 @@ done
 
 . "$sbin/spark-config.sh"
 
-if [ -f "${SPARK_CONF_DIR}/spark-env.sh" ]; then
-  . "${SPARK_CONF_DIR}/spark-env.sh"
-fi
+. "$SPARK_PREFIX/bin/load-spark-env.sh"
 
 # Find the port number for the master
 if [ "$SPARK_MASTER_PORT" = "" ]; then
diff --git a/sbin/stop-slaves.sh b/sbin/stop-slaves.sh
index 6bf393ccd4..7c2201100e 100755
--- a/sbin/stop-slaves.sh
+++ b/sbin/stop-slaves.sh
@@ -22,9 +22,7 @@ sbin=`cd "$sbin"; pwd`
 
 . "$sbin/spark-config.sh"
 
-if [ -f "${SPARK_CONF_DIR}/spark-env.sh" ]; then
-  . "${SPARK_CONF_DIR}/spark-env.sh"
-fi
+. "$SPARK_PREFIX/bin/load-spark-env.sh"
 
 # do before the below calls as they exec
 if [ -e "$sbin"/../tachyon/bin/tachyon ]; then
-- 
GitLab