From 330c3e33bd10f035f49cf3d13357eb2d6d90dabc Mon Sep 17 00:00:00 2001 From: Jeff Zhang <zjffdu@apache.org> Date: Fri, 24 Feb 2017 15:04:42 -0800 Subject: [PATCH] [SPARK-13330][PYSPARK] PYTHONHASHSEED is not propagated to python worker ## What changes were proposed in this pull request? self.environment will be propagated to executor. Should set PYTHONHASHSEED as long as the python version is greater than 3.3 ## How was this patch tested? Manually tested it. Author: Jeff Zhang <zjffdu@apache.org> Closes #11211 from zjffdu/SPARK-13330. --- .../main/scala/org/apache/spark/deploy/PythonRunner.scala | 1 + python/pyspark/context.py | 6 ++---- python/pyspark/rdd.py | 3 ++- .../main/scala/org/apache/spark/deploy/yarn/Client.scala | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index 0b1cec2df8..a8f732b11f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -85,6 +85,7 @@ object PythonRunner { // pass conf spark.pyspark.python to python process, the only way to pass info to // python process is through environment variable. 
sparkConf.get(PYSPARK_PYTHON).foreach(env.put("PYSPARK_PYTHON", _)) + sys.env.get("PYTHONHASHSEED").foreach(env.put("PYTHONHASHSEED", _)) builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize try { val process = builder.start() diff --git a/python/pyspark/context.py b/python/pyspark/context.py index ac4b2b035f..2961cda553 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -173,10 +173,8 @@ class SparkContext(object): if k.startswith("spark.executorEnv."): varName = k[len("spark.executorEnv."):] self.environment[varName] = v - if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ: - # disable randomness of hash of string in worker, if this is not - # launched by spark-submit - self.environment["PYTHONHASHSEED"] = "0" + + self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0") # Create the Java SparkContext through Py4J self._jsc = jsc or self._initialize_context(self._conf._jconf) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index b384b2b507..a5e6e2b054 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -68,7 +68,8 @@ def portable_hash(x): >>> portable_hash((None, 1)) & 0xffffffff 219750521 """ - if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ: + + if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ: raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED") if x is None: diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index fa99cd3b64..e86bd54593 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -817,6 +817,7 @@ private[spark] class Client( sys.env.get(envname).foreach(env(envname) = _) } } + 
sys.env.get("PYTHONHASHSEED").foreach(env.put("PYTHONHASHSEED", _)) } sys.env.get(ENV_DIST_CLASSPATH).foreach { dcp => -- GitLab