From 330c3e33bd10f035f49cf3d13357eb2d6d90dabc Mon Sep 17 00:00:00 2001
From: Jeff Zhang <zjffdu@apache.org>
Date: Fri, 24 Feb 2017 15:04:42 -0800
Subject: [PATCH] [SPARK-13330][PYSPARK] PYTHONHASHSEED is not propagated to
 python worker

## What changes were proposed in this pull request?
`self.environment` is propagated to the executors, so PYTHONHASHSEED should be set in it whenever the Python version is 3.3 or later.

## How was this patch tested?
Manually tested it.

Author: Jeff Zhang <zjffdu@apache.org>

Closes #11211 from zjffdu/SPARK-13330.
---
 .../main/scala/org/apache/spark/deploy/PythonRunner.scala   | 1 +
 python/pyspark/context.py                                   | 6 ++----
 python/pyspark/rdd.py                                       | 3 ++-
 .../main/scala/org/apache/spark/deploy/yarn/Client.scala    | 1 +
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala
index 0b1cec2df8..a8f732b11f 100644
--- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala
@@ -85,6 +85,7 @@ object PythonRunner {
     // pass conf spark.pyspark.python to python process, the only way to pass info to
     // python process is through environment variable.
     sparkConf.get(PYSPARK_PYTHON).foreach(env.put("PYSPARK_PYTHON", _))
+    sys.env.get("PYTHONHASHSEED").foreach(env.put("PYTHONHASHSEED", _))
     builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize
     try {
       val process = builder.start()
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index ac4b2b035f..2961cda553 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -173,10 +173,8 @@ class SparkContext(object):
             if k.startswith("spark.executorEnv."):
                 varName = k[len("spark.executorEnv."):]
                 self.environment[varName] = v
-        if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ:
-            # disable randomness of hash of string in worker, if this is not
-            # launched by spark-submit
-            self.environment["PYTHONHASHSEED"] = "0"
+
+        self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")
 
         # Create the Java SparkContext through Py4J
         self._jsc = jsc or self._initialize_context(self._conf._jconf)
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index b384b2b507..a5e6e2b054 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -68,7 +68,8 @@ def portable_hash(x):
     >>> portable_hash((None, 1)) & 0xffffffff
     219750521
     """
-    if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ:
+
+    if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ:
         raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")
 
     if x is None:
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index fa99cd3b64..e86bd54593 100644
--- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -817,6 +817,7 @@ private[spark] class Client(
           sys.env.get(envname).foreach(env(envname) = _)
         }
       }
+      sys.env.get("PYTHONHASHSEED").foreach(env.put("PYTHONHASHSEED", _))
     }
 
     sys.env.get(ENV_DIST_CLASSPATH).foreach { dcp =>
-- 
GitLab