From b95732632b5b06d494ebd9e539af136ab3b8490e Mon Sep 17 00:00:00 2001 From: Josh Rosen <joshrosen@eecs.berkeley.edu> Date: Sun, 28 Jul 2013 19:24:23 -0700 Subject: [PATCH] Do not inherit master's PYTHONPATH on workers. This fixes SPARK-832, an issue where PySpark would not work when the master and workers used different SPARK_HOME paths. This change may potentially break code that relied on the master's PYTHONPATH being used on workers. To have custom PYTHONPATH additions used on the workers, users should set a custom PYTHONPATH in spark-env.sh rather than setting it in the shell. --- .../main/scala/spark/api/python/PythonWorkerFactory.scala | 2 ++ python/pyspark/rdd.py | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/spark/api/python/PythonWorkerFactory.scala b/core/src/main/scala/spark/api/python/PythonWorkerFactory.scala index 078ad45ce8..84673470db 100644 --- a/core/src/main/scala/spark/api/python/PythonWorkerFactory.scala +++ b/core/src/main/scala/spark/api/python/PythonWorkerFactory.scala @@ -67,6 +67,8 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String val pb = new ProcessBuilder(Seq(pythonExec, sparkHome + "/python/pyspark/daemon.py")) val workerEnv = pb.environment() workerEnv.putAll(envVars) + val pythonPath = sparkHome + "/python/:" + workerEnv.get("PYTHONPATH") + workerEnv.put("PYTHONPATH", pythonPath) daemon = pb.start() // Redirect the stderr to ours diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 8734cacb0b..51c2cb9806 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -756,9 +756,8 @@ class PipelinedRDD(RDD): self.ctx._gateway._gateway_client) self.ctx._pickled_broadcast_vars.clear() class_manifest = self._prev_jrdd.classManifest() - env = copy.copy(self.ctx.environment) - env['PYTHONPATH'] = os.environ.get("PYTHONPATH", "") - env = MapConverter().convert(env, self.ctx._gateway._gateway_client) + env = MapConverter().convert(self.ctx.environment, + self.ctx._gateway._gateway_client) python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), pipe_command, env, self.preservesPartitioning, self.ctx.pythonExec, broadcast_vars, self.ctx._javaAccumulator, class_manifest) -- GitLab