Skip to content
Snippets Groups Projects
Commit 092121e4 authored by Davies Liu's avatar Davies Liu Committed by Matei Zaharia
Browse files

[SPARK-3239] [PySpark] randomize the dirs for each process

This can avoid the IO contention during spilling, when you have multiple disks.

Author: Davies Liu <davies.liu@gmail.com>

Closes #2152 from davies/randomize and squashes the following commits:

a4863c4 [Davies Liu] randomize the dirs for each process
parent 8f8e2a4e
No related branches found
No related tags found
No related merge requests found
@@ -21,6 +21,7 @@ import platform
 import shutil
 import warnings
 import gc
+import random
 from pyspark.serializers import BatchedSerializer, PickleSerializer
@@ -216,6 +217,9 @@ class ExternalMerger(Merger):
         """ Get all the directories """
         path = os.environ.get("SPARK_LOCAL_DIRS", "/tmp")
         dirs = path.split(",")
+        if len(dirs) > 1:
+            rnd = random.Random(os.getpid() + id(dirs))
+            random.shuffle(dirs, rnd.random)
         return [os.path.join(d, "python", str(os.getpid()), str(id(self)))
                 for d in dirs]
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment