From 092121e477bcd2e474440dbdfdfa69cbd15c4803 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies.liu@gmail.com>
Date: Wed, 27 Aug 2014 10:40:35 -0700
Subject: [PATCH] [SPARK-3239] [PySpark] randomize the dirs for each process

This avoids I/O contention during spilling when multiple disks are configured as local directories.
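
A standalone sketch of the idea (the helper name below is illustrative, not
the actual PySpark API): each worker seeds its own random.Random so that
concurrently spilling processes start writing on different disks. Seeding
explicitly per process matters because forked workers inherit the parent's
global RNG state and would otherwise all shuffle into the same order.

    import os
    import random

    def spill_dirs(local_dirs):
        # Hypothetical helper mirroring the patch: shuffle the configured
        # directories with a per-process seed so concurrent workers spread
        # their spill files across disks instead of all hitting dirs[0].
        dirs = list(local_dirs)
        if len(dirs) > 1:
            # rnd.shuffle(dirs) is the idiomatic equivalent of the patch's
            # random.shuffle(dirs, rnd.random); shuffle's second argument
            # was removed in Python 3.11.
            rnd = random.Random(os.getpid() + id(dirs))
            rnd.shuffle(dirs)
        return [os.path.join(d, "python", str(os.getpid())) for d in dirs]

    # Two processes calling this see different orderings of the same disks.
    print(spill_dirs(["/disk1", "/disk2", "/disk3"]))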

Author: Davies Liu <davies.liu@gmail.com>

Closes #2152 from davies/randomize and squashes the following commits:

a4863c4 [Davies Liu] randomize the dirs for each process
---
 python/pyspark/shuffle.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py
index 1ebe7df418..2750f117ba 100644
--- a/python/pyspark/shuffle.py
+++ b/python/pyspark/shuffle.py
@@ -21,6 +21,7 @@ import platform
 import shutil
 import warnings
 import gc
+import random
 
 from pyspark.serializers import BatchedSerializer, PickleSerializer
 
@@ -216,6 +217,9 @@ class ExternalMerger(Merger):
         """ Get all the directories """
         path = os.environ.get("SPARK_LOCAL_DIRS", "/tmp")
         dirs = path.split(",")
+        if len(dirs) > 1:
+            rnd = random.Random(os.getpid() + id(dirs))
+            random.shuffle(dirs, rnd.random)
         return [os.path.join(d, "python", str(os.getpid()), str(id(self)))
                 for d in dirs]
 
-- 
GitLab