diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index b384b2b5073329ccc1e2f0a3caa9e6b7f26b2535..ccef30cf322ee7e28ac47536b335602eceda7bbf 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2071,10 +2071,12 @@ class RDD(object): batchSize = min(10, self.ctx._batchSize or 1024) ser = BatchedSerializer(PickleSerializer(), batchSize) selfCopy = self._reserialize(ser) + jrdd_deserializer = selfCopy._jrdd_deserializer jrdd = selfCopy._jrdd.coalesce(numPartitions, shuffle) else: + jrdd_deserializer = self._jrdd_deserializer jrdd = self._jrdd.coalesce(numPartitions, shuffle) - return RDD(jrdd, self.ctx, self._jrdd_deserializer) + return RDD(jrdd, self.ctx, jrdd_deserializer) def zip(self, other): """ diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 1df91ad9568c56441f93522ee7c17fc2984240e1..8d227eac3d7ef948955033afb6df0c3ea077be6c 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -972,6 +972,12 @@ class RDDTests(ReusedPySparkTestCase): zeros = len([x for x in l if x == 0]) self.assertTrue(zeros == 0) + def test_repartition_on_textfile(self): + path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt") + rdd = self.sc.textFile(path) + result = rdd.repartition(1).collect() + self.assertEqual(u"Hello World!", result[0]) + def test_distinct(self): rdd = self.sc.parallelize((1, 2, 3)*10, 10) self.assertEqual(rdd.getNumPartitions(), 10)