Commit 22f3d333 authored by liuxian, committed by Wenchen Fan

[SPARK-23389][CORE] When the shuffle dependency specifies aggregation and `dependency.mapSideCombine = false`, we should be able to use serialized sorting.

## What changes were proposed in this pull request?
When the shuffle dependency specifies aggregation and `dependency.mapSideCombine = false`, no aggregation or sorting is needed on the map side, so we should be able to use serialized sorting.
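As a concrete illustration (a minimal sketch, not part of this patch): `groupByKey` defines an aggregator but sets `mapSideCombine = false`, because grouping cannot reduce the amount of data shuffled, so with this change its shuffle becomes eligible for the serialized (Tungsten) sort path, assuming the serializer supports relocation and the partition count is within bounds:

```scala
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object GroupByKeyDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("SPARK-23389 demo"))
    // groupByKey builds a ShuffleDependency with an aggregator defined but
    // mapSideCombine = false; after this patch such a shuffle can use
    // serialized sorting instead of falling back to deserialized sorting.
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))
    val grouped = pairs.groupByKey(new HashPartitioner(2))
    grouped.collect().foreach(println) // e.g. (a,CompactBuffer(1, 3)), (b,CompactBuffer(2))
    sc.stop()
  }
}
```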

## How was this patch tested?
Existing unit tests.

Author: liuxian <liu.xian3@zte.com.cn>

Closes #20576 from 10110346/mapsidecombine.
parent 25c2776d
@@ -76,6 +76,9 @@ class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
     val mapSideCombine: Boolean = false)
   extends Dependency[Product2[K, V]] {
+  if (mapSideCombine) {
+    require(aggregator.isDefined, "Map-side combine without Aggregator specified!")
+  }
 
   override def rdd: RDD[Product2[K, V]] = _rdd.asInstanceOf[RDD[Product2[K, V]]]
 
   private[spark] val keyClassName: String = reflect.classTag[K].runtimeClass.getName

@@ -90,7 +90,6 @@ private[spark] class BlockStoreShuffleReader[K, C](
         dep.aggregator.get.combineValuesByKey(keyValuesIterator, context)
       }
     } else {
-      require(!dep.mapSideCombine, "Map-side combine without Aggregator specified!")
       interruptibleIter.asInstanceOf[Iterator[Product2[K, C]]]
     }

@@ -188,9 +188,9 @@ private[spark] object SortShuffleManager extends Logging {
       log.debug(s"Can't use serialized shuffle for shuffle $shufId because the serializer, " +
         s"${dependency.serializer.getClass.getName}, does not support object relocation")
       false
-    } else if (dependency.aggregator.isDefined) {
-      log.debug(
-        s"Can't use serialized shuffle for shuffle $shufId because an aggregator is defined")
+    } else if (dependency.mapSideCombine) {
+      log.debug(s"Can't use serialized shuffle for shuffle $shufId because we need to do " +
+        s"map-side aggregation")
       false
     } else if (numPartitions > MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE) {
       log.debug(s"Can't use serialized shuffle for shuffle $shufId because it has more than " +

@@ -50,7 +50,6 @@ private[spark] class SortShuffleWriter[K, V, C](
   /** Write a bunch of records to this task's output */
   override def write(records: Iterator[Product2[K, V]]): Unit = {
     sorter = if (dep.mapSideCombine) {
-      require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
       new ExternalSorter[K, V, C](
         context, dep.aggregator, Some(dep.partitioner), dep.keyOrdering, dep.serializer)
     } else {

@@ -107,7 +106,6 @@ private[spark] object SortShuffleWriter {
   def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = {
     // We cannot bypass sorting if we need to do map-side aggregation.
     if (dep.mapSideCombine) {
-      require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
       false
     } else {
       val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)

@@ -85,6 +85,14 @@ class SortShuffleManagerSuite extends SparkFunSuite with Matchers {
       mapSideCombine = false
     )))
 
+    // We support serialized shuffle if we do not need to do map-side aggregation
+    assert(canUseSerializedShuffle(shuffleDep(
+      partitioner = new HashPartitioner(2),
+      serializer = kryo,
+      keyOrdering = None,
+      aggregator = Some(mock(classOf[Aggregator[Any, Any, Any]])),
+      mapSideCombine = false
+    )))
   }
 
   test("unsupported shuffle dependencies for serialized shuffle") {

@@ -111,14 +119,7 @@ class SortShuffleManagerSuite extends SparkFunSuite with Matchers {
       mapSideCombine = false
     )))
 
-    // We do not support shuffles that perform aggregation
-    assert(!canUseSerializedShuffle(shuffleDep(
-      partitioner = new HashPartitioner(2),
-      serializer = kryo,
-      keyOrdering = None,
-      aggregator = Some(mock(classOf[Aggregator[Any, Any, Any]])),
-      mapSideCombine = false
-    )))
+    // We do not support serialized shuffle if we need to do map-side aggregation
     assert(!canUseSerializedShuffle(shuffleDep(
       partitioner = new HashPartitioner(2),
       serializer = kryo,
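Taken together, the eligibility check in `SortShuffleManager.canUseSerializedShuffle` now keys off `dependency.mapSideCombine` rather than `dependency.aggregator.isDefined`. Below is a simplified, self-contained sketch of the resulting decision, not Spark's actual code; the constant stands in for `MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE` (`1 << 24` in Spark):

```scala
object SerializedShuffleCheckSketch {
  // Stand-in for SortShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE.
  val MaxPartitions: Int = 1 << 24

  // Simplified sketch of the post-patch check; the real method takes a
  // ShuffleDependency and logs why a given shuffle is rejected.
  def canUseSerializedShuffle(
      serializerSupportsRelocation: Boolean, // serializer.supportsRelocationOfSerializedObjects
      mapSideCombine: Boolean,               // dependency.mapSideCombine (was aggregator.isDefined)
      numPartitions: Int): Boolean = {
    serializerSupportsRelocation &&   // records must be relocatable in serialized form
      !mapSideCombine &&              // map-side aggregation needs deserialized records
      numPartitions <= MaxPartitions  // partition id must fit in the packed record pointer
  }

  def main(args: Array[String]): Unit = {
    // An aggregating dependency with mapSideCombine = false now passes the check.
    println(canUseSerializedShuffle(
      serializerSupportsRelocation = true, mapSideCombine = false, numPartitions = 2)) // true
  }
}
```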