Added a new flag to Aggregator to indicate whether to apply map-side combiners.

5945bcdc · Reynold Xin · c68e820b · 5945bcdc · 5945bcdc · 5945bcdc
Commit 5945bcdc authored 12 years ago by Reynold Xin
--- a/core/src/main/scala/spark/Aggregator.scala
+++ b/core/src/main/scala/spark/Aggregator.scala
 package spark
+/** A set of functions used to aggregate data.
+  * 
+  * @param createCombiner function to create the initial value of the aggregation.
+  * @param mergeValue function to merge a new value into the aggregation result.
+  * @param mergeCombiners function to merge outputs from multiple mergeValue calls.
+  * @param mapSideCombine whether to apply combiners on map partitions, also
+  *                       known as map-side aggregation. When set to false,
+  *                       the mergeCombiners function is not used.
+  */
 class Aggregator[K, V, C] (
    val createCombiner: V => C,
    val mergeValue: (C, V) => C,
-    val mergeCombiners: (C, C) => C)
+    val mergeCombiners: (C, C) => C,
+    val mapSideCombine: Boolean = true)
  extends Serializable
--- a/core/src/main/scala/spark/ShuffledRDD.scala
+++ b/core/src/main/scala/spark/ShuffledRDD.scala
@@ -29,10 +29,9 @@ class ShuffledRDD[K, V, C](
    val combiners = new JHashMap[K, C]
    val fetcher = SparkEnv.get.shuffleFetcher
-    if (aggregator.mergeCombiners != null) {
+    if (aggregator.mapSideCombine) {
-      // If mergeCombiners is specified, combiners are applied on the map
+      // Apply combiners on map partitions. In this case, post-shuffle we get a
-      // partitions. In this case, post-shuffle we get a list of outputs from
+      // list of outputs from the combiners and merge them using mergeCombiners.
-      // the combiners and merge them using mergeCombiners.
      def mergePairWithMapSideCombiners(k: K, c: C) {
        val oldC = combiners.get(k)
        if (oldC == null) {
@@ -43,9 +42,9 @@ class ShuffledRDD[K, V, C](
      }
      fetcher.fetch[K, C](dep.shuffleId, split.index, mergePairWithMapSideCombiners)
    } else {
-      // If mergeCombiners is not specified, no combiner is applied on the map
+      // Do not apply combiners on map partitions (i.e. map side aggregation is
-      // partitions (i.e. map side aggregation is turned off). Post-shuffle we
+      // turned off). Post-shuffle we get a list of values and we use mergeValue
-      // get a list of values and we use mergeValue to merge them.
+      // to merge them.
      def mergePairWithoutMapSideCombiners(k: K, v: V) {
        val oldC = combiners.get(k)
        if (oldC == null) {

--- a/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala
+++ b/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala
@@ -108,7 +108,7 @@ class ShuffleMapTask(
    val partitioner = dep.partitioner
    val bucketIterators =
-      if (aggregator.mergeCombiners != null) {
+      if (aggregator.mapSideCombine) {
        // Apply combiners (map-side aggregation) to the map output.
        val buckets = Array.tabulate(numOutputSplits)(_ => new HashMap[Any, Any])
        for (elem <- rdd.iterator(split)) {