Commit 6662ee21 authored by Reynold Xin

[SPARK-9418][SQL] Use sort-merge join as the default shuffle join.

Sort-merge join is more robust in Spark since the sorting can be done using the Tungsten sort operator.

Author: Reynold Xin <rxin@databricks.com>

Closes #7733 from rxin/smj and squashes the following commits:

61e4d34 [Reynold Xin] Fixed test case.
5ffd731 [Reynold Xin] Fixed JoinSuite.
a137dc0 [Reynold Xin] [SPARK-9418][SQL] Use sort-merge join as the default shuffle join.
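
For context, a minimal sketch of how this default can be toggled at runtime. This is not part of the patch; it assumes the Spark 1.5-era SQLContext API, and the app name and master are placeholders:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext

    // Hypothetical setup; in spark-shell, sc and sqlContext already exist.
    val sc = new SparkContext(new SparkConf().setAppName("smj-default-demo").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)

    // With this patch spark.sql.planner.sortMergeJoin defaults to true.
    // Opt out of the new default (falls back to shuffled hash join):
    sqlContext.setConf("spark.sql.planner.sortMergeJoin", "false")
    // Restore the new default:
    sqlContext.setConf("spark.sql.planner.sortMergeJoin", "true")
    // Equivalent SQL form: sqlContext.sql("SET spark.sql.planner.sortMergeJoin=true")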
parent b7f54119
@@ -322,7 +322,7 @@ private[spark] object SQLConf {
       " memory.")
 
   val SORTMERGE_JOIN = booleanConf("spark.sql.planner.sortMergeJoin",
-    defaultValue = Some(false),
+    defaultValue = Some(true),
     doc = "When true, use sort merge join (as opposed to hash join) by default for large joins.")
 
   // This is only used for the thriftserver
...
@@ -79,9 +79,9 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
       ("SELECT * FROM testData FULL OUTER JOIN testData2 WHERE key = 2", classOf[CartesianProduct]),
       ("SELECT * FROM testData JOIN testData2 WHERE key > a", classOf[CartesianProduct]),
       ("SELECT * FROM testData FULL OUTER JOIN testData2 WHERE key > a", classOf[CartesianProduct]),
-      ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[ShuffledHashJoin]),
-      ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[ShuffledHashJoin]),
-      ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[ShuffledHashJoin]),
+      ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[SortMergeJoin]),
+      ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[SortMergeJoin]),
+      ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[SortMergeJoin]),
       ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]),
       ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2",
         classOf[ShuffledHashOuterJoin]),
...
@@ -23,16 +23,16 @@ import org.apache.spark.sql.SQLConf
 import org.apache.spark.sql.hive.test.TestHive
 
 /**
- * Runs the test cases that are included in the hive distribution with sort merge join is true.
+ * Runs the test cases that are included in the hive distribution with hash joins.
  */
-class SortMergeCompatibilitySuite extends HiveCompatibilitySuite {
+class HashJoinCompatibilitySuite extends HiveCompatibilitySuite {
   override def beforeAll() {
     super.beforeAll()
-    TestHive.setConf(SQLConf.SORTMERGE_JOIN, true)
+    TestHive.setConf(SQLConf.SORTMERGE_JOIN, false)
   }
 
   override def afterAll() {
-    TestHive.setConf(SQLConf.SORTMERGE_JOIN, false)
+    TestHive.setConf(SQLConf.SORTMERGE_JOIN, true)
     super.afterAll()
   }
...
@@ -172,7 +172,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
       bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastHashJoin => j }
       assert(bhj.isEmpty, "BroadcastHashJoin still planned even though it is switched off")
 
-      val shj = df.queryExecution.sparkPlan.collect { case j: ShuffledHashJoin => j }
+      val shj = df.queryExecution.sparkPlan.collect { case j: SortMergeJoin => j }
       assert(shj.size === 1,
         "ShuffledHashJoin should be planned when BroadcastHashJoin is turned off")
...
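
For illustration, a hedged sketch of how the planned join operator can be inspected, mirroring the plan checks in JoinSuite and StatisticsSuite above. It continues from the sqlContext created in the earlier snippet; the table names and data are placeholders:

    import sqlContext.implicits._

    // Register two small temporary tables (placeholder data).
    val left = sc.parallelize(1 to 100).map(i => (i, s"l$i")).toDF("key", "value")
    val right = sc.parallelize(1 to 100).map(i => (i, s"r$i")).toDF("a", "b")
    left.registerTempTable("testData")
    right.registerTempTable("testData2")

    // With spark.sql.planner.sortMergeJoin=true, the physical plan for this
    // equi-join should show SortMergeJoin rather than ShuffledHashJoin.
    val df = sqlContext.sql("SELECT * FROM testData JOIN testData2 ON key = a")
    df.explain()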