From 06c155c90dc784b07002f33d98dcfe9be1e38002 Mon Sep 17 00:00:00 2001 From: Xiao Li <gatorsmile@gmail.com> Date: Sat, 27 May 2017 21:32:18 -0700 Subject: [PATCH] [SPARK-20908][SQL] Cache Manager: Hint should be ignored in plan matching ### What changes were proposed in this pull request? In Cache manager, the plan matching should ignore Hint. ```Scala val df1 = spark.range(10).join(broadcast(spark.range(10))) df1.cache() spark.range(10).join(spark.range(10)).explain() ``` The output plan of the above query shows that the second query is not using the cached data of the first query. ``` BroadcastNestedLoopJoin BuildRight, Inner :- *Range (0, 10, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode +- *Range (0, 10, step=1, splits=2) ``` After the fix, the plan becomes ``` InMemoryTableScan [id#20L, id#23L] +- InMemoryRelation [id#20L, id#23L], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas) +- BroadcastNestedLoopJoin BuildRight, Inner :- *Range (0, 10, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode +- *Range (0, 10, step=1, splits=2) ``` ### How was this patch tested? Added a test. Author: Xiao Li <gatorsmile@gmail.com> Closes #18131 from gatorsmile/HintCache. --- .../apache/spark/sql/catalyst/plans/logical/hints.scala | 2 ++ .../apache/spark/sql/catalyst/plans/SameResultSuite.scala | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala index b96d7bc9cf..5fe6d2d8da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala @@ -40,6 +40,8 @@ case class ResolvedHint(child: LogicalPlan, hints: HintInfo = HintInfo()) override def output: Seq[Attribute] = child.output + override lazy val canonicalized: LogicalPlan = child.canonicalized + override def computeStats(conf: SQLConf): Statistics = { val stats = child.stats(conf) stats.copy(hints = hints) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SameResultSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SameResultSuite.scala index 467f76193c..7c8ed78a49 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SameResultSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SameResultSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Union} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, ResolvedHint, Union} import org.apache.spark.sql.catalyst.util._ /** @@ -66,4 +66,10 @@ class SameResultSuite extends SparkFunSuite { assertSameResult(Union(Seq(testRelation, testRelation2)), Union(Seq(testRelation2, testRelation))) } + + test("hint") { + val df1 = testRelation.join(ResolvedHint(testRelation)) + val df2 = testRelation.join(testRelation) + assertSameResult(df1, df2) + } } -- GitLab