From a9ec033c8cf489898cc47e2043bd9e86b7df1ff8 Mon Sep 17 00:00:00 2001 From: Zongheng Yang <zongheng.y@gmail.com> Date: Mon, 9 Jun 2014 16:47:44 -0700 Subject: [PATCH] [SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD. This PR attempts to resolve [SPARK-1704](https://issues.apache.org/jira/browse/SPARK-1704) by introducing a physical plan for EXPLAIN commands, which just prints out the debug string (containing various SparkSQL's plans) of the corresponding QueryExecution for the actual query. Author: Zongheng Yang <zongheng.y@gmail.com> Closes #1003 from concretevitamin/explain-cmd and squashes the following commits: 5b7911f [Zongheng Yang] Add a regression test. 1bfa379 [Zongheng Yang] Modify output(). 719ada9 [Zongheng Yang] Override otherCopyArgs for ExplainCommandPhysical. 4318fd7 [Zongheng Yang] Make all output one Row. 439c6ab [Zongheng Yang] Minor cleanups. 408f574 [Zongheng Yang] SPARK-1704: Add CommandStrategy and ExplainCommandPhysical. --- .../catalyst/plans/logical/LogicalPlan.scala | 8 +++-- .../org/apache/spark/sql/SQLContext.scala | 6 ++++ .../spark/sql/execution/SparkStrategies.scala | 11 +++++++ .../apache/spark/sql/execution/commands.scala | 32 +++++++++++++++++++ .../apache/spark/sql/hive/HiveContext.scala | 3 +- .../sql/hive/execution/HiveQuerySuite.scala | 12 +++++++ 6 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 2b8fbdcde9..4f641cd3a6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.types.StructType +import org.apache.spark.sql.catalyst.types.{StringType, StructType} import org.apache.spark.sql.catalyst.trees abstract class LogicalPlan extends QueryPlan[LogicalPlan] { @@ -102,7 +102,7 @@ abstract class LeafNode extends LogicalPlan with trees.LeafNode[LogicalPlan] { */ abstract class Command extends LeafNode { self: Product => - def output = Seq.empty + def output: Seq[Attribute] = Seq.empty } /** @@ -115,7 +115,9 @@ case class NativeCommand(cmd: String) extends Command * Returned by a parser when the users only wants to see what query plan would be executed, without * actually performing the execution. */ -case class ExplainCommand(plan: LogicalPlan) extends Command +case class ExplainCommand(plan: LogicalPlan) extends Command { + override def output = Seq(AttributeReference("plan", StringType, nullable = false)()) +} /** * A logical plan node with single child. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 5626f0da22..fde4c485b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -191,6 +191,7 @@ class SQLContext(@transient val sparkContext: SparkContext) val sparkContext = self.sparkContext val strategies: Seq[Strategy] = + CommandStrategy(self) :: TakeOrdered :: PartialAggregation :: LeftSemiJoin :: @@ -256,6 +257,11 @@ class SQLContext(@transient val sparkContext: SparkContext) Batch("Prepare Expressions", Once, new BindReferences[SparkPlan]) :: Nil } + // TODO: or should we make QueryExecution protected[sql]? + protected[sql] def mkQueryExecution(plan: LogicalPlan) = new QueryExecution { + val logical = plan + } + /** * The primary workflow for executing relational queries using Spark. Designed to allow easy * access to the intermediate phases of query execution for developers. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 6463f47510..295c265b16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -233,4 +233,15 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case _ => Nil } } + + // TODO: this should be merged with SPARK-1508's SetCommandStrategy + case class CommandStrategy(context: SQLContext) extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case logical.ExplainCommand(child) => + val qe = context.mkQueryExecution(child) + Seq(execution.ExplainCommandPhysical(qe.executedPlan, plan.output)(context)) + case _ => Nil + } + } + } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala new file mode 100644 index 0000000000..5371d2f479 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{SQLContext, Row} +import org.apache.spark.sql.catalyst.expressions.{GenericRow, Attribute} + +case class ExplainCommandPhysical(child: SparkPlan, output: Seq[Attribute]) + (@transient context: SQLContext) extends UnaryNode { + def execute(): RDD[Row] = { + val planString = new GenericRow(Array[Any](child.toString)) + context.sparkContext.parallelize(Seq(planString)) + } + + override def otherCopyArgs = context :: Nil +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index fbab2ac16b..4b97dc25ac 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -218,6 +218,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { val hiveContext = self override val strategies: Seq[Strategy] = Seq( + CommandStrategy(self), TakeOrdered, ParquetOperations, HiveTableScans, @@ -304,7 +305,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { */ def stringResult(): Seq[String] = analyzed match { case NativeCommand(cmd) => runSqlHive(cmd) - case ExplainCommand(plan) => new QueryExecution { val logical = plan }.toString.split("\n") + case ExplainCommand(plan) => mkQueryExecution(plan).toString.split("\n") case query => val result: Seq[Seq[Any]] = toRdd.collect().toSeq // We need the types so we can output struct field names diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 125cc18bfb..c56eee2580 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.test.TestHive /** * A set of test cases expressed in Hive QL that are not covered by the tests included in the hive distribution. @@ -159,4 +160,15 @@ class HiveQuerySuite extends HiveComparisonTest { hql("SHOW TABLES").toString hql("SELECT * FROM src").toString } + + test("SPARK-1704: Explain commands as a SchemaRDD") { + hql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") + val rdd = hql("explain select key, count(value) from src group by key") + assert(rdd.collect().size == 1) + assert(rdd.toString.contains("ExplainCommand")) + assert(rdd.filter(row => row.toString.contains("ExplainCommand")).collect().size == 0, + "actual contents of the result should be the plans of the query to be explained") + TestHive.reset() + } + } -- GitLab