Skip to content
Snippets Groups Projects
Commit fa1af0af authored by Dongjoon Hyun's avatar Dongjoon Hyun Committed by Reynold Xin
Browse files

[SPARK-14251][SQL] Add SQL command for printing out generated code for debugging

## What changes were proposed in this pull request?

This PR implements the `EXPLAIN CODEGEN` SQL command, which returns the generated code just like `debugCodegen` does. In `spark-shell`, we no longer need to `import` the debug module. In `spark-sql`, we can use this SQL command now.

**Before**
```
scala> import org.apache.spark.sql.execution.debug._
scala> sql("select 'a' as a group by 1").debugCodegen()
Found 2 WholeStageCodegen subtrees.
== Subtree 1 / 2 ==
...

Generated code:
...

== Subtree 2 / 2 ==
...

Generated code:
...
```

**After**
```
scala> sql("explain extended codegen select 'a' as a group by 1").collect().foreach(println)
[Found 2 WholeStageCodegen subtrees.]
[== Subtree 1 / 2 ==]
...
[]
[Generated code:]
...
[]
[== Subtree 2 / 2 ==]
...
[]
[Generated code:]
...
```

## How was this patch tested?

Pass the Jenkins tests (including new test cases)

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #12099 from dongjoon-hyun/SPARK-14251.
parent 877dc712
No related branches found
No related tags found
No related merge requests found
...@@ -584,7 +584,7 @@ frameBound ...@@ -584,7 +584,7 @@ frameBound
explainOption explainOption
: LOGICAL | FORMATTED | EXTENDED : LOGICAL | FORMATTED | EXTENDED | CODEGEN
; ;
transactionMode transactionMode
...@@ -633,7 +633,7 @@ nonReserved ...@@ -633,7 +633,7 @@ nonReserved
| DELIMITED | FIELDS | TERMINATED | COLLECTION | ITEMS | KEYS | ESCAPED | LINES | SEPARATED | DELIMITED | FIELDS | TERMINATED | COLLECTION | ITEMS | KEYS | ESCAPED | LINES | SEPARATED
| EXTENDED | REFRESH | CLEAR | CACHE | UNCACHE | LAZY | TEMPORARY | OPTIONS | EXTENDED | REFRESH | CLEAR | CACHE | UNCACHE | LAZY | TEMPORARY | OPTIONS
| GROUPING | CUBE | ROLLUP | GROUPING | CUBE | ROLLUP
| EXPLAIN | FORMAT | LOGICAL | FORMATTED | EXPLAIN | FORMAT | LOGICAL | FORMATTED | CODEGEN
| TABLESAMPLE | USE | TO | BUCKET | PERCENTLIT | OUT | OF | TABLESAMPLE | USE | TO | BUCKET | PERCENTLIT | OUT | OF
| SET | SET
| VIEW | REPLACE | VIEW | REPLACE
...@@ -724,6 +724,7 @@ DESCRIBE: 'DESCRIBE'; ...@@ -724,6 +724,7 @@ DESCRIBE: 'DESCRIBE';
EXPLAIN: 'EXPLAIN'; EXPLAIN: 'EXPLAIN';
FORMAT: 'FORMAT'; FORMAT: 'FORMAT';
LOGICAL: 'LOGICAL'; LOGICAL: 'LOGICAL';
CODEGEN: 'CODEGEN';
CAST: 'CAST'; CAST: 'CAST';
SHOW: 'SHOW'; SHOW: 'SHOW';
TABLES: 'TABLES'; TABLES: 'TABLES';
......
...@@ -136,7 +136,8 @@ class SparkSqlAstBuilder extends AstBuilder { ...@@ -136,7 +136,8 @@ class SparkSqlAstBuilder extends AstBuilder {
// Create the explain comment. // Create the explain comment.
val statement = plan(ctx.statement) val statement = plan(ctx.statement)
if (isExplainableStatement(statement)) { if (isExplainableStatement(statement)) {
ExplainCommand(statement, extended = options.exists(_.EXTENDED != null)) ExplainCommand(statement, extended = options.exists(_.EXTENDED != null),
codegen = options.exists(_.CODEGEN != null))
} else { } else {
ExplainCommand(OneRowRelation) ExplainCommand(OneRowRelation)
} }
......
...@@ -28,10 +28,10 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} ...@@ -28,10 +28,10 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.debug._
import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._ import org.apache.spark.sql.types._
/** /**
* A logical command that is executed for its side-effects. `RunnableCommand`s are * A logical command that is executed for its side-effects. `RunnableCommand`s are
* wrapped in `ExecutedCommand` during execution. * wrapped in `ExecutedCommand` during execution.
...@@ -237,15 +237,22 @@ case class ExplainCommand( ...@@ -237,15 +237,22 @@ case class ExplainCommand(
logicalPlan: LogicalPlan, logicalPlan: LogicalPlan,
override val output: Seq[Attribute] = override val output: Seq[Attribute] =
Seq(AttributeReference("plan", StringType, nullable = true)()), Seq(AttributeReference("plan", StringType, nullable = true)()),
extended: Boolean = false) extended: Boolean = false,
codegen: Boolean = false)
extends RunnableCommand { extends RunnableCommand {
// Run through the optimizer to generate the physical plan. // Run through the optimizer to generate the physical plan.
override def run(sqlContext: SQLContext): Seq[Row] = try { override def run(sqlContext: SQLContext): Seq[Row] = try {
// TODO in Hive, the "extended" ExplainCommand prints the AST as well, and detailed properties. // TODO in Hive, the "extended" ExplainCommand prints the AST as well, and detailed properties.
val queryExecution = sqlContext.executePlan(logicalPlan) val queryExecution = sqlContext.executePlan(logicalPlan)
val outputString = if (extended) queryExecution.toString else queryExecution.simpleString val outputString =
if (codegen) {
codegenString(queryExecution.executedPlan)
} else if (extended) {
queryExecution.toString
} else {
queryExecution.simpleString
}
outputString.split("\n").map(Row(_)) outputString.split("\n").map(Row(_))
} catch { case cause: TreeNodeException[_] => } catch { case cause: TreeNodeException[_] =>
("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_))
......
...@@ -48,6 +48,25 @@ package object debug { ...@@ -48,6 +48,25 @@ package object debug {
// scalastyle:on println // scalastyle:on println
} }
/**
 * Renders every [[WholeStageCodegen]] subtree found in `plan`, together with the
 * Java source it generates, as one human-readable string.
 *
 * @param plan the physical plan to scan for whole-stage-codegen subtrees
 * @return a report listing each subtree followed by its formatted generated code
 */
def codegenString(plan: SparkPlan): String = {
  // Collect the distinct WholeStageCodegen roots. `transform` returns each node
  // unchanged, so the plan itself is left untouched; the set only dedupes.
  val subtrees = new collection.mutable.HashSet[WholeStageCodegen]()
  plan transform {
    case w: WholeStageCodegen =>
      subtrees += w
      w
    case other => other
  }
  val report = new StringBuilder(s"Found ${subtrees.size} WholeStageCodegen subtrees.\n")
  subtrees.toSeq.zipWithIndex.foreach { case (subtree, index) =>
    report.append(s"== Subtree ${index + 1} / ${subtrees.size} ==\n")
    report.append(subtree)
    report.append("\nGenerated code:\n")
    // doCodeGen returns (ctx, source); only the source text is reported.
    val (_, source) = subtree.doCodeGen()
    report.append(s"${CodeFormatter.format(source)}\n")
  }
  report.toString
}
/** /**
* Augments [[SQLContext]] with debug methods. * Augments [[SQLContext]] with debug methods.
*/ */
...@@ -81,28 +100,7 @@ package object debug { ...@@ -81,28 +100,7 @@ package object debug {
* WholeStageCodegen subtree). * WholeStageCodegen subtree).
*/ */
def debugCodegen(): Unit = { def debugCodegen(): Unit = {
debugPrint(debugCodegenString()) debugPrint(codegenString(query.queryExecution.executedPlan))
}
/** Visible for testing. */
def debugCodegenString(): String = {
val plan = query.queryExecution.executedPlan
val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegen]()
plan transform {
case s: WholeStageCodegen =>
codegenSubtrees += s
s
case s => s
}
var output = s"Found ${codegenSubtrees.size} WholeStageCodegen subtrees.\n"
for ((s, i) <- codegenSubtrees.toSeq.zipWithIndex) {
output += s"== Subtree ${i + 1} / ${codegenSubtrees.size} ==\n"
output += s
output += "\nGenerated code:\n"
val (_, source) = s.doCodeGen()
output += s"${CodeFormatter.format(source)}\n"
}
output
} }
} }
...@@ -123,6 +121,7 @@ package object debug { ...@@ -123,6 +121,7 @@ package object debug {
/** /**
* A collection of metrics for each column of output. * A collection of metrics for each column of output.
*
* @param elementTypes the actual runtime types for the output. Useful when there are bugs * @param elementTypes the actual runtime types for the output. Useful when there are bugs
* causing the wrong data to be projected. * causing the wrong data to be projected.
*/ */
......
...@@ -27,7 +27,7 @@ class DebuggingSuite extends SparkFunSuite with SharedSQLContext { ...@@ -27,7 +27,7 @@ class DebuggingSuite extends SparkFunSuite with SharedSQLContext {
} }
test("debugCodegen") { test("debugCodegen") {
val res = sqlContext.range(10).groupBy("id").count().debugCodegenString() val res = codegenString(sqlContext.range(10).groupBy("id").count().queryExecution.executedPlan)
assert(res.contains("Subtree 1 / 2")) assert(res.contains("Subtree 1 / 2"))
assert(res.contains("Subtree 2 / 2")) assert(res.contains("Subtree 2 / 2"))
assert(res.contains("Object[]")) assert(res.contains("Object[]"))
......
...@@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.TableIdentifier ...@@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.{BucketSpec, DataSource, LogicalRelation} import org.apache.spark.sql.execution.datasources.{BucketSpec, DataSource, LogicalRelation}
import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.HiveContext
......
...@@ -101,4 +101,33 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto ...@@ -101,4 +101,33 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
"Physical Plan should not contain Subquery since it's eliminated by optimizer") "Physical Plan should not contain Subquery since it's eliminated by optimizer")
} }
} }
test("EXPLAIN CODEGEN command") {
  // Fragments of generated code that every codegen explain must contain.
  val generatedCodeMarkers = Seq(
    "WholeStageCodegen",
    "Generated code:",
    "/* 001 */ public Object generate(Object[] references) {",
    "/* 002 */ return new GeneratedIterator(references);",
    "/* 003 */ }")

  // EXPLAIN CODEGEN prints generated code only — no physical plan section.
  checkExistence(sql("EXPLAIN CODEGEN SELECT 1"), true, generatedCodeMarkers: _*)
  checkExistence(sql("EXPLAIN CODEGEN SELECT 1"), false,
    "== Physical Plan ==")

  // CODEGEN wins over EXTENDED: still generated code only, no plan sections.
  checkExistence(sql("EXPLAIN EXTENDED CODEGEN SELECT 1"), true, generatedCodeMarkers: _*)
  checkExistence(sql("EXPLAIN EXTENDED CODEGEN SELECT 1"), false,
    "== Parsed Logical Plan ==",
    "== Analyzed Logical Plan ==",
    "== Optimized Logical Plan ==",
    "== Physical Plan ==")
}
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment