From 6550086bbdf4c1581cbfa90550c5a388e531a736 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro <yamamuro@apache.org> Date: Sat, 29 Jul 2017 10:14:47 -0700 Subject: [PATCH] [SPARK-20962][SQL] Support subquery column aliases in FROM clause ## What changes were proposed in this pull request? This pr added parsing rules to support subquery column aliases in FROM clause. This pr is a sub-task of #18079. ## How was this patch tested? Added tests in `PlanParserSuite` and `SQLQueryTestSuite`. Author: Takeshi Yamamuro <yamamuro@apache.org> Closes #18185 from maropu/SPARK-20962. --- .../spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../sql/catalyst/analysis/Analyzer.scala | 25 +++++++++++++++++++ .../sql/catalyst/analysis/unresolved.scala | 23 ++++++++++++++++- .../sql/catalyst/parser/AstBuilder.scala | 18 +++++++++---- .../sql/catalyst/analysis/AnalysisSuite.scala | 20 +++++++++++++++ .../sql/catalyst/parser/PlanParserSuite.scala | 13 +++++++++- .../sql-tests/inputs/table-aliases.sql | 3 +++ .../sql-tests/results/table-aliases.sql.out | 10 +++++++- 8 files changed, 105 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index ef9f88a902..4534b7dcf6 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -474,7 +474,7 @@ identifierComment relationPrimary : tableIdentifier sample? tableAlias #tableName - | '(' queryNoWith ')' sample? (AS? strictIdentifier)? #aliasedQuery + | '(' queryNoWith ')' sample? tableAlias #aliasedQuery | '(' relation ')' sample? (AS? strictIdentifier)? 
#aliasedRelation | inlineTable #inlineTableDefault2 | functionTable #tableValuedFunction diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index f987ed8801..a6d297cfd6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -141,6 +141,7 @@ class Analyzer( ResolveFunctions :: ResolveAliases :: ResolveSubquery :: + ResolveSubqueryColumnAliases :: ResolveWindowOrder :: ResolveWindowFrame :: ResolveNaturalAndUsingJoin :: @@ -1323,6 +1324,30 @@ class Analyzer( } } + /** + * Replaces unresolved column aliases for a subquery with projections. + */ + object ResolveSubqueryColumnAliases extends Rule[LogicalPlan] { + + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + case u @ UnresolvedSubqueryColumnAliases(columnNames, child) if child.resolved => + // Resolves output attributes if a query has alias names in its subquery: + // e.g., SELECT * FROM (SELECT 1 AS a, 1 AS b) t(col1, col2) + val outputAttrs = child.output + // Checks if the number of the aliases equals to the number of output columns + // in the subquery. + if (columnNames.size != outputAttrs.size) { + u.failAnalysis("Number of column aliases does not match number of columns. " + + s"Number of column aliases: ${columnNames.size}; " + + s"number of columns: ${outputAttrs.size}.") + } + val aliases = outputAttrs.zip(columnNames).map { case (attr, aliasName) => + Alias(attr, aliasName)() + } + Project(aliases, child) + } + } + /** * Turns projections that contain aggregate expressions into aggregations. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index fb322697c7..b7a704dc84 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode} import org.apache.spark.sql.catalyst.parser.ParserUtils -import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, UnaryNode} import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.quoteIdentifier import org.apache.spark.sql.types.{DataType, Metadata, StructType} @@ -422,6 +422,27 @@ case class UnresolvedAlias( override lazy val resolved = false } +/** + * Aliased column names resolved by positions for subquery. We could add alias names for output + * columns in the subquery: + * {{{ + * // Assign alias names for output columns + * SELECT col1, col2 FROM testData AS t(col1, col2); + * }}} + * + * @param outputColumnNames the aliased column names, matched by position to the output columns + * of the subquery. + * @param child the [[LogicalPlan]] on which these subquery column aliases apply. + */ +case class UnresolvedSubqueryColumnAliases( + outputColumnNames: Seq[String], + child: LogicalPlan) + extends UnaryNode { + + override def output: Seq[Attribute] = Nil + + override lazy val resolved = false +} + /** * Holds the deserializer expression and the attributes that are available during the resolution * for it.
Deserializer expression is a special kind of expression that is not always resolved by diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 6795be758e..0757826178 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -750,20 +750,28 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging /** * Create an alias (SubqueryAlias) for a sub-query. This is practically the same as * visitAliasedRelation and visitNamedExpression, ANTLR4 however requires us to use 3 different - * hooks. + * hooks. We could add alias names for output columns, for example: + * {{{ + * SELECT col1, col2 FROM testData AS t(col1, col2) + * }}} */ override def visitAliasedQuery(ctx: AliasedQueryContext): LogicalPlan = withOrigin(ctx) { - val alias = if (ctx.strictIdentifier == null) { + val alias = if (ctx.tableAlias.strictIdentifier == null) { // For un-aliased subqueries, use a default alias name that is not likely to conflict with // normal subquery names, so that parent operators can only access the columns in subquery by // unqualified names. Users can still use this special qualifier to access columns if they // know it, but that's not recommended. 
"__auto_generated_subquery_name" } else { - ctx.strictIdentifier.getText + ctx.tableAlias.strictIdentifier.getText + } + val subquery = SubqueryAlias(alias, plan(ctx.queryNoWith).optionalMap(ctx.sample)(withSample)) + if (ctx.tableAlias.identifierList != null) { + val columnAliases = visitIdentifierList(ctx.tableAlias.identifierList) + UnresolvedSubqueryColumnAliases(columnAliases, subquery) + } else { + subquery } - - SubqueryAlias(alias, plan(ctx.queryNoWith).optionalMap(ctx.sample)(withSample)) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index be26b1b26f..9bcf4773fa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -470,4 +470,24 @@ class AnalysisSuite extends AnalysisTest with ShouldMatchers { Seq("Number of column aliases does not match number of columns. Table name: TaBlE3; " + "number of column aliases: 5; number of columns: 4.")) } + + test("SPARK-20962 Support subquery column aliases in FROM clause") { + def tableColumnsWithAliases(outputNames: Seq[String]): LogicalPlan = { + UnresolvedSubqueryColumnAliases( + outputNames, + SubqueryAlias( + "t", + UnresolvedRelation(TableIdentifier("TaBlE3"))) + ).select(star()) + } + assertAnalysisSuccess(tableColumnsWithAliases("col1" :: "col2" :: "col3" :: "col4" :: Nil)) + assertAnalysisError( + tableColumnsWithAliases("col1" :: Nil), + Seq("Number of column aliases does not match number of columns. " + + "Number of column aliases: 1; number of columns: 4.")) + assertAnalysisError( + tableColumnsWithAliases("col1" :: "col2" :: "col3" :: "col4" :: "col5" :: Nil), + Seq("Number of column aliases does not match number of columns. 
" + + "Number of column aliases: 5; number of columns: 4.")) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 725bcb833f..c7f39ae181 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.parser import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAttribute, UnresolvedFunction, UnresolvedGenerator, UnresolvedInlineTable, UnresolvedRelation, UnresolvedTableValuedFunction} +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAttribute, UnresolvedFunction, UnresolvedGenerator, UnresolvedInlineTable, UnresolvedRelation, UnresolvedSubqueryColumnAliases, UnresolvedTableValuedFunction} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -495,6 +495,17 @@ class PlanParserSuite extends AnalysisTest { .select(star())) } + test("SPARK-20962 Support subquery column aliases in FROM clause") { + assertEqual( + "SELECT * FROM (SELECT a AS x, b AS y FROM t) t(col1, col2)", + UnresolvedSubqueryColumnAliases( + Seq("col1", "col2"), + SubqueryAlias( + "t", + UnresolvedRelation(TableIdentifier("t")).select('a.as("x"), 'b.as("y"))) + ).select(star())) + } + test("inline table") { assertEqual("values 1, 2, 3, 4", UnresolvedInlineTable(Seq("col1"), Seq(1, 2, 3, 4).map(x => Seq(Literal(x))))) diff --git a/sql/core/src/test/resources/sql-tests/inputs/table-aliases.sql b/sql/core/src/test/resources/sql-tests/inputs/table-aliases.sql index c90a9c7f85..85481cbbf9 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/table-aliases.sql +++ 
b/sql/core/src/test/resources/sql-tests/inputs/table-aliases.sql @@ -15,3 +15,6 @@ SELECT * FROM testData AS t(col1); -- Check alias duplication SELECT a AS col1, b AS col2 FROM testData AS t(c, d); + +-- Subquery aliases in FROM clause +SELECT * FROM (SELECT 1 AS a, 1 AS b) t(col1, col2); diff --git a/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out b/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out index 7abbcd834a..4459f3186c 100644 --- a/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 7 +-- Number of queries: 8 -- !query 0 @@ -61,3 +61,11 @@ struct<> -- !query 6 output org.apache.spark.sql.AnalysisException cannot resolve '`a`' given input columns: [t.c, t.d]; line 1 pos 7 + + +-- !query 7 +SELECT * FROM (SELECT 1 AS a, 1 AS b) t(col1, col2) +-- !query 7 schema +struct<col1:int,col2:int> +-- !query 7 output +1 1 -- GitLab