From a3d5300a030fb5f1c275e671603e0745b6466735 Mon Sep 17 00:00:00 2001 From: Stan Zhai <mail@zhaishidan.cn> Date: Thu, 9 Feb 2017 21:01:25 +0100 Subject: [PATCH] [SPARK-19509][SQL] Grouping Sets do not respect nullable grouping columns ## What changes were proposed in this pull request? The analyzer currently does not check if a column used in grouping sets is actually nullable itself. This can cause the nullability of the column to be incorrect, which can cause null pointer exceptions down the line. This PR fixes that by also consider the nullability of the column. This is only a problem for Spark 2.1 and below. The latest master uses a different approach. Closes https://github.com/apache/spark/pull/16874 ## How was this patch tested? Added a regression test to `SQLQueryTestSuite.grouping_set`. Author: Herman van Hovell <hvanhovell@databricks.com> Closes #16873 from hvanhovell/SPARK-19509. --- .../sql/catalyst/analysis/Analyzer.scala | 3 +- .../sql-tests/inputs/grouping_set.sql | 12 ++++- .../sql-tests/results/grouping_set.sql.out | 53 +++++++++++++++---- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 441c891b2c..f41e43431a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -307,7 +307,8 @@ class Analyzer( val attrLength = groupByAliases.length val expandedAttributes = groupByAliases.zipWithIndex.map { case (a, idx) => - a.toAttribute.withNullability(((nullBitmask >> (attrLength - idx - 1)) & 1) == 1) + val canBeNull = ((nullBitmask >> (attrLength - idx - 1)) & 1) == 1 + a.toAttribute.withNullability(a.nullable || canBeNull) } val expand = Expand(x.bitmasks, groupByAliases, expandedAttributes, gid, x.child) diff --git a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql index 3594283505..2b54658a07 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql @@ -2,7 +2,12 @@ CREATE TEMPORARY VIEW grouping AS SELECT * FROM VALUES ("1", "2", "3", 1), ("4", "5", "6", 1), ("7", "8", "9", 1) - as grouping(a, b, c, d); + AS grouping(a, b, c, d); + +CREATE TEMPORARY VIEW grouping_null AS SELECT * FROM VALUES + CAST(NULL AS STRING), + CAST(NULL AS STRING) + AS T(e); -- SPARK-17849: grouping set throws NPE #1 SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS (()); @@ -13,5 +18,8 @@ SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((a)); -- SPARK-17849: grouping set throws NPE #3 SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((c)); +-- SPARK-19509: grouping set should honor input nullability +SELECT COUNT(1) FROM grouping_null GROUP BY e GROUPING SETS (e); - +DROP VIEW IF EXISTS grouping; +DROP VIEW IF EXISTS grouping_null; diff --git a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out index edb38a52b7..a9c056555d 100644 --- a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 4 +-- Number of queries: 8 -- !query 0 @@ -7,7 +7,7 @@ CREATE TEMPORARY VIEW grouping AS SELECT * FROM VALUES ("1", "2", "3", 1), ("4", "5", "6", 1), ("7", "8", "9", 1) - as grouping(a, b, c, d) + AS grouping(a, b, c, d) -- !query 0 schema struct<> -- !query 0 output @@ -15,28 +15,63 @@ struct<> -- !query 1 -SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS (()) +CREATE TEMPORARY VIEW grouping_null AS SELECT * FROM VALUES + CAST(NULL AS STRING), + CAST(NULL AS STRING) + AS T(e) -- !query 1 schema -struct<a:string,b:string,c:string,count(d):bigint> +struct<> -- !query 1 output -NULL NULL NULL 3 + -- !query 2 -SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((a)) +SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS (()) -- !query 2 schema struct<a:string,b:string,c:string,count(d):bigint> -- !query 2 output +NULL NULL NULL 3 + + +-- !query 3 +SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((a)) +-- !query 3 schema +struct<a:string,b:string,c:string,count(d):bigint> +-- !query 3 output 1 NULL NULL 1 4 NULL NULL 1 7 NULL NULL 1 --- !query 3 +-- !query 4 SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((c)) --- !query 3 schema +-- !query 4 schema struct<a:string,b:string,c:string,count(d):bigint> --- !query 3 output +-- !query 4 output NULL NULL 3 1 NULL NULL 6 1 NULL NULL 9 1 + + +-- !query 5 +SELECT COUNT(1) FROM grouping_null GROUP BY e GROUPING SETS (e) +-- !query 5 schema +struct<count(1):bigint> +-- !query 5 output +2 + + +-- !query 6 +DROP VIEW IF EXISTS grouping +-- !query 6 schema +struct<> +-- !query 6 output + + + +-- !query 7 +DROP VIEW IF EXISTS grouping_null +-- !query 7 schema +struct<> +-- !query 7 output + -- GitLab