From fb0d60814a79747beb68da9613679141c44f2540 Mon Sep 17 00:00:00 2001 From: wangyang <wangyang@haizhi.com> Date: Sat, 5 Nov 2016 14:32:28 +0100 Subject: [PATCH] [SPARK-17849][SQL] Fix NPE problem when using grouping sets ## What changes were proposed in this pull request? Prior to this PR, the following code would cause an NPE: `case class point(a:String, b:String, c:String, d: Int)` `val data = Seq( point("1","2","3", 1), point("4","5","6", 1), point("7","8","9", 1) )` `sc.parallelize(data).toDF().registerTempTable("table")` `spark.sql("select a, b, c, count(d) from table group by a, b, c GROUPING SETS ((a)) ").show()` The reason is that when the grouping_id() behavior was changed in #10677, some code (which should have been changed) was left out. Taking the above code as an example: prior to #10677, the bit mask for set "(a)" was `001`, while after #10677 the bit mask was changed to `011`. However, the `nonNullBitmask` was not changed accordingly. This PR fixes this problem. ## How was this patch tested? Added integration tests. Author: wangyang <wangyang@haizhi.com> Closes #15416 from yangw1234/groupingid. 
--- .../sql/catalyst/analysis/Analyzer.scala | 9 +++- .../sql-tests/inputs/grouping_set.sql | 17 ++++++++ .../sql-tests/results/grouping_set.sql.out | 42 +++++++++++++++++++ 3 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 5011f2fdbf..8dbec40800 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -299,10 +299,15 @@ class Analyzer( case other => Alias(other, other.toString)() } - val nonNullBitmask = x.bitmasks.reduce(_ & _) + // The rightmost bit in the bitmasks corresponds to the last expression in groupByAliases + // with 0 indicating this expression is in the grouping set. The following line of code + // calculates the bitmask representing the expressions that are absent in at least one + // grouping set (indicated by 1). 
+ val nullBitmask = x.bitmasks.reduce(_ | _) + val attrLength = groupByAliases.length val expandedAttributes = groupByAliases.zipWithIndex.map { case (a, idx) => - a.toAttribute.withNullability((nonNullBitmask & 1 << idx) == 0) + a.toAttribute.withNullability(((nullBitmask >> (attrLength - idx - 1)) & 1) == 1) } val expand = Expand(x.bitmasks, groupByAliases, expandedAttributes, gid, x.child) diff --git a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql new file mode 100644 index 0000000000..3594283505 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql @@ -0,0 +1,17 @@ +CREATE TEMPORARY VIEW grouping AS SELECT * FROM VALUES + ("1", "2", "3", 1), + ("4", "5", "6", 1), + ("7", "8", "9", 1) + as grouping(a, b, c, d); + +-- SPARK-17849: grouping set throws NPE #1 +SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS (()); + +-- SPARK-17849: grouping set throws NPE #2 +SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((a)); + +-- SPARK-17849: grouping set throws NPE #3 +SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((c)); + + + diff --git a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out new file mode 100644 index 0000000000..edb38a52b7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out @@ -0,0 +1,42 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 4 + + +-- !query 0 +CREATE TEMPORARY VIEW grouping AS SELECT * FROM VALUES + ("1", "2", "3", 1), + ("4", "5", "6", 1), + ("7", "8", "9", 1) + as grouping(a, b, c, d) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS (()) +-- !query 1 schema +struct<a:string,b:string,c:string,count(d):bigint> +-- !query 1 output +NULL NULL 
NULL 3 + + +-- !query 2 +SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((a)) +-- !query 2 schema +struct<a:string,b:string,c:string,count(d):bigint> +-- !query 2 output +1 NULL NULL 1 +4 NULL NULL 1 +7 NULL NULL 1 + + +-- !query 3 +SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((c)) +-- !query 3 schema +struct<a:string,b:string,c:string,count(d):bigint> +-- !query 3 output +NULL NULL 3 1 +NULL NULL 6 1 +NULL NULL 9 1 -- GitLab