Skip to content
Snippets Groups Projects
Commit e2721231 authored by Cheng Hao's avatar Cheng Hao Committed by Yin Huai
Browse files

[SPARK-8972] [SQL] Incorrect result for rollup

We don't support the complex expression keys in the rollup/cube, and we even will not report it if we have the complex group by keys, that will cause very confusing/incorrect result.

e.g. `SELECT key%100 FROM src GROUP BY key %100 with ROLLUP`

This PR adds an additional project during the analyzing for the complex GROUP BY keys, and that projection will be the child of `Expand`, so to `Expand`, the GROUP BY KEY are always the simple key(attribute names).

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7343 from chenghao-intel/expand and squashes the following commits:

1ebbb59 [Cheng Hao] update the comment
827873f [Cheng Hao] update as feedback
34def69 [Cheng Hao] Add more unit test and comments
c695760 [Cheng Hao] fix bug of incorrect result for rollup
parent ba330968
No related branches found
No related tags found
No related merge requests found
Showing
with 145 additions and 3 deletions
......@@ -194,16 +194,52 @@ class Analyzer(
}
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case a if !a.childrenResolved => a // be sure all of the children are resolved.
case a: Cube =>
GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
case a: Rollup =>
GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
case x: GroupingSets =>
val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
// We will insert another Projection if the GROUP BY keys contains the
// non-attribute expressions. And the top operators can references those
// expressions by its alias.
// e.g. SELECT key%5 as c1 FROM src GROUP BY key%5 ==>
// SELECT a as c1 FROM (SELECT key%5 AS a FROM src) GROUP BY a
// find all of the non-attribute expressions in the GROUP BY keys
val nonAttributeGroupByExpressions = new ArrayBuffer[Alias]()
// The pair of (the original GROUP BY key, associated attribute)
val groupByExprPairs = x.groupByExprs.map(_ match {
case e: NamedExpression => (e, e.toAttribute)
case other => {
val alias = Alias(other, other.toString)()
nonAttributeGroupByExpressions += alias // add the non-attributes expression alias
(other, alias.toAttribute)
}
})
// substitute the non-attribute expressions for aggregations.
val aggregation = x.aggregations.map(expr => expr.transformDown {
case e => groupByExprPairs.find(_._1.semanticEquals(e)).map(_._2).getOrElse(e)
}.asInstanceOf[NamedExpression])
// substitute the group by expressions.
val newGroupByExprs = groupByExprPairs.map(_._2)
val child = if (nonAttributeGroupByExpressions.length > 0) {
// insert additional projection if contains the
// non-attribute expressions in the GROUP BY keys
Project(x.child.output ++ nonAttributeGroupByExpressions, x.child)
} else {
x.child
}
Aggregate(
x.groupByExprs :+ VirtualColumn.groupingIdAttribute,
x.aggregations,
Expand(x.bitmasks, x.groupByExprs, gid, x.child))
newGroupByExprs :+ VirtualColumn.groupingIdAttribute,
aggregation,
Expand(x.bitmasks, newGroupByExprs, gid, child))
}
}
......
500 NULL 0
91 0 1
84 1 1
105 2 1
113 3 1
107 4 1
1 NULL -3 2
1 NULL -1 2
1 NULL 3 2
1 NULL 4 2
1 NULL 5 2
1 NULL 6 2
1 NULL 12 2
1 NULL 14 2
1 NULL 15 2
1 NULL 22 2
1 NULL -3 2
1 NULL -1 2
1 NULL 3 2
1 NULL 4 2
1 NULL 5 2
1 NULL 6 2
1 NULL 12 2
1 NULL 14 2
1 NULL 15 2
1 NULL 22 2
500 NULL 0
91 0 1
84 1 1
105 2 1
113 3 1
107 4 1
1 0 5 3
1 0 15 3
1 0 25 3
1 0 60 3
1 0 75 3
1 0 80 3
1 0 100 3
1 0 140 3
1 0 145 3
1 0 150 3
1 0 5 3
1 0 15 3
1 0 25 3
1 0 60 3
1 0 75 3
1 0 80 3
1 0 100 3
1 0 140 3
1 0 145 3
1 0 150 3
......@@ -85,6 +85,60 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
}
}
createQueryTest("SPARK-8976 Wrong Result for Rollup #1",
"""
SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH ROLLUP
""".stripMargin)
createQueryTest("SPARK-8976 Wrong Result for Rollup #2",
"""
SELECT
count(*) AS cnt,
key % 5 as k1,
key-5 as k2,
GROUPING__ID as k3
FROM src group by key%5, key-5
WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
""".stripMargin)
createQueryTest("SPARK-8976 Wrong Result for Rollup #3",
"""
SELECT
count(*) AS cnt,
key % 5 as k1,
key-5 as k2,
GROUPING__ID as k3
FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
""".stripMargin)
createQueryTest("SPARK-8976 Wrong Result for CUBE #1",
"""
SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH CUBE
""".stripMargin)
createQueryTest("SPARK-8976 Wrong Result for CUBE #2",
"""
SELECT
count(*) AS cnt,
key % 5 as k1,
key-5 as k2,
GROUPING__ID as k3
FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10
""".stripMargin)
createQueryTest("SPARK-8976 Wrong Result for GroupingSet",
"""
SELECT
count(*) AS cnt,
key % 5 as k1,
key-5 as k2,
GROUPING__ID as k3
FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10
""".stripMargin)
createQueryTest("insert table with generator with column name",
"""
| CREATE TABLE gen_tmp (key Int);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment