Skip to content
Snippets Groups Projects
Commit 5d799473 authored by Reynold Xin's avatar Reynold Xin Committed by Herman van Hovell
Browse files

[SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics

## What changes were proposed in this pull request?
This patch reduces the default number element estimation for arrays and maps from 100 to 1. The issue with the 100 number is that when nested (e.g. an array of map), 100 * 100 would be used as the default size. This sounds like just an overestimation which doesn't seem that bad (since it is usually better to overestimate than underestimate). However, due to the way we assume the size output for Project (new estimated column size / old estimated column size), this overestimation can become underestimation. It is actually in general in this case safer to assume 1 default element.

## How was this patch tested?
This should be covered by existing tests.

Author: Reynold Xin <rxin@databricks.com>

Closes #16274 from rxin/SPARK-18853.
parent 89ae26dc
No related branches found
No related tags found
No related merge requests found
......@@ -78,10 +78,10 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT
("containsNull" -> containsNull)
/**
* The default size of a value of the ArrayType is 100 * the default size of the element type.
* (We assume that there are 100 elements).
* The default size of a value of the ArrayType is the default size of the element type.
* We assume that there is only 1 element on average in an array. See SPARK-18853.
*/
override def defaultSize: Int = 100 * elementType.defaultSize
override def defaultSize: Int = 1 * elementType.defaultSize
override def simpleString: String = s"array<${elementType.simpleString}>"
......
......@@ -56,10 +56,10 @@ case class MapType(
/**
* The default size of a value of the MapType is
* 100 * (the default size of the key type + the default size of the value type).
* (We assume that there are 100 elements).
* (the default size of the key type + the default size of the value type).
* We assume that there is only 1 element on average in a map. See SPARK-18853.
*/
override def defaultSize: Int = 100 * (keyType.defaultSize + valueType.defaultSize)
override def defaultSize: Int = 1 * (keyType.defaultSize + valueType.defaultSize)
override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>"
......
......@@ -253,7 +253,7 @@ class DataTypeSuite extends SparkFunSuite {
checkDataTypeJsonRepr(structType)
def checkDefaultSize(dataType: DataType, expectedDefaultSize: Int): Unit = {
test(s"Check the default size of ${dataType}") {
test(s"Check the default size of $dataType") {
assert(dataType.defaultSize === expectedDefaultSize)
}
}
......@@ -272,18 +272,18 @@ class DataTypeSuite extends SparkFunSuite {
checkDefaultSize(TimestampType, 8)
checkDefaultSize(StringType, 20)
checkDefaultSize(BinaryType, 100)
checkDefaultSize(ArrayType(DoubleType, true), 800)
checkDefaultSize(ArrayType(StringType, false), 2000)
checkDefaultSize(MapType(IntegerType, StringType, true), 2400)
checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 80400)
checkDefaultSize(structType, 812)
checkDefaultSize(ArrayType(DoubleType, true), 8)
checkDefaultSize(ArrayType(StringType, false), 20)
checkDefaultSize(MapType(IntegerType, StringType, true), 24)
checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 12)
checkDefaultSize(structType, 20)
def checkEqualsIgnoreCompatibleNullability(
from: DataType,
to: DataType,
expected: Boolean): Unit = {
val testName =
s"equalsIgnoreCompatibleNullability: (from: ${from}, to: ${to})"
s"equalsIgnoreCompatibleNullability: (from: $from, to: $to)"
test(testName) {
assert(DataType.equalsIgnoreCompatibleNullability(from, to) === expected)
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment