Skip to content
Snippets Groups Projects
Commit 7fe4fe63 authored by Wenchen Fan's avatar Wenchen Fan Committed by Davies Liu
Browse files

[SPARK-12888] [SQL] [FOLLOW-UP] benchmark the new hash expression

Adds the benchmark results as comments.

The codegen version is slower than the interpreted version for the `simple` case for 3 reasons:

1. The codegen version uses a more complex hash algorithm than the interpreted version, i.e. `Murmur3_x86_32.hashInt` vs [simple multiplication and addition](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala#L153).
2. The codegen version writes the hash value to a row first and then reads it back out. I tried to create a `GenerateHasher` that generates code to return the hash value directly and got about a 60% speedup for the `simple` case; is it worth it?
3. The row in the `simple` case has only one int field, so the runtime reflection may be removed because of branch prediction, which makes the interpreted version faster.

The `array` case is also slow for similar reasons, e.g. the array elements are all of the same type, so the interpreted version can probably get rid of runtime reflection through branch prediction.

Author: Wenchen Fan <wenchen@databricks.com>

Closes #10917 from cloud-fan/hash-benchmark.
parent 2dbb9164
No related branches found
No related tags found
No related merge requests found
...@@ -35,7 +35,8 @@ import org.apache.commons.lang3.SystemUtils ...@@ -35,7 +35,8 @@ import org.apache.commons.lang3.SystemUtils
* If outputPerIteration is true, the timing for each run will be printed to stdout. * If outputPerIteration is true, the timing for each run will be printed to stdout.
*/ */
private[spark] class Benchmark( private[spark] class Benchmark(
name: String, valuesPerIteration: Long, name: String,
valuesPerIteration: Long,
iters: Int = 5, iters: Int = 5,
outputPerIteration: Boolean = false) { outputPerIteration: Boolean = false) {
val benchmarks = mutable.ArrayBuffer.empty[Benchmark.Case] val benchmarks = mutable.ArrayBuffer.empty[Benchmark.Case]
...@@ -61,7 +62,6 @@ private[spark] class Benchmark( ...@@ -61,7 +62,6 @@ private[spark] class Benchmark(
println println
val firstBest = results.head.bestMs val firstBest = results.head.bestMs
val firstAvg = results.head.avgMs
// The results are going to be processor specific so it is useful to include that. // The results are going to be processor specific so it is useful to include that.
println(Benchmark.getProcessorName()) println(Benchmark.getProcessorName())
printf("%-35s %16s %12s %13s %10s\n", name + ":", "Best/Avg Time(ms)", "Rate(M/s)", printf("%-35s %16s %12s %13s %10s\n", name + ":", "Best/Avg Time(ms)", "Rate(M/s)",
......
...@@ -29,9 +29,7 @@ import org.apache.spark.util.Benchmark ...@@ -29,9 +29,7 @@ import org.apache.spark.util.Benchmark
*/ */
object HashBenchmark { object HashBenchmark {
def test(name: String, schema: StructType, iters: Int): Unit = { def test(name: String, schema: StructType, numRows: Int, iters: Int): Unit = {
val numRows = 1024 * 8
val generator = RandomDataGenerator.forType(schema, nullable = false).get val generator = RandomDataGenerator.forType(schema, nullable = false).get
val encoder = RowEncoder(schema) val encoder = RowEncoder(schema)
val attrs = schema.toAttributes val attrs = schema.toAttributes
...@@ -70,7 +68,14 @@ object HashBenchmark { ...@@ -70,7 +68,14 @@ object HashBenchmark {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val simple = new StructType().add("i", IntegerType) val simple = new StructType().add("i", IntegerType)
test("simple", simple, 1024) /*
Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
Hash For simple: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------
interpreted version 941 / 955 142.6 7.0 1.0X
codegen version 1737 / 1775 77.3 12.9 0.5X
*/
test("simple", simple, 1 << 13, 1 << 14)
val normal = new StructType() val normal = new StructType()
.add("null", NullType) .add("null", NullType)
...@@ -87,18 +92,39 @@ object HashBenchmark { ...@@ -87,18 +92,39 @@ object HashBenchmark {
.add("binary", BinaryType) .add("binary", BinaryType)
.add("date", DateType) .add("date", DateType)
.add("timestamp", TimestampType) .add("timestamp", TimestampType)
test("normal", normal, 128) /*
Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------
interpreted version 2209 / 2271 0.9 1053.4 1.0X
codegen version 1887 / 2018 1.1 899.9 1.2X
*/
test("normal", normal, 1 << 10, 1 << 11)
val arrayOfInt = ArrayType(IntegerType) val arrayOfInt = ArrayType(IntegerType)
val array = new StructType() val array = new StructType()
.add("array", arrayOfInt) .add("array", arrayOfInt)
.add("arrayOfArray", ArrayType(arrayOfInt)) .add("arrayOfArray", ArrayType(arrayOfInt))
test("array", array, 64) /*
Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------
interpreted version 1481 / 1529 0.1 11301.7 1.0X
codegen version 2591 / 2636 0.1 19771.1 0.6X
*/
test("array", array, 1 << 8, 1 << 9)
val mapOfInt = MapType(IntegerType, IntegerType) val mapOfInt = MapType(IntegerType, IntegerType)
val map = new StructType() val map = new StructType()
.add("map", mapOfInt) .add("map", mapOfInt)
.add("mapOfMap", MapType(IntegerType, mapOfInt)) .add("mapOfMap", MapType(IntegerType, mapOfInt))
test("map", map, 64) /*
Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------
interpreted version 1820 / 1861 0.0 444347.2 1.0X
codegen version 205 / 223 0.0 49936.5 8.9X
*/
test("map", map, 1 << 6, 1 << 6)
} }
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment