[SPARK-12888][SQL] benchmark the new hash expression

Benchmark it on 4 different schemas, the result: ``` Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz Hash For simple: Avg Time(ms) Avg Rate(M/s) Relative Rate ------------------------------------------------------------------------------- interpreted version 31.47 266.54 1.00 X codegen version 64.52 130.01 0.49 X ``` ``` Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz Hash For normal: Avg Time(ms) Avg Rate(M/s) Relative Rate ------------------------------------------------------------------------------- interpreted version 4068.11 0.26 1.00 X codegen version 1175.92 0.89 3.46 X ``` ``` Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz Hash For array: Avg Time(ms) Avg Rate(M/s) Relative Rate ------------------------------------------------------------------------------- interpreted version 9276.70 0.06 1.00 X codegen version 14762.23 0.04 0.63 X ``` ``` Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz Hash For map: Avg Time(ms) Avg Rate(M/s) Relative Rate ------------------------------------------------------------------------------- interpreted version 58869.79 0.01 1.00 X codegen version 9285.36 0.06 6.34 X ``` Author: Wenchen Fan <wenchen@databricks.com> Closes #10816 from cloud-fan/hash-benchmark.

[SPARK-12888][SQL] benchmark the new hash expression
f3934a8d · Wenchen Fan · Reynold Xin · 8f90c151 · f3934a8d
Commit f3934a8d authored 9 years ago by Wenchen Fan Committed by Reynold Xin 9 years ago
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql
+import org.apache.spark.sql.catalyst.encoders.RowEncoder
+import org.apache.spark.sql.catalyst.expressions.{Murmur3Hash, UnsafeProjection}
+import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection
+import org.apache.spark.sql.types._
+import org.apache.spark.util.Benchmark
+/**
+ * Benchmark for the previous interpreted hash function(InternalRow.hashCode) vs the new codegen
+ * hash expression(Murmur3Hash).
+ */
+object HashBenchmark {
+  def test(name: String, schema: StructType, iters: Int): Unit = {
+    val numRows = 1024 * 8
+    val generator = RandomDataGenerator.forType(schema, nullable = false).get
+    val encoder = RowEncoder(schema)
+    val attrs = schema.toAttributes
+    val safeProjection = GenerateSafeProjection.generate(attrs, attrs)
+    val rows = (1 to numRows).map(_ =>
+      // The output of encoder is UnsafeRow, use safeProjection to turn in into safe format.
+      safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy()
+    ).toArray
+    val benchmark = new Benchmark("Hash For " + name, iters * numRows)
+    benchmark.addCase("interpreted version") { _: Int =>
+      for (_ <- 0L until iters) {
+        var sum = 0
+        var i = 0
+        while (i < numRows) {
+          sum += rows(i).hashCode()
+          i += 1
+        }
+      }
+    }
+    val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs)
+    benchmark.addCase("codegen version") { _: Int =>
+      for (_ <- 0L until iters) {
+        var sum = 0
+        var i = 0
+        while (i < numRows) {
+          sum += getHashCode(rows(i)).getInt(0)
+          i += 1
+        }
+      }
+    }
+    benchmark.run()
+  }
+  def main(args: Array[String]): Unit = {
+    val simple = new StructType().add("i", IntegerType)
+    test("simple", simple, 1024)
+    val normal = new StructType()
+      .add("null", NullType)
+      .add("boolean", BooleanType)
+      .add("byte", ByteType)
+      .add("short", ShortType)
+      .add("int", IntegerType)
+      .add("long", LongType)
+      .add("float", FloatType)
+      .add("double", DoubleType)
+      .add("bigDecimal", DecimalType.SYSTEM_DEFAULT)
+      .add("smallDecimal", DecimalType.USER_DEFAULT)
+      .add("string", StringType)
+      .add("binary", BinaryType)
+      .add("date", DateType)
+      .add("timestamp", TimestampType)
+    test("normal", normal, 128)
+    val arrayOfInt = ArrayType(IntegerType)
+    val array = new StructType()
+      .add("array", arrayOfInt)
+      .add("arrayOfArray", ArrayType(arrayOfInt))
+    test("array", array, 64)
+    val mapOfInt = MapType(IntegerType, IntegerType)
+    val map = new StructType()
+      .add("map", mapOfInt)
+      .add("mapOfMap", MapType(IntegerType, mapOfInt))
+    test("map", map, 64)
+  }
+}