Skip to content
Snippets Groups Projects
Commit 63eee86c authored by Liang-Chi Hsieh's avatar Liang-Chi Hsieh Committed by Davies Liu
Browse files

[SPARK-9297] [SQL] Add covar_pop and covar_samp

JIRA: https://issues.apache.org/jira/browse/SPARK-9297

Add two aggregation functions: covar_pop and covar_samp.

Author: Liang-Chi Hsieh <viirya@gmail.com>
Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #10029 from viirya/covar-funcs.
parent d6fd9b37
No related branches found
No related tags found
No related merge requests found
......@@ -182,6 +182,8 @@ object FunctionRegistry {
expression[Average]("avg"),
expression[Corr]("corr"),
expression[Count]("count"),
expression[CovPopulation]("covar_pop"),
expression[CovSample]("covar_samp"),
expression[First]("first"),
expression[First]("first_value"),
expression[Last]("last"),
......
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.catalyst.expressions.aggregate
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types._
/**
* Compute the covariance between two expressions.
* When applied on empty data (i.e., count is zero), it returns NULL.
*
*/
abstract class Covariance(left: Expression, right: Expression) extends ImperativeAggregate
with Serializable {
override def children: Seq[Expression] = Seq(left, right)
override def nullable: Boolean = true
override def dataType: DataType = DoubleType
override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType, DoubleType)
override def checkInputDataTypes(): TypeCheckResult = {
if (left.dataType.isInstanceOf[DoubleType] && right.dataType.isInstanceOf[DoubleType]) {
TypeCheckResult.TypeCheckSuccess
} else {
TypeCheckResult.TypeCheckFailure(
s"covariance requires that both arguments are double type, " +
s"not (${left.dataType}, ${right.dataType}).")
}
}
override def aggBufferSchema: StructType = StructType.fromAttributes(aggBufferAttributes)
override def inputAggBufferAttributes: Seq[AttributeReference] = {
aggBufferAttributes.map(_.newInstance())
}
override val aggBufferAttributes: Seq[AttributeReference] = Seq(
AttributeReference("xAvg", DoubleType)(),
AttributeReference("yAvg", DoubleType)(),
AttributeReference("Ck", DoubleType)(),
AttributeReference("count", LongType)())
// Local cache of mutableAggBufferOffset(s) that will be used in update and merge
val xAvgOffset = mutableAggBufferOffset
val yAvgOffset = mutableAggBufferOffset + 1
val CkOffset = mutableAggBufferOffset + 2
val countOffset = mutableAggBufferOffset + 3
// Local cache of inputAggBufferOffset(s) that will be used in update and merge
val inputXAvgOffset = inputAggBufferOffset
val inputYAvgOffset = inputAggBufferOffset + 1
val inputCkOffset = inputAggBufferOffset + 2
val inputCountOffset = inputAggBufferOffset + 3
override def initialize(buffer: MutableRow): Unit = {
buffer.setDouble(xAvgOffset, 0.0)
buffer.setDouble(yAvgOffset, 0.0)
buffer.setDouble(CkOffset, 0.0)
buffer.setLong(countOffset, 0L)
}
override def update(buffer: MutableRow, input: InternalRow): Unit = {
val leftEval = left.eval(input)
val rightEval = right.eval(input)
if (leftEval != null && rightEval != null) {
val x = leftEval.asInstanceOf[Double]
val y = rightEval.asInstanceOf[Double]
var xAvg = buffer.getDouble(xAvgOffset)
var yAvg = buffer.getDouble(yAvgOffset)
var Ck = buffer.getDouble(CkOffset)
var count = buffer.getLong(countOffset)
val deltaX = x - xAvg
val deltaY = y - yAvg
count += 1
xAvg += deltaX / count
yAvg += deltaY / count
Ck += deltaX * (y - yAvg)
buffer.setDouble(xAvgOffset, xAvg)
buffer.setDouble(yAvgOffset, yAvg)
buffer.setDouble(CkOffset, Ck)
buffer.setLong(countOffset, count)
}
}
// Merge counters from other partitions. Formula can be found at:
// http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
override def merge(buffer1: MutableRow, buffer2: InternalRow): Unit = {
val count2 = buffer2.getLong(inputCountOffset)
// We only go to merge two buffers if there is at least one record aggregated in buffer2.
// We don't need to check count in buffer1 because if count2 is more than zero, totalCount
// is more than zero too, then we won't get a divide by zero exception.
if (count2 > 0) {
var xAvg = buffer1.getDouble(xAvgOffset)
var yAvg = buffer1.getDouble(yAvgOffset)
var Ck = buffer1.getDouble(CkOffset)
var count = buffer1.getLong(countOffset)
val xAvg2 = buffer2.getDouble(inputXAvgOffset)
val yAvg2 = buffer2.getDouble(inputYAvgOffset)
val Ck2 = buffer2.getDouble(inputCkOffset)
val totalCount = count + count2
val deltaX = xAvg - xAvg2
val deltaY = yAvg - yAvg2
Ck += Ck2 + deltaX * deltaY * count / totalCount * count2
xAvg = (xAvg * count + xAvg2 * count2) / totalCount
yAvg = (yAvg * count + yAvg2 * count2) / totalCount
count = totalCount
buffer1.setDouble(xAvgOffset, xAvg)
buffer1.setDouble(yAvgOffset, yAvg)
buffer1.setDouble(CkOffset, Ck)
buffer1.setLong(countOffset, count)
}
}
}
case class CovSample(
left: Expression,
right: Expression,
mutableAggBufferOffset: Int = 0,
inputAggBufferOffset: Int = 0)
extends Covariance(left, right) {
override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate =
copy(mutableAggBufferOffset = newMutableAggBufferOffset)
override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate =
copy(inputAggBufferOffset = newInputAggBufferOffset)
override def eval(buffer: InternalRow): Any = {
val count = buffer.getLong(countOffset)
if (count > 1) {
val Ck = buffer.getDouble(CkOffset)
val cov = Ck / (count - 1)
if (cov.isNaN) {
null
} else {
cov
}
} else {
null
}
}
}
case class CovPopulation(
left: Expression,
right: Expression,
mutableAggBufferOffset: Int = 0,
inputAggBufferOffset: Int = 0)
extends Covariance(left, right) {
override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate =
copy(mutableAggBufferOffset = newMutableAggBufferOffset)
override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate =
copy(inputAggBufferOffset = newInputAggBufferOffset)
override def eval(buffer: InternalRow): Any = {
val count = buffer.getLong(countOffset)
if (count > 0) {
val Ck = buffer.getDouble(CkOffset)
if (Ck.isNaN) {
null
} else {
Ck / count
}
} else {
null
}
}
}
......@@ -308,6 +308,46 @@ object functions extends LegacyFunctions {
def countDistinct(columnName: String, columnNames: String*): Column =
countDistinct(Column(columnName), columnNames.map(Column.apply) : _*)
/**
* Aggregate function: returns the population covariance for two columns.
*
* @group agg_funcs
* @since 2.0.0
*/
def covar_pop(column1: Column, column2: Column): Column = withAggregateFunction {
CovPopulation(column1.expr, column2.expr)
}
/**
* Aggregate function: returns the population covariance for two columns.
*
* @group agg_funcs
* @since 2.0.0
*/
def covar_pop(columnName1: String, columnName2: String): Column = {
covar_pop(Column(columnName1), Column(columnName2))
}
/**
* Aggregate function: returns the sample covariance for two columns.
*
* @group agg_funcs
* @since 2.0.0
*/
def covar_samp(column1: Column, column2: Column): Column = withAggregateFunction {
CovSample(column1.expr, column2.expr)
}
/**
* Aggregate function: returns the sample covariance for two columns.
*
* @group agg_funcs
* @since 2.0.0
*/
def covar_samp(columnName1: String, columnName2: String): Column = {
covar_samp(Column(columnName1), Column(columnName2))
}
/**
* Aggregate function: returns the first value in a group.
*
......
......@@ -807,6 +807,38 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te
assert(math.abs(corr7 - 0.6633880657639323) < 1e-12)
}
test("covariance: covar_pop and covar_samp") {
// non-trivial example. To reproduce in python, use:
// >>> import numpy as np
// >>> a = np.array(range(20))
// >>> b = np.array([x * x - 2 * x + 3.5 for x in range(20)])
// >>> np.cov(a, b, bias = 0)[0][1]
// 595.0
// >>> np.cov(a, b, bias = 1)[0][1]
// 565.25
val df = Seq.tabulate(20)(x => (1.0 * x, x * x - 2 * x + 3.5)).toDF("a", "b")
val cov_samp = df.groupBy().agg(covar_samp("a", "b")).collect()(0).getDouble(0)
assert(math.abs(cov_samp - 595.0) < 1e-12)
val cov_pop = df.groupBy().agg(covar_pop("a", "b")).collect()(0).getDouble(0)
assert(math.abs(cov_pop - 565.25) < 1e-12)
val df2 = Seq.tabulate(20)(x => (1 * x, x * x * x - 2)).toDF("a", "b")
val cov_samp2 = df2.groupBy().agg(covar_samp("a", "b")).collect()(0).getDouble(0)
assert(math.abs(cov_samp2 - 11564.0) < 1e-12)
val cov_pop2 = df2.groupBy().agg(covar_pop("a", "b")).collect()(0).getDouble(0)
assert(math.abs(cov_pop2 - 10985.799999999999) < 1e-12)
// one row test
val df3 = Seq.tabulate(1)(x => (1 * x, x * x * x - 2)).toDF("a", "b")
val cov_samp3 = df3.groupBy().agg(covar_samp("a", "b")).collect()(0).get(0)
assert(cov_samp3 == null)
val cov_pop3 = df3.groupBy().agg(covar_pop("a", "b")).collect()(0).getDouble(0)
assert(cov_pop3 == 0.0)
}
test("no aggregation function (SPARK-11486)") {
val df = sqlContext.range(20).selectExpr("id", "repeat(id, 1) as s")
.groupBy("s").count()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment