Skip to content
Snippets Groups Projects
Commit 1d04dc95 authored by Davies Liu's avatar Davies Liu Committed by Reynold Xin
Browse files

[SPARK-11467][SQL] add Python API for stddev/variance

Add Python API for stddev/stddev_pop/stddev_samp/variance/var_pop/var_samp/skewness/kurtosis

Author: Davies Liu <davies@databricks.com>

Closes #9424 from davies/py_var.
parent a9676cc7
No related branches found
No related tags found
No related merge requests found
......@@ -122,6 +122,21 @@ _functions_1_4 = {
'bitwiseNOT': 'Computes bitwise not.',
}
# Aggregate functions introduced in 1.6: function name -> docstring text.
_functions_1_6 = {
    "stddev": (
        "Aggregate function: returns the unbiased sample standard deviation of"
        " the expression in a group."
    ),
    "stddev_samp": (
        "Aggregate function: returns the unbiased sample standard deviation of"
        " the expression in a group."
    ),
    "stddev_pop": (
        "Aggregate function: returns population standard deviation of"
        " the expression in a group."
    ),
    "variance": (
        "Aggregate function: returns the population variance of the values in a group."
    ),
    "var_samp": (
        "Aggregate function: returns the unbiased variance of the values in a group."
    ),
    "var_pop": (
        "Aggregate function: returns the population variance of the values in a group."
    ),
    "skewness": (
        "Aggregate function: returns the skewness of the values in a group."
    ),
    "kurtosis": (
        "Aggregate function: returns the kurtosis of the values in a group."
    ),
}
# math functions that take two arguments as input
_binary_mathfunctions = {
'atan2': 'Returns the angle theta from the conversion of rectangular coordinates (x, y) to' +
......@@ -172,6 +187,8 @@ for _name, _doc in _binary_mathfunctions.items():
globals()[_name] = since(1.4)(_create_binary_mathfunction(_name, _doc))
# Bind every window function under its own name in globals(), wrapped with since(1.4).
for _name, _doc in _window_functions.items():
globals()[_name] = since(1.4)(_create_window_function(_name, _doc))
# Bind every entry of _functions_1_6 the same way, built by _create_function
# and wrapped with since(1.6).
for _name, _doc in _functions_1_6.items():
globals()[_name] = since(1.6)(_create_function(_name, _doc))
# Drop the loop variables once the registration loops are done.
del _name, _doc
......
......@@ -167,6 +167,94 @@ class GroupedData(object):
[Row(sum(age)=7, sum(height)=165)]
"""
@df_varargs_api
@since(1.6)
def stddev(self, *cols):
"""Compute the sample standard deviation of each numeric column for each group.

:param cols: list of column names (string). Non-numeric columns are ignored.

>>> df3.groupBy().stddev('age', 'height').collect()
[Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)]
"""
@df_varargs_api
@since(1.6)
def stddev_samp(self, *cols):
"""Compute the sample standard deviation of each numeric column for each group.

:param cols: list of column names (string). Non-numeric columns are ignored.

>>> df3.groupBy().stddev_samp('age', 'height').collect()
[Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)]
"""
@df_varargs_api
@since(1.6)
def stddev_pop(self, *cols):
"""Compute the population standard deviation of each numeric column for each group.

:param cols: list of column names (string). Non-numeric columns are ignored.

>>> df3.groupBy().stddev_pop('age', 'height').collect()
[Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)]
"""
@df_varargs_api
@since(1.6)
def variance(self, *cols):
"""Compute the population variance of each numeric column for each group.

:param cols: list of column names (string). Non-numeric columns are ignored.

>>> df3.groupBy().variance('age', 'height').collect()
[Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)]
"""
@df_varargs_api
@since(1.6)
def var_pop(self, *cols):
"""Compute the population variance of each numeric column for each group.

:param cols: list of column names (string). Non-numeric columns are ignored.

>>> df3.groupBy().var_pop('age', 'height').collect()
[Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)]
"""
@df_varargs_api
@since(1.6)
def var_samp(self, *cols):
"""Compute the sample variance of each numeric column for each group.

:param cols: list of column names (string). Non-numeric columns are ignored.

>>> df3.groupBy().var_samp('age', 'height').collect()
[Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)]
"""
@df_varargs_api
@since(1.6)
def skewness(self, *cols):
"""Compute the skewness of each numeric column for each group.

:param cols: list of column names (string). Non-numeric columns are ignored.

>>> df3.groupBy().skewness('age', 'height').collect()
[Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)]
"""
@df_varargs_api
@since(1.6)
def kurtosis(self, *cols):
"""Compute the kurtosis of each numeric column for each group.

:param cols: list of column names (string). Non-numeric columns are ignored.

>>> df3.groupBy().kurtosis('age', 'height').collect()
[Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)]
"""
def _test():
import doctest
......
......@@ -254,14 +254,6 @@ object functions {
*/
def kurtosis(e: Column): Column = Kurtosis(e.expr)
/**
* Aggregate function: returns the kurtosis of the values in a group.
*
* Convenience overload: wraps `columnName` in a `Column` and delegates to the
* `Column`-based `kurtosis`.
*
* @param columnName name of the column to aggregate
* @group agg_funcs
* @since 1.6.0
*/
def kurtosis(columnName: String): Column = kurtosis(Column(columnName))
/**
* Aggregate function: returns the last value in a group.
*
......@@ -336,14 +328,6 @@ object functions {
*/
def skewness(e: Column): Column = Skewness(e.expr)
/**
* Aggregate function: returns the skewness of the values in a group.
*
* Convenience overload: wraps `columnName` in a `Column` and delegates to the
* `Column`-based `skewness`.
*
* @param columnName name of the column to aggregate
* @group agg_funcs
* @since 1.6.0
*/
def skewness(columnName: String): Column = skewness(Column(columnName))
/**
* Aggregate function: returns the unbiased sample standard deviation of
* the expression in a group.
......@@ -353,15 +337,6 @@ object functions {
*/
def stddev(e: Column): Column = Stddev(e.expr)
/**
* Aggregate function: returns the unbiased sample standard deviation of
* the expression in a group.
*
* Convenience overload: wraps `columnName` in a `Column` and delegates to the
* `Column`-based `stddev`.
*
* @param columnName name of the column to aggregate
* @group agg_funcs
* @since 1.6.0
*/
def stddev(columnName: String): Column = stddev(Column(columnName))
/**
* Aggregate function: returns the unbiased sample standard deviation of
* the expression in a group.
......@@ -371,15 +346,6 @@ object functions {
*/
def stddev_samp(e: Column): Column = StddevSamp(e.expr)
/**
* Aggregate function: returns the unbiased sample standard deviation of
* the expression in a group.
*
* Convenience overload: wraps `columnName` in a `Column` and delegates to the
* `Column`-based `stddev_samp`.
*
* @param columnName name of the column to aggregate
* @group agg_funcs
* @since 1.6.0
*/
def stddev_samp(columnName: String): Column = stddev_samp(Column(columnName))
/**
* Aggregate function: returns the population standard deviation of
* the expression in a group.
......@@ -389,15 +355,6 @@ object functions {
*/
def stddev_pop(e: Column): Column = StddevPop(e.expr)
/**
* Aggregate function: returns the population standard deviation of
* the expression in a group.
*
* Convenience overload: wraps `columnName` in a `Column` and delegates to the
* `Column`-based `stddev_pop`.
*
* @param columnName name of the column to aggregate
* @group agg_funcs
* @since 1.6.0
*/
def stddev_pop(columnName: String): Column = stddev_pop(Column(columnName))
/**
* Aggregate function: returns the sum of all values in the expression.
*
......@@ -438,14 +395,6 @@ object functions {
*/
def variance(e: Column): Column = Variance(e.expr)
/**
* Aggregate function: returns the population variance of the values in a group.
*
* Convenience overload: wraps `columnName` in a `Column` and delegates to the
* `Column`-based `variance`.
*
* @param columnName name of the column to aggregate
* @group agg_funcs
* @since 1.6.0
*/
def variance(columnName: String): Column = variance(Column(columnName))
/**
* Aggregate function: returns the unbiased variance of the values in a group.
*
......@@ -454,14 +403,6 @@ object functions {
*/
def var_samp(e: Column): Column = VarianceSamp(e.expr)
/**
* Aggregate function: returns the unbiased variance of the values in a group.
*
* Convenience overload: wraps `columnName` in a `Column` and delegates to the
* `Column`-based `var_samp`.
*
* @param columnName name of the column to aggregate
* @group agg_funcs
* @since 1.6.0
*/
def var_samp(columnName: String): Column = var_samp(Column(columnName))
/**
* Aggregate function: returns the population variance of the values in a group.
*
......@@ -470,14 +411,6 @@ object functions {
*/
def var_pop(e: Column): Column = VariancePop(e.expr)
/**
* Aggregate function: returns the population variance of the values in a group.
*
* Convenience overload: wraps `columnName` in a `Column` and delegates to the
* `Column`-based `var_pop`.
*
* @param columnName name of the column to aggregate
* @group agg_funcs
* @since 1.6.0
*/
def var_pop(columnName: String): Column = var_pop(Column(columnName))
//////////////////////////////////////////////////////////////////////////////////////////////
// Window functions
//////////////////////////////////////////////////////////////////////////////////////////////
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment