diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index c529d83060f5094ca82b0c98db31b9c60f174c82..f28d26a51baa04c7a82a76aa09c225a4e0d5f56a 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -336,7 +336,8 @@ setMethod("asin",
           })
 
 #' @details
-#' \code{atan}: Computes the tangent inverse of the given value.
+#' \code{atan}: Computes the tangent inverse of the given value; the returned angle is in the range
+#' -pi/2 through pi/2.
 #'
 #' @rdname column_math_functions
 #' @export
@@ -599,7 +600,7 @@ setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOr
           })
 
 #' @details
-#' \code{cos}: Computes the cosine of the given value.
+#' \code{cos}: Computes the cosine of the given value. The input is in radians.
 #'
 #' @rdname column_math_functions
 #' @aliases cos cos,Column-method
@@ -1407,7 +1408,7 @@ setMethod("sign", signature(x = "Column"),
           })
 
 #' @details
-#' \code{sin}: Computes the sine of the given value.
+#' \code{sin}: Computes the sine of the given value. The input is in radians.
 #'
 #' @rdname column_math_functions
 #' @aliases sin sin,Column-method
@@ -1597,7 +1598,7 @@ setMethod("sumDistinct",
           })
 
 #' @details
-#' \code{tan}: Computes the tangent of the given value.
+#' \code{tan}: Computes the tangent of the given value. The input is in radians.
 #'
 #' @rdname column_math_functions
 #' @aliases tan tan,Column-method
@@ -1896,7 +1897,7 @@ setMethod("year",
 
 #' @details
 #' \code{atan2}: Returns the angle theta from the conversion of rectangular coordinates
-#' (x, y) to polar coordinates (r, theta).
+#' (x, y) to polar coordinates (r, theta). The result is in radians.
 #'
 #' @rdname column_math_functions
 #' @aliases atan2 atan2,Column-method
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 3416c4b118a077e0a086f50f69b8207c628e902b..5d8ded83f667d055273b601b1e19059b0d522f8d 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -67,9 +67,14 @@ def _create_window_function(name, doc=''):
     _.__doc__ = 'Window function: ' + doc
     return _
 
 
+_lit_doc = """
+    Creates a :class:`Column` of literal value.
+    >>> df.select(lit(5).alias('height')).withColumn('spark_user', lit(True)).take(1)
+    [Row(height=5, spark_user=True)]
+    """
 _functions = {
-    'lit': 'Creates a :class:`Column` of literal value.',
+    'lit': _lit_doc,
     'col': 'Returns a :class:`Column` based on the given column name.',
     'column': 'Returns a :class:`Column` based on the given column name.',
     'asc': 'Returns a sort expression based on the ascending order of the given column name.',
@@ -95,10 +100,13 @@ _functions_1_4 = {
     '0.0 through pi.',
     'asin': 'Computes the sine inverse of the given value; the returned angle is in the range' +
             '-pi/2 through pi/2.',
-    'atan': 'Computes the tangent inverse of the given value.',
+    'atan': 'Computes the tangent inverse of the given value; the returned angle is in the range' +
+            ' -pi/2 through pi/2.',
     'cbrt': 'Computes the cube-root of the given value.',
     'ceil': 'Computes the ceiling of the given value.',
-    'cos': 'Computes the cosine of the given value.',
+    'cos': """Computes the cosine of the given value.
+
+        :param col: :class:`DoubleType` column, units in radians.""",
     'cosh': 'Computes the hyperbolic cosine of the given value.',
     'exp': 'Computes the exponential of the given value.',
     'expm1': 'Computes the exponential of the given value minus one.',
@@ -109,15 +117,33 @@ _functions_1_4 = {
     'rint': 'Returns the double value that is closest in value to the argument and' +
             ' is equal to a mathematical integer.',
     'signum': 'Computes the signum of the given value.',
-    'sin': 'Computes the sine of the given value.',
+    'sin': """Computes the sine of the given value.
+
+        :param col: :class:`DoubleType` column, units in radians.""",
     'sinh': 'Computes the hyperbolic sine of the given value.',
-    'tan': 'Computes the tangent of the given value.',
+    'tan': """Computes the tangent of the given value.
+
+        :param col: :class:`DoubleType` column, units in radians.""",
     'tanh': 'Computes the hyperbolic tangent of the given value.',
-    'toDegrees': '.. note:: Deprecated in 2.1, use degrees instead.',
-    'toRadians': '.. note:: Deprecated in 2.1, use radians instead.',
+    'toDegrees': '.. note:: Deprecated in 2.1, use :func:`degrees` instead.',
+    'toRadians': '.. note:: Deprecated in 2.1, use :func:`radians` instead.',
     'bitwiseNOT': 'Computes bitwise not.',
 }
+_collect_list_doc = """
+    Aggregate function: returns a list of objects with duplicates.
+
+    >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
+    >>> df2.agg(collect_list('age')).collect()
+    [Row(collect_list(age)=[2, 5, 5])]
+    """
+_collect_set_doc = """
+    Aggregate function: returns a set of objects with duplicate elements eliminated.
+
+    >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
+    >>> df2.agg(collect_set('age')).collect()
+    [Row(collect_set(age)=[5, 2])]
+    """
 
 _functions_1_6 = {
     # unary math functions
     'stddev': 'Aggregate function: returns the unbiased sample standard deviation of' +
@@ -131,9 +157,8 @@ _functions_1_6 = {
     'var_pop': 'Aggregate function: returns the population variance of the values in a group.',
     'skewness': 'Aggregate function: returns the skewness of the values in a group.',
     'kurtosis': 'Aggregate function: returns the kurtosis of the values in a group.',
-    'collect_list': 'Aggregate function: returns a list of objects with duplicates.',
-    'collect_set': 'Aggregate function: returns a set of objects with duplicate elements' +
-                   ' eliminated.',
+    'collect_list': _collect_list_doc,
+    'collect_set': _collect_set_doc
 }
 
 _functions_2_1 = {
@@ -147,7 +172,7 @@ _functions_2_1 = {
 # math functions that take two arguments as input
 _binary_mathfunctions = {
     'atan2': 'Returns the angle theta from the conversion of rectangular coordinates (x, y) to' +
-             'polar coordinates (r, theta).',
+             ' polar coordinates (r, theta). The result is in radians.',
     'hypot': 'Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.',
     'pow': 'Returns the value of the first argument raised to the power of the second argument.',
 }
@@ -200,17 +225,20 @@ del _name, _doc
 @since(1.3)
 def approxCountDistinct(col, rsd=None):
     """
-    .. note:: Deprecated in 2.1, use approx_count_distinct instead.
+    .. note:: Deprecated in 2.1, use :func:`approx_count_distinct` instead.
     """
     return approx_count_distinct(col, rsd)
 
 
 @since(2.1)
 def approx_count_distinct(col, rsd=None):
-    """Returns a new :class:`Column` for approximate distinct count of ``col``.
+    """Aggregate function: returns a new :class:`Column` for approximate distinct count of ``col``.
 
-    >>> df.agg(approx_count_distinct(df.age).alias('c')).collect()
-    [Row(c=2)]
+    :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
+        efficient to use :func:`countDistinct`
+
+    >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
+    [Row(distinct_ages=2)]
     """
     sc = SparkContext._active_spark_context
     if rsd is None:
@@ -267,8 +295,7 @@ def coalesce(*cols):
 
 @since(1.6)
 def corr(col1, col2):
-    """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
-    and ``col2``.
+    """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``.
 
     >>> a = range(20)
     >>> b = [2 * x for x in range(20)]
@@ -282,8 +309,7 @@ def corr(col1, col2):
 
 @since(2.0)
 def covar_pop(col1, col2):
-    """Returns a new :class:`Column` for the population covariance of ``col1``
-    and ``col2``.
+    """Returns a new :class:`Column` for the population covariance of ``col1`` and ``col2``.
 
     >>> a = [1] * 10
     >>> b = [1] * 10
@@ -297,8 +323,7 @@ def covar_pop(col1, col2):
 
 @since(2.0)
 def covar_samp(col1, col2):
-    """Returns a new :class:`Column` for the sample covariance of ``col1``
-    and ``col2``.
+    """Returns a new :class:`Column` for the sample covariance of ``col1`` and ``col2``.
 
     >>> a = [1] * 10
     >>> b = [1] * 10
@@ -450,7 +475,7 @@ def monotonically_increasing_id():
 def nanvl(col1, col2):
     """Returns col1 if it is not NaN, or col2 if col1 is NaN.
 
-    Both inputs should be floating point columns (DoubleType or FloatType).
+    Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`).
 
     >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
     >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect()
@@ -460,10 +485,15 @@ def nanvl(col1, col2):
     return Column(sc._jvm.functions.nanvl(_to_java_column(col1), _to_java_column(col2)))
 
 
+@ignore_unicode_prefix
 @since(1.4)
 def rand(seed=None):
     """Generates a random column with independent and identically distributed (i.i.d.) samples
     from U[0.0, 1.0].
+
+    >>> df.withColumn('rand', rand(seed=42) * 3).collect()
+    [Row(age=2, name=u'Alice', rand=1.1568609015300986),
+     Row(age=5, name=u'Bob', rand=1.403379671529166)]
     """
     sc = SparkContext._active_spark_context
     if seed is not None:
@@ -473,10 +503,15 @@ def rand(seed=None):
     return Column(jc)
 
 
+@ignore_unicode_prefix
 @since(1.4)
 def randn(seed=None):
     """Generates a column with independent and identically distributed (i.i.d.) samples from
     the standard normal distribution.
+
+    >>> df.withColumn('randn', randn(seed=42)).collect()
+    [Row(age=2, name=u'Alice', randn=-0.7556247885860078),
+     Row(age=5, name=u'Bob', randn=-0.0861619008451133)]
     """
     sc = SparkContext._active_spark_context
     if seed is not None:
@@ -760,7 +795,7 @@ def ntile(n):
 @since(1.5)
 def current_date():
     """
-    Returns the current date as a date column.
+    Returns the current date as a :class:`DateType` column.
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.current_date())
@@ -768,7 +803,7 @@ def current_date():
 
 def current_timestamp():
     """
-    Returns the current timestamp as a timestamp column.
+    Returns the current timestamp as a :class:`TimestampType` column.
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.current_timestamp())
@@ -787,8 +822,8 @@ def date_format(date, format):
     .. note:: Use when ever possible specialized functions like `year`.
         These benefit from a specialized implementation.
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(date_format('a', 'MM/dd/yyy').alias('date')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()
     [Row(date=u'04/08/2015')]
     """
     sc = SparkContext._active_spark_context
@@ -800,8 +835,8 @@ def year(col):
     """
     Extract the year of a given date as integer.
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(year('a').alias('year')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(year('dt').alias('year')).collect()
     [Row(year=2015)]
     """
     sc = SparkContext._active_spark_context
@@ -813,8 +848,8 @@ def quarter(col):
     """
     Extract the quarter of a given date as integer.
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(quarter('a').alias('quarter')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(quarter('dt').alias('quarter')).collect()
     [Row(quarter=2)]
     """
     sc = SparkContext._active_spark_context
@@ -826,8 +861,8 @@ def month(col):
     """
     Extract the month of a given date as integer.
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(month('a').alias('month')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(month('dt').alias('month')).collect()
     [Row(month=4)]
     """
     sc = SparkContext._active_spark_context
@@ -839,8 +874,8 @@ def dayofmonth(col):
     """
     Extract the day of the month of a given date as integer.
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(dayofmonth('a').alias('day')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(dayofmonth('dt').alias('day')).collect()
     [Row(day=8)]
     """
     sc = SparkContext._active_spark_context
@@ -852,8 +887,8 @@ def dayofyear(col):
     """
     Extract the day of the year of a given date as integer.
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(dayofyear('a').alias('day')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(dayofyear('dt').alias('day')).collect()
     [Row(day=98)]
     """
     sc = SparkContext._active_spark_context
@@ -865,8 +900,8 @@ def hour(col):
     """
     Extract the hours of a given date as integer.
 
-    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['a'])
-    >>> df.select(hour('a').alias('hour')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts'])
+    >>> df.select(hour('ts').alias('hour')).collect()
     [Row(hour=13)]
     """
     sc = SparkContext._active_spark_context
@@ -878,8 +913,8 @@ def minute(col):
     """
     Extract the minutes of a given date as integer.
 
-    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['a'])
-    >>> df.select(minute('a').alias('minute')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts'])
+    >>> df.select(minute('ts').alias('minute')).collect()
     [Row(minute=8)]
     """
     sc = SparkContext._active_spark_context
@@ -891,8 +926,8 @@ def second(col):
     """
     Extract the seconds of a given date as integer.
 
-    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['a'])
-    >>> df.select(second('a').alias('second')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts'])
+    >>> df.select(second('ts').alias('second')).collect()
     [Row(second=15)]
     """
     sc = SparkContext._active_spark_context
@@ -904,8 +939,8 @@ def weekofyear(col):
     """
     Extract the week number of a given date as integer.
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(weekofyear(df.a).alias('week')).collect()
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(weekofyear(df.dt).alias('week')).collect()
     [Row(week=15)]
     """
     sc = SparkContext._active_spark_context
@@ -917,9 +952,9 @@ def date_add(start, days):
     """
     Returns the date that is `days` days after `start`
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['d'])
-    >>> df.select(date_add(df.d, 1).alias('d')).collect()
-    [Row(d=datetime.date(2015, 4, 9))]
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()
+    [Row(next_date=datetime.date(2015, 4, 9))]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.date_add(_to_java_column(start), days))
@@ -930,9 +965,9 @@ def date_sub(start, days):
     """
     Returns the date that is `days` days before `start`
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['d'])
-    >>> df.select(date_sub(df.d, 1).alias('d')).collect()
-    [Row(d=datetime.date(2015, 4, 7))]
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(date_sub(df.dt, 1).alias('prev_date')).collect()
+    [Row(prev_date=datetime.date(2015, 4, 7))]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.date_sub(_to_java_column(start), days))
@@ -956,9 +991,9 @@ def add_months(start, months):
     """
     Returns the date that is `months` months after `start`
 
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['d'])
-    >>> df.select(add_months(df.d, 1).alias('d')).collect()
-    [Row(d=datetime.date(2015, 5, 8))]
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(add_months(df.dt, 1).alias('next_month')).collect()
+    [Row(next_month=datetime.date(2015, 5, 8))]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.add_months(_to_java_column(start), months))
@@ -969,8 +1004,8 @@ def months_between(date1, date2):
     """
     Returns the number of months between date1 and date2.
 
-    >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['t', 'd'])
-    >>> df.select(months_between(df.t, df.d).alias('months')).collect()
+    >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2'])
+    >>> df.select(months_between(df.date1, df.date2).alias('months')).collect()
     [Row(months=3.9495967...)]
     """
     sc = SparkContext._active_spark_context
@@ -1073,12 +1108,17 @@ def last_day(date):
     return Column(sc._jvm.functions.last_day(_to_java_column(date)))
 
 
+@ignore_unicode_prefix
 @since(1.5)
 def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"):
     """
     Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string
     representing the timestamp of that moment in the current system time zone in the given
     format.
+
+    >>> time_df = spark.createDataFrame([(1428476400,)], ['unix_time'])
+    >>> time_df.select(from_unixtime('unix_time').alias('ts')).collect()
+    [Row(ts=u'2015-04-08 00:00:00')]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.from_unixtime(_to_java_column(timestamp), format))
@@ -1092,6 +1132,10 @@ def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'):
     locale, return null if fail. if `timestamp` is None, then it returns current
     timestamp.
+
+    >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect()
+    [Row(unix_time=1428476400)]
     """
     sc = SparkContext._active_spark_context
     if timestamp is None:
         return Column(sc._jvm.functions.unix_timestamp())
@@ -1106,8 +1150,8 @@ def from_utc_timestamp(timestamp, tz):
     that corresponds to the same time of day in the given timezone.
 
     >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
-    >>> df.select(from_utc_timestamp(df.t, "PST").alias('t')).collect()
-    [Row(t=datetime.datetime(1997, 2, 28, 2, 30))]
+    >>> df.select(from_utc_timestamp(df.t, "PST").alias('local_time')).collect()
+    [Row(local_time=datetime.datetime(1997, 2, 28, 2, 30))]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.from_utc_timestamp(_to_java_column(timestamp), tz))
@@ -1119,9 +1163,9 @@ def to_utc_timestamp(timestamp, tz):
     Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
     another timestamp that corresponds to the same time of day in UTC.
 
-    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
-    >>> df.select(to_utc_timestamp(df.t, "PST").alias('t')).collect()
-    [Row(t=datetime.datetime(1997, 2, 28, 18, 30))]
+    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['ts'])
+    >>> df.select(to_utc_timestamp(df.ts, "PST").alias('utc_time')).collect()
+    [Row(utc_time=datetime.datetime(1997, 2, 28, 18, 30))]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.to_utc_timestamp(_to_java_column(timestamp), tz))
@@ -2095,7 +2139,7 @@ def _test():
     sc = spark.sparkContext
     globs['sc'] = sc
     globs['spark'] = spark
-    globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF()
+    globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)])
     (failure_count, test_count) = doctest.testmod(
         pyspark.sql.functions, globs=globs,
         optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 1263071a3ffd54f25036beca5e021226c1ba717d..a5e4a444f33be2879c2bcf39075e1cd44b26c416 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1321,7 +1321,8 @@ object functions {
   def asin(columnName: String): Column = asin(Column(columnName))
 
   /**
-   * Computes the tangent inverse of the given value.
+   * Computes the tangent inverse of the given value; the returned angle is in the range
+   * -pi/2 through pi/2.
    *
    * @group math_funcs
    * @since 1.4.0
@@ -1329,7 +1330,8 @@ object functions {
   def atan(e: Column): Column = withExpr { Atan(e.expr) }
 
   /**
-   * Computes the tangent inverse of the given column.
+   * Computes the tangent inverse of the given column; the returned angle is in the range
+   * -pi/2 through pi/2.
    *
    * @group math_funcs
    * @since 1.4.0
@@ -1338,7 +1340,7 @@ object functions {
   def atan(columnName: String): Column = atan(Column(columnName))
 
   /**
    * Returns the angle theta from the conversion of rectangular coordinates (x, y) to
-   * polar coordinates (r, theta).
+   * polar coordinates (r, theta). The result is in radians.
    *
    * @group math_funcs
    * @since 1.4.0
@@ -1470,7 +1472,7 @@ object functions {
   }
 
   /**
-   * Computes the cosine of the given value.
+   * Computes the cosine of the given value. The input is in radians.
    *
    * @group math_funcs
    * @since 1.4.0
@@ -1937,7 +1939,7 @@ object functions {
   def signum(columnName: String): Column = signum(Column(columnName))
 
   /**
-   * Computes the sine of the given value.
+   * Computes the sine of the given value. The input is in radians.
    *
    * @group math_funcs
    * @since 1.4.0
@@ -1969,7 +1971,7 @@ object functions {
   def sinh(columnName: String): Column = sinh(Column(columnName))
 
   /**
-   * Computes the tangent of the given value.
+   * Computes the tangent of the given value. The input is in radians.
    *
    * @group math_funcs
    * @since 1.4.0
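
A quick way to sanity-check the radians semantics documented above is the short PySpark sketch
below. It is illustrative only and not part of the patch: it assumes a Spark 2.x build with a
local SparkSession bound to `spark`, and every column alias in it is hypothetical.

import math

from pyspark.sql import SparkSession
from pyspark.sql.functions import atan, atan2, cos, lit, sin

spark = SparkSession.builder.master("local[1]").appName("radians-check").getOrCreate()
df = spark.range(1)  # one-row DataFrame to evaluate the expressions against

row = df.select(
    cos(lit(math.pi)).alias("cos_pi"),           # input in radians: cos(pi) == -1.0
    sin(lit(math.pi / 2)).alias("sin_half_pi"),  # input in radians: sin(pi/2) == 1.0
    atan(lit(1.0)).alias("atan_one"),            # returned angle in (-pi/2, pi/2): ~ pi/4
    atan2(lit(1.0), lit(1.0)).alias("theta"),    # theta returned in radians: ~ pi/4
).first()

assert abs(row["cos_pi"] + 1.0) < 1e-9
assert abs(row["sin_half_pi"] - 1.0) < 1e-9
assert abs(row["atan_one"] - math.pi / 4) < 1e-9
assert abs(row["theta"] - math.pi / 4) < 1e-9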