Skip to content
Snippets Groups Projects
Commit e574c997 authored by Josh Howes, committed by Reynold Xin
Browse files

[SPARK-15973][PYSPARK] Fix GroupedData Documentation

*This contribution is my original work, and I license the work to the project under the project's open source license.*

## What changes were proposed in this pull request?

Documentation updates to PySpark's GroupedData

## How was this patch tested?

Manual Tests

Author: Josh Howes <josh.howes@gmail.com>
Author: Josh Howes <josh.howes@maxpoint.com>

Closes #13724 from josh-howes/bugfix/SPARK-15973.
parent 35a2f3c0
No related branches found
No related tags found
No related merge requests found
...@@ -27,7 +27,7 @@ __all__ = ["GroupedData"] ...@@ -27,7 +27,7 @@ __all__ = ["GroupedData"]
def dfapi(f): def dfapi(f):
def _api(self): def _api(self):
name = f.__name__ name = f.__name__
jdf = getattr(self._jdf, name)() jdf = getattr(self._jgd, name)()
return DataFrame(jdf, self.sql_ctx) return DataFrame(jdf, self.sql_ctx)
_api.__name__ = f.__name__ _api.__name__ = f.__name__
_api.__doc__ = f.__doc__ _api.__doc__ = f.__doc__
...@@ -35,9 +35,9 @@ def dfapi(f): ...@@ -35,9 +35,9 @@ def dfapi(f):
def df_varargs_api(f): def df_varargs_api(f):
def _api(self, *args): def _api(self, *cols):
name = f.__name__ name = f.__name__
jdf = getattr(self._jdf, name)(_to_seq(self.sql_ctx._sc, args)) jdf = getattr(self._jgd, name)(_to_seq(self.sql_ctx._sc, cols))
return DataFrame(jdf, self.sql_ctx) return DataFrame(jdf, self.sql_ctx)
_api.__name__ = f.__name__ _api.__name__ = f.__name__
_api.__doc__ = f.__doc__ _api.__doc__ = f.__doc__
...@@ -54,8 +54,8 @@ class GroupedData(object): ...@@ -54,8 +54,8 @@ class GroupedData(object):
.. versionadded:: 1.3 .. versionadded:: 1.3
""" """
def __init__(self, jdf, sql_ctx): def __init__(self, jgd, sql_ctx):
self._jdf = jdf self._jgd = jgd
self.sql_ctx = sql_ctx self.sql_ctx = sql_ctx
@ignore_unicode_prefix @ignore_unicode_prefix
...@@ -83,11 +83,11 @@ class GroupedData(object): ...@@ -83,11 +83,11 @@ class GroupedData(object):
""" """
assert exprs, "exprs should not be empty" assert exprs, "exprs should not be empty"
if len(exprs) == 1 and isinstance(exprs[0], dict): if len(exprs) == 1 and isinstance(exprs[0], dict):
jdf = self._jdf.agg(exprs[0]) jdf = self._jgd.agg(exprs[0])
else: else:
# Columns # Columns
assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column" assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column"
jdf = self._jdf.agg(exprs[0]._jc, jdf = self._jgd.agg(exprs[0]._jc,
_to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]])) _to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))
return DataFrame(jdf, self.sql_ctx) return DataFrame(jdf, self.sql_ctx)
...@@ -178,18 +178,18 @@ class GroupedData(object): ...@@ -178,18 +178,18 @@ class GroupedData(object):
:param pivot_col: Name of the column to pivot. :param pivot_col: Name of the column to pivot.
:param values: List of values that will be translated to columns in the output DataFrame. :param values: List of values that will be translated to columns in the output DataFrame.
// Compute the sum of earnings for each year by course with each course as a separate column # Compute the sum of earnings for each year by course with each course as a separate column
>>> df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect() >>> df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect()
[Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)] [Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)]
// Or without specifying column values (less efficient) # Or without specifying column values (less efficient)
>>> df4.groupBy("year").pivot("course").sum("earnings").collect() >>> df4.groupBy("year").pivot("course").sum("earnings").collect()
[Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)] [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)]
""" """
if values is None: if values is None:
jgd = self._jdf.pivot(pivot_col) jgd = self._jgd.pivot(pivot_col)
else: else:
jgd = self._jdf.pivot(pivot_col, values) jgd = self._jgd.pivot(pivot_col, values)
return GroupedData(jgd, self.sql_ctx) return GroupedData(jgd, self.sql_ctx)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment