From a4669443999dc13a1bb34509c827d8b9096ea84f Mon Sep 17 00:00:00 2001
From: qhuang <qian.huang@intel.com>
Date: Tue, 5 May 2015 20:39:56 -0700
Subject: [PATCH] [SPARK-6841] [SPARKR] add support for mean, median, stdev
 etc.

Moving here from https://github.com/amplab-extras/SparkR-pkg/pull/241.
sum() has already been implemented (https://github.com/amplab-extras/SparkR-pkg/pull/242).

Phase 1 (mean, sd, var) has now been implemented, but some things still need to be improved based on the suggestions in https://issues.apache.org/jira/browse/SPARK-6841.
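
A minimal usage sketch of the new describe() API (the JSON path and "age" column are hypothetical; assumes the usual SparkR session setup shown in the roxygen example below):

    sc <- sparkR.init()
    sqlCtx <- sparkRSQL.init(sc)
    df <- jsonFile(sqlCtx, "path/to/file.json")   # hypothetical input file
    stats <- describe(df, "age")                  # "age" is a hypothetical numeric column
    collect(stats)   # one row per statistic: count, mean, stddev, min, max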

Author: qhuang <qian.huang@intel.com>

Closes #5446 from hqzizania/R and squashes the following commits:

f283572 [qhuang] add test unit for describe()
2e74d5a [qhuang] add describe() DataFrame API
---
 R/pkg/NAMESPACE                  |  1 +
 R/pkg/R/DataFrame.R              | 37 ++++++++++++++++++++++++++++++++
 R/pkg/R/generics.R               |  4 ++++
 R/pkg/inst/tests/test_sparkSQL.R | 11 ++++++++++
 4 files changed, 53 insertions(+)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 1fb3311b7f..528e6608c3 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -13,6 +13,7 @@ exportMethods("cache",
               "collect",
               "columns",
               "count",
+              "describe",
               "distinct",
               "dtypes",
               "except",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 841e77e55e..56c305d912 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
             callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
           })
 
+#' describe
+#'
+#' Computes statistics for numeric columns.
+#' If no columns are given, this function computes statistics for all numeric columns.
+#'
+#' @param x A DataFrame whose numeric columns are to be summarized.
+#' @param col A string giving the name of a column.
+#' @param ... Additional column names.
+#' @return A DataFrame containing the computed statistics.
+#' @rdname describe
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' describe(df)
+#' describe(df, "col1")
+#' describe(df, "col1", "col2")
+#' }
+setMethod("describe",
+          signature(x = "DataFrame", col = "character"),
+          function(x, col, ...) {
+            colList <- list(col, ...)
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
+
+#' @rdname describe
+setMethod("describe",
+          signature(x = "DataFrame"),
+          function(x) {
+            colList <- as.list(c(columns(x)))
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index e88729387e..5838955f74 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
 #' @export
 setGeneric("columns", function(x) {standardGeneric("columns") })
 
+#' @rdname describe
+#' @export
+setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
+
 #' @rdname schema
 #' @export
 setGeneric("dtypes", function(x) { standardGeneric("dtypes") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index f82e56fdd8..7a42e289fc 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
   expect_true(count(parquetDF) == count(df)*2)
 })
 
+test_that("describe() on a DataFrame", {
+  df <- jsonFile(sqlCtx, jsonPath)
+  stats <- describe(df, "age")
+  expect_true(collect(stats)[1, "summary"] == "count")
+  expect_true(collect(stats)[2, "age"] == 24.5)
+  expect_true(collect(stats)[3, "age"] == 5.5)
+  stats <- describe(df)
+  expect_true(collect(stats)[4, "name"] == "Andy")
+  expect_true(collect(stats)[5, "age"] == 30.0)
+})
+
 unlink(parquetPath)
 unlink(jsonPath)
-- 
GitLab