[SPARK-13504] [SPARKR] Add approxQuantile for SparkR

## What changes were proposed in this pull request? Add ```approxQuantile``` for SparkR. ## How was this patch tested? unit tests Author: Yanbo Liang <ybliang8@gmail.com> Closes #11383 from yanboliang/spark-13504 and squashes the following commits: 4f17adb [Yanbo Liang] Add approxQuantile for SparkR

[SPARK-13504] [SPARKR] Add approxQuantile for SparkR
50e60e36 · Yanbo Liang · Xiangrui Meng · f3be369e · 50e60e36 · 50e60e36
Commit 50e60e36 authored 9 years ago by Yanbo Liang Committed by Xiangrui Meng 9 years ago
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -111,6 +111,7 @@ exportMethods("%in%",
              "add_months",
              "alias",
              "approxCountDistinct",
+              "approxQuantile",
              "array_contains",
              "asc",
              "ascii",

--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -67,6 +67,13 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
 # @export
 setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })

+# @rdname statfunctions
+# @export
+setGeneric("approxQuantile",
+           function(x, col, probabilities, relativeError) {
+             standardGeneric("approxQuantile")
+           })
+
 # @rdname distinct
 # @export
 setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })

--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -130,6 +130,45 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
            collect(dataFrame(sct))
          })

+#' approxQuantile
+#'
+#' Calculates the approximate quantiles of a numerical column of a DataFrame.
+#'
+#' The result of this algorithm has the following deterministic bound:
+#' If the DataFrame has N elements and if we request the quantile at probability `p` up to error
+#' `err`, then the algorithm will return a sample `x` from the DataFrame so that the *exact* rank
+#' of `x` is close to (p * N). More precisely,
+#'   floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
+#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670
+#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param col The name of the numerical column.
+#' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1].
+#'                      For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
+#' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
+#'                      the exact quantiles are computed, which could be very expensive.
+#'                      Note that values greater than 1 are accepted but give the same result as 1.
+#' @return The approximate quantiles at the given probabilities.
+#'
+#' @rdname statfunctions
+#' @name approxQuantile
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlContext, "/path/to/file.json")
+#' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
+#' }
+setMethod("approxQuantile",
+          signature(x = "DataFrame", col = "character",
+                    probabilities = "numeric", relativeError = "numeric"),
+          function(x, col, probabilities, relativeError) {
+            statFunctions <- callJMethod(x@sdf, "stat")
+            callJMethod(statFunctions, "approxQuantile", col,
+                        as.list(probabilities), relativeError)
+          })
+
 #' sampleBy
 #'
 #' Returns a stratified sample without replacement based on the fraction given on each stratum.

--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1785,6 +1785,14 @@ test_that("sampleBy() on a DataFrame", {
  expect_identical(as.list(result[2, ]), list(key = "1", count = 7))
 })

+test_that("approxQuantile() on a DataFrame", {
+  l <- lapply(c(0:99), function(i) { i })
+  df <- createDataFrame(sqlContext, l, "key")
+  quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
+  expect_equal(quantiles[[1]], 50)
+  expect_equal(quantiles[[2]], 80)
+})
+
 test_that("SQL error message is returned from JVM", {
  retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
  expect_equal(grepl("Table not found: blah", retError), TRUE)