diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 6a3d63f43f78546c65c883122ee1c26f0f165d37..636d39e1e9caede0db475b63d3d153663acd700e 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -111,6 +111,7 @@ exportMethods("%in%", "add_months", "alias", "approxCountDistinct", + "approxQuantile", "array_contains", "asc", "ascii", diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index ab61bce03df231b8fb963c3b86527257f7c74742..3db72b57954d76bc57dc644f3f75a095da79bc83 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -67,6 +67,13 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) # @export setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") }) +# @rdname statfunctions +# @export +setGeneric("approxQuantile", + function(x, col, probabilities, relativeError) { + standardGeneric("approxQuantile") + }) + # @rdname distinct # @export setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") }) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 2e8076843f08acd4014dcd0bdfff454c5019d174..edf72937c633a3799a00da0534116dd224040937 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -130,6 +130,45 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"), collect(dataFrame(sct)) }) +#' approxQuantile +#' +#' Calculates the approximate quantiles of a numerical column of a DataFrame. +#' +#' The result of this algorithm has the following deterministic bound: +#' If the DataFrame has N elements and if we request the quantile at probability `p` up to error +#' `err`, then the algorithm will return a sample `x` from the DataFrame so that the *exact* rank +#' of `x` is close to (p * N). More precisely, +#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). +#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed +#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 +#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. +#' +#' @param x A SparkSQL DataFrame. +#' @param col The name of the numerical column. +#' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1]. +#' For example 0 is the minimum, 0.5 is the median, 1 is the maximum. +#' @param relativeError The relative target precision to achieve (>= 0). If set to zero, +#' the exact quantiles are computed, which could be very expensive. +#' Note that values greater than 1 are accepted but give the same result as 1. +#' @return The approximate quantiles at the given probabilities. +#' +#' @rdname statfunctions +#' @name approxQuantile +#' @export +#' @examples +#' \dontrun{ +#' df <- jsonFile(sqlContext, "/path/to/file.json") +#' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0) +#' } +setMethod("approxQuantile", + signature(x = "DataFrame", col = "character", + probabilities = "numeric", relativeError = "numeric"), + function(x, col, probabilities, relativeError) { + statFunctions <- callJMethod(x@sdf, "stat") + callJMethod(statFunctions, "approxQuantile", col, + as.list(probabilities), relativeError) + }) + #' sampleBy #' #' Returns a stratified sample without replacement based on the fraction given on each stratum. diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index cc118108f61cc59dfec113290995559e953d316c..236bae6bded25d60fe933b791576ebbb432ba8e3 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1785,6 +1785,14 @@ test_that("sampleBy() on a DataFrame", { expect_identical(as.list(result[2, ]), list(key = "1", count = 7)) }) +test_that("approxQuantile() on a DataFrame", { + l <- lapply(c(0:99), function(i) { i }) + df <- createDataFrame(sqlContext, l, "key") + quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0) + expect_equal(quantiles[[1]], 50) + expect_equal(quantiles[[2]], 80) +}) + test_that("SQL error message is returned from JVM", { retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e) expect_equal(grepl("Table not found: blah", retError), TRUE)