Skip to content
Snippets Groups Projects
Commit 70f44ad2 authored by Rerngvit Yanggratoke's avatar Rerngvit Yanggratoke Committed by Shivaram Venkataraman
Browse files

[SPARK-10905] [SPARKR] Export freqItems() for DataFrameStatFunctions

[SPARK-10905][SparkR]: Export freqItems() for DataFrameStatFunctions
- Add function (together with roxygen2 doc) to DataFrame.R and generics.R
- Expose the function in NAMESPACE
- Add unit test for the function

Author: Rerngvit Yanggratoke <rerngvit@kth.se>

Closes #8962 from rerngvit/SPARK-10905.
parent 5994cfe8
No related branches found
No related tags found
No related merge requests found
......@@ -40,6 +40,7 @@ exportMethods("arrange",
"fillna",
"filter",
"first",
"freqItems",
"group_by",
"groupBy",
"head",
......
......@@ -63,6 +63,10 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
# @export
# Generic for crosstab(x, col1, col2); methods are dispatched through standardGeneric().
setGeneric("crosstab",
           function(x, col1, col2) {
             standardGeneric("crosstab")
           })
# @rdname statfunctions
# @export
# Generic for freqItems(x, cols, support); `support` defaults to 0.01.
setGeneric("freqItems",
           function(x, cols, support = 0.01) {
             standardGeneric("freqItems")
           })
# @rdname distinct
# @export
# Generic for distinct(x, numPartitions); `numPartitions` defaults to 1.
setGeneric("distinct",
           function(x, numPartitions = 1) {
             standardGeneric("distinct")
           })
......
......@@ -100,3 +100,30 @@ setMethod("corr",
statFunctions <- callJMethod(x@sdf, "stat")
callJMethod(statFunctions, "corr", col1, col2, method)
})
#' freqItems
#'
#' Finds the frequent items of the given columns; the result may contain false positives.
#' The computation uses the frequent element count algorithm described in
#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
#'
#' @param x A SparkSQL DataFrame.
#' @param cols A character vector of the names of the columns to search for frequent items.
#' @param support (Optional) The minimum frequency for an item to be considered `frequent`.
#'                Should be greater than 1e-4. Defaults to 0.01.
#' @return A local R data.frame holding the frequent items found in each column.
#'
#' @rdname statfunctions
#' @name freqItems
#' @export
#' @examples
#' \dontrun{
#' df <- jsonFile(sqlContext, "/path/to/file.json")
#' fi <- freqItems(df, c("title", "gender"))
#' }
setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
          function(x, cols, support = 0.01) {
            # The column names are handed to the JVM call as a list.
            colNames <- as.list(cols)
            statHandle <- callJMethod(x@sdf, "stat")
            freqItemsJdf <- callJMethod(statHandle, "freqItems", colNames, support)
            # Wrap the returned JVM object as a DataFrame and collect it locally.
            localResult <- collect(dataFrame(freqItemsJdf))
            localResult
          })
......@@ -1350,6 +1350,27 @@ test_that("cov() and corr() on a DataFrame", {
expect_true(abs(result - 1.0) < 1e-12)
})
# Checks that freqItems() reports values that were made frequent on purpose, both for a
# DataFrame built from a local data.frame and for one built from a list of rows.
test_that("freqItems() on a DataFrame", {
  input <- 1:1000
  rdf <- data.frame(numbers = input, letters = as.character(input),
                    negDoubles = input * -1.0, stringsAsFactors = FALSE)
  # Overwrite every third row with the same values so that those values become frequent.
  # Each column is assigned on its own: a row-wise `rdf[rows, ] <- c(1, "1", -1)` coerces the
  # vector to character and recycles it down the columns, which turns the numeric columns
  # into strings and mixes "1" and "-1" within every column.
  frequentRows <- input %% 3 == 0
  rdf$numbers[frequentRows] <- 1L
  rdf$letters[frequentRows] <- "1"
  rdf$negDoubles[frequentRows] <- -1.0
  df <- createDataFrame(sqlContext, rdf)

  multiColResults <- freqItems(df, c("numbers", "letters"), support = 0.1)
  expect_true(1 %in% multiColResults$numbers[[1]])
  expect_true("1" %in% multiColResults$letters[[1]])

  singleColResult <- freqItems(df, "negDoubles", support = 0.1)
  expect_true(-1 %in% head(singleColResult$negDoubles)[[1]])

  # Rows built from a list: even indices all share the pair (1L, -1.0).
  l <- lapply(0:99, function(i) {
    if (i %% 2 == 0) {
      list(1L, -1.0)
    } else {
      list(i, i * -1.0)
    }
  })
  df <- createDataFrame(sqlContext, l, c("a", "b"))
  result <- freqItems(df, c("a", "b"), 0.4)
  expect_identical(result[[1]], list(list(1L, 99L)))
  expect_identical(result[[2]], list(list(-1, -99)))
})
test_that("SQL error message is returned from JVM", {
retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
expect_equal(grepl("Table Not Found: blah", retError), TRUE)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment