From 702d85af2df9433254af6fa029683aa19c52a276 Mon Sep 17 00:00:00 2001
From: zero323 <>
Date: Tue, 18 Apr 2017 19:59:18 -0700
Subject: [PATCH] [SPARK-20208][R][DOCS] Document R fpGrowth support

## What changes were proposed in this pull request?

Document fpGrowth in:

- vignettes
- programming guide
- code example

## How was this patch tested?

Manual tests.

Author: zero323 <>

Closes #17557 from zero323/SPARK-20208.
 R/pkg/vignettes/sparkr-vignettes.Rmd | 37 +++++++++++++++++++-
 examples/src/main/r/ml/fpm.R         | 50 ++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 examples/src/main/r/ml/fpm.R

diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index a6ff650c33..f81dbab10b 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -505,6 +505,10 @@ SparkR supports the following machine learning models and algorithms.
 * Alternating Least Squares (ALS)
+
+#### Frequent Pattern Mining
+
+* FP-growth
 #### Statistics
 * Kolmogorov-Smirnov Test
@@ -707,7 +711,7 @@ summary(tweedieGLM1)
 We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link:
-tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", 
+tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie",
                          var.power = 1.2, link.power = 0.0)
@@ -906,6 +910,37 @@ predicted <- predict(model, df)
+#### FP-growth
+
+`spark.fpGrowth` executes the FP-growth algorithm to mine frequent itemsets on a `SparkDataFrame`. `itemsCol` should be an array of values.
+
+```{r}
+df <- selectExpr(createDataFrame(data.frame(rawItems = c(
+  "T,R,U", "T,S", "V,R", "R,U,T,V", "R,S", "V,S,U", "U,R", "S,T", "V,R", "V,U,S",
+  "T,V,U", "R,V", "T,S", "T,S", "S,T", "S,U", "T,R", "V,R", "S,V", "T,S,U"
+))), "split(rawItems, ',') AS items")
+
+fpm <- spark.fpGrowth(df, minSupport = 0.2, minConfidence = 0.5)
+```
+
+The `spark.freqItemsets` method can be used to retrieve a `SparkDataFrame` with the frequent itemsets.
+
+```{r}
+head(spark.freqItemsets(fpm))
+```
+
+`spark.associationRules` returns a `SparkDataFrame` with the association rules.
+
+```{r}
+head(spark.associationRules(fpm))
+```
+
+We can make predictions based on the `antecedent`.
+
+```{r}
+head(predict(fpm, df))
+```
+
 #### Kolmogorov-Smirnov Test
 `spark.kstest` runs a two-sided, one-sample [Kolmogorov-Smirnov (KS) test](
diff --git a/examples/src/main/r/ml/fpm.R b/examples/src/main/r/ml/fpm.R
new file mode 100644
index 0000000000..89c4564457
--- /dev/null
+++ b/examples/src/main/r/ml/fpm.R
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# To run this example use
+# ./bin/spark-submit examples/src/main/r/ml/fpm.R
+
+# Load SparkR library into your R session
+library(SparkR)
+
+# Initialize SparkSession
+sparkR.session(appName = "SparkR-ML-fpm-example")
+
+# $example on$
+# Load training data
+
+df <- selectExpr(createDataFrame(data.frame(rawItems = c(
+  "1,2,5", "1,2,3,5", "1,2"
+))), "split(rawItems, ',') AS items")
+
+fpm <- spark.fpGrowth(df, itemsCol = "items", minSupport = 0.5, minConfidence = 0.6)
+
+# Extracting frequent itemsets
+
+head(spark.freqItemsets(fpm))
+
+# Extracting association rules
+
+head(spark.associationRules(fpm))
+
+# Predict uses association rules and combines possible consequents
+
+head(predict(fpm, df))
+
+# $example off$
+
+sparkR.session.stop()