From 2ad031be67c7a0f0c4895c084c891330a9ec935e Mon Sep 17 00:00:00 2001 From: Yanbo Liang <ybliang8@gmail.com> Date: Mon, 11 Jul 2016 14:31:11 -0700 Subject: [PATCH] [SPARKR][DOC] SparkR ML user guides update for 2.0 ## What changes were proposed in this pull request? * Update SparkR ML section to make them consistent with SparkR API docs. * Since #13972 adds labelling support for the ```include_example``` Jekyll plugin, so that we can split the single ```ml.R``` example file into multiple line blocks with different labels, and include them in different algorithms/models in the generated HTML page. ## How was this patch tested? Only docs update, manually check the generated docs. Author: Yanbo Liang <ybliang8@gmail.com> Closes #14011 from yanboliang/r-user-guide-update. --- R/pkg/R/mllib.R | 8 +++++--- docs/sparkr.md | 43 +++++++++++++++++++++++----------------- examples/src/main/r/ml.R | 22 ++++++++++---------- 3 files changed, 41 insertions(+), 32 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 4fe73671f8..e9fd0c75c1 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -55,8 +55,9 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' Generalized Linear Models #' -#' Fits generalized linear model against a Spark DataFrame. Users can print, make predictions on the -#' produced model and save the model to the input path. +#' Fits generalized linear model against a Spark DataFrame. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula @@ -270,7 +271,8 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' K-Means Clustering Model #' #' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). 
-#' Users can print, make predictions on the produced model and save the model to the input path. +#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make +#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. #' #' @param data SparkDataFrame for training #' @param formula A symbolic description of the model to be fitted. Currently only a few formula diff --git a/docs/sparkr.md b/docs/sparkr.md index 32ef815eb1..b4acb23040 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -355,32 +355,39 @@ head(teenagers) # Machine Learning -SparkR supports the following Machine Learning algorithms. +SparkR supports the following machine learning algorithms currently: `Generalized Linear Model`, `Accelerated Failure Time (AFT) Survival Regression Model`, `Naive Bayes Model` and `KMeans Model`. +Under the hood, SparkR uses MLlib to train the model. +Users can call `summary` to print a summary of the fitted model, [predict](api/R/predict.html) to make predictions on new data, and [write.ml](api/R/write.ml.html)/[read.ml](api/R/read.ml.html) to save/load fitted models. +SparkR supports a subset of the available R formula operators for model fitting, including ‘~’, ‘.’, ‘:’, ‘+’, and ‘-’. -* Generalized Linear Regression Model [spark.glm()](api/R/spark.glm.html) -* Naive Bayes [spark.naiveBayes()](api/R/spark.naiveBayes.html) -* KMeans [spark.kmeans()](api/R/spark.kmeans.html) -* AFT Survival Regression [spark.survreg()](api/R/spark.survreg.html) +## Algorithms -[Generalized Linear Regression](api/R/spark.glm.html) can be used to train a model from a specified family. Currently the Gaussian, Binomial, Poisson and Gamma families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+', and '-'. +### Generalized Linear Model -The [summary()](api/R/summary.html) function gives the summary of a model produced by different algorithms listed above. 
-It produces the similar result compared with R summary function. +[spark.glm()](api/R/spark.glm.html) or [glm()](api/R/glm.html) fits a generalized linear model against a Spark DataFrame. +Currently "gaussian", "binomial", "poisson" and "gamma" families are supported. +{% include_example glm r/ml.R %} -## Model persistence +### Accelerated Failure Time (AFT) Survival Regression Model + +[spark.survreg()](api/R/spark.survreg.html) fits an accelerated failure time (AFT) survival regression model on a SparkDataFrame. +Note that the formula of [spark.survreg()](api/R/spark.survreg.html) does not support operator '.' currently. +{% include_example survreg r/ml.R %} + +### Naive Bayes Model -* [write.ml](api/R/write.ml.html) allows users to save a fitted model in a given input path -* [read.ml](api/R/read.ml.html) allows users to read/load the model which was saved using write.ml in a given path +[spark.naiveBayes()](api/R/spark.naiveBayes.html) fits a Bernoulli naive Bayes model against a SparkDataFrame. Only categorical data is supported. +{% include_example naiveBayes r/ml.R %} -Model persistence is supported for all Machine Learning algorithms for all families. +### KMeans Model -The examples below show how to build several models: -* GLM using the Gaussian and Binomial model families -* AFT survival regression model -* Naive Bayes model -* K-Means model +[spark.kmeans()](api/R/spark.kmeans.html) fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). +{% include_example kmeans r/ml.R %} + +## Model persistence -{% include_example r/ml.R %} +The following example shows how to save/load an MLlib model with SparkR. 
+{% include_example read_write r/ml.R %} # R Function Name Conflicts diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R index 65242e68b3..a8a1274ac9 100644 --- a/examples/src/main/r/ml.R +++ b/examples/src/main/r/ml.R @@ -24,9 +24,8 @@ library(SparkR) # Initialize SparkSession sparkR.session(appName = "SparkR-ML-example") -# $example on$ ############################ spark.glm and glm ############################################## - +# $example on:glm$ irisDF <- suppressWarnings(createDataFrame(iris)) # Fit a generalized linear model of family "gaussian" with spark.glm gaussianDF <- irisDF @@ -55,8 +54,9 @@ summary(binomialGLM) # Prediction binomialPredictions <- predict(binomialGLM, binomialTestDF) showDF(binomialPredictions) - +# $example off:glm$ ############################ spark.survreg ############################################## +# $example on:survreg$ # Use the ovarian dataset available in R survival package library(survival) @@ -72,9 +72,9 @@ summary(aftModel) # Prediction aftPredictions <- predict(aftModel, aftTestDF) showDF(aftPredictions) - +# $example off:survreg$ ############################ spark.naiveBayes ############################################## - +# $example on:naiveBayes$ # Fit a Bernoulli naive Bayes model with spark.naiveBayes titanic <- as.data.frame(Titanic) titanicDF <- createDataFrame(titanic[titanic$Freq > 0, -5]) @@ -88,9 +88,9 @@ summary(nbModel) # Prediction nbPredictions <- predict(nbModel, nbTestDF) showDF(nbPredictions) - +# $example off:naiveBayes$ ############################ spark.kmeans ############################################## - +# $example on:kmeans$ # Fit a k-means model with spark.kmeans irisDF <- suppressWarnings(createDataFrame(iris)) kmeansDF <- irisDF @@ -107,9 +107,9 @@ showDF(fitted(kmeansModel)) # Prediction kmeansPredictions <- predict(kmeansModel, kmeansTestDF) showDF(kmeansPredictions) - +# $example off:kmeans$ ############################ model read/write 
############################################## - +# $example on:read_write$ irisDF <- suppressWarnings(createDataFrame(iris)) # Fit a generalized linear model of family "gaussian" with spark.glm gaussianDF <- irisDF @@ -120,7 +120,7 @@ gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, famil modelPath <- tempfile(pattern = "ml", fileext = ".tmp") write.ml(gaussianGLM, modelPath) gaussianGLM2 <- read.ml(modelPath) -# $example off$ + # Check model summary summary(gaussianGLM2) @@ -129,7 +129,7 @@ gaussianPredictions <- predict(gaussianGLM2, gaussianTestDF) showDF(gaussianPredictions) unlink(modelPath) - +# $example off:read_write$ ############################ fit models with spark.lapply ##################################### # Perform distributed training of multiple models with spark.lapply -- GitLab