Commit 9a9c6f5c authored by Yanbo Liang, committed by Xiangrui Meng

[SPARK-15222][SPARKR][ML] SparkR ML examples update in 2.0

## What changes were proposed in this pull request?
Update example code in examples/src/main/r/ml.R to reflect the new algorithms (a condensed sketch of the shared workflow follows this list):
* spark.glm and glm
* spark.survreg
* spark.naiveBayes
* spark.kmeans
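
All four wrappers follow the same fit → summary → predict workflow and reuse the `write.ml`/`read.ml` persistence API exercised later in ml.R. The sketch below is not part of the patch; it is assembled only from calls that appear in the updated example, with `spark.glm` as the representative wrapper and the same session setup as ml.R.

```r
# Condensed SparkR 2.0 ML workflow shared by the new wrappers (spark.glm shown);
# session setup mirrors examples/src/main/r/ml.R.
library(SparkR)

sc <- sparkR.init(appName = "SparkR-ML-example")
sqlContext <- sparkRSQL.init(sc)

# Convert a local R data.frame into a Spark DataFrame
irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))

# Fit, inspect, and predict
gaussianGLM <- spark.glm(irisDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
summary(gaussianGLM)
showDF(predict(gaussianGLM, irisDF))

# Persist the fitted model and load it back
modelPath <- tempfile(pattern = "ml", fileext = ".tmp")
write.ml(gaussianGLM, modelPath)
reloaded <- read.ml(modelPath)
summary(reloaded)
unlink(modelPath)

sparkR.stop()
```

spark.survreg, spark.naiveBayes, and spark.kmeans follow the same pattern, differing only in the formula and algorithm-specific arguments such as k for k-means.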

## How was this patch tested?
Offline test.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13000 from yanboliang/spark-15222.
parent a3ceb875
@@ -16,7 +16,7 @@
 #
 # To run this example use
-# ./bin/sparkR examples/src/main/r/ml.R
+# ./bin/spark-submit examples/src/main/r/ml.R
 # Load SparkR library into your R session
 library(SparkR)
@@ -25,30 +25,125 @@ library(SparkR)
 sc <- sparkR.init(appName="SparkR-ML-example")
 sqlContext <- sparkRSQL.init(sc)
-# Train GLM of family 'gaussian'
-training1 <- suppressWarnings(createDataFrame(sqlContext, iris))
-test1 <- training1
-model1 <- glm(Sepal_Length ~ Sepal_Width + Species, training1, family = "gaussian")
+############################ spark.glm and glm ##############################################
+irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+# Fit a generalized linear model of family "gaussian" with spark.glm
+gaussianDF <- irisDF
+gaussianTestDF <- irisDF
+gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
 # Model summary
-summary(model1)
+summary(gaussianGLM)
 # Prediction
-predictions1 <- predict(model1, test1)
-head(select(predictions1, "Sepal_Length", "prediction"))
+gaussianPredictions <- predict(gaussianGLM, gaussianTestDF)
+showDF(gaussianPredictions)
+# Fit a generalized linear model with glm (R-compliant)
+gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian")
+summary(gaussianGLM2)
+# Fit a generalized linear model of family "binomial" with spark.glm
+binomialDF <- filter(irisDF, irisDF$Species != "setosa")
+binomialTestDF <- binomialDF
+binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial")
+# Model summary
+summary(binomialGLM)
+# Prediction
+binomialPredictions <- predict(binomialGLM, binomialTestDF)
+showDF(binomialPredictions)
+############################ spark.survreg ##############################################
+# Use the ovarian dataset available in R survival package
+library(survival)
-# Train GLM of family 'binomial'
-training2 <- filter(training1, training1$Species != "setosa")
-test2 <- training2
-model2 <- glm(Species ~ Sepal_Length + Sepal_Width, data = training2, family = "binomial")
+# Fit an accelerated failure time (AFT) survival regression model with spark.survreg
+ovarianDF <- suppressWarnings(createDataFrame(sqlContext, ovarian))
+aftDF <- ovarianDF
+aftTestDF <- ovarianDF
+aftModel <- spark.survreg(aftDF, Surv(futime, fustat) ~ ecog_ps + rx)
 # Model summary
-summary(model2)
+summary(aftModel)
 # Prediction
+aftPredictions <- predict(aftModel, aftTestDF)
+showDF(aftPredictions)
+############################ spark.naiveBayes ##############################################
+# Fit a Bernoulli naive Bayes model with spark.naiveBayes
+titanic <- as.data.frame(Titanic)
+titanicDF <- suppressWarnings(createDataFrame(sqlContext, titanic[titanic$Freq > 0, -5]))
+nbDF <- titanicDF
+nbTestDF <- titanicDF
+nbModel <- spark.naiveBayes(nbDF, Survived ~ Class + Sex + Age)
+# Model summary
+summary(nbModel)
+# Prediction
+nbPredictions <- predict(nbModel, nbTestDF)
+showDF(nbPredictions)
+############################ spark.kmeans ##############################################
+# Fit a k-means model with spark.kmeans
+irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+kmeansDF <- irisDF
+kmeansTestDF <- irisDF
+kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
+                            k = 3)
+# Model summary
+summary(kmeansModel)
+# Get fitted result from the k-means model
+showDF(fitted(kmeansModel))
+# Prediction
+kmeansPredictions <- predict(kmeansModel, kmeansTestDF)
+showDF(kmeansPredictions)
+############################ model read/write ##############################################
+irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+# Fit a generalized linear model of family "gaussian" with spark.glm
+gaussianDF <- irisDF
+gaussianTestDF <- irisDF
+gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
+# Save and then load a fitted MLlib model
+modelPath <- tempfile(pattern = "ml", fileext = ".tmp")
+write.ml(gaussianGLM, modelPath)
+gaussianGLM2 <- read.ml(modelPath)
+# Check model summary
+summary(gaussianGLM2)
+# Check model prediction
+gaussianPredictions <- predict(gaussianGLM2, gaussianTestDF)
+showDF(gaussianPredictions)
+unlink(modelPath)
+############################ fit models with spark.lapply #####################################
+# Perform distributed training of multiple models with spark.lapply
+families <- c("gaussian", "poisson")
+train <- function(family) {
+  model <- glm(Sepal.Length ~ Sepal.Width + Species, iris, family = family)
+  summary(model)
+}
+model.summaries <- spark.lapply(sc, families, train)
+# Print the summary of each model
+print(model.summaries)
-# Prediction (Currently the output of prediction for binomial GLM is the indexed label,
-# we need to transform back to the original string label later)
-predictions2 <- predict(model2, test2)
-head(select(predictions2, "Species", "prediction"))
 # Stop the SparkContext now
 sparkR.stop()