Skip to content
Snippets Groups Projects
Commit ad459cfb authored by actuaryzhang's avatar actuaryzhang Committed by Felix Cheung
Browse files

[SPARK-20917][ML][SPARKR] SparkR supports string encoding consistent with R

## What changes were proposed in this pull request?

Add `stringIndexerOrderType` to `spark.glm` and `spark.survreg` to support string encoding that is consistent with default R.

## How was this patch tested?
new tests

Author: actuaryzhang <actuaryzhang10@gmail.com>

Closes #18140 from actuaryzhang/sparkRFormula.
parent cad88f17
No related branches found
No related tags found
No related merge requests found
......@@ -70,6 +70,12 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#' the relationship between the variance and mean of the distribution. Only
#' applicable to the Tweedie family.
#' @param link.power the index in the power link function. Only applicable to the Tweedie family.
#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
#' decide the base level of a string feature as the last category after
#' ordering is dropped when encoding strings. Supported options are
#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
#' The default value is "frequencyDesc". When the ordering is set to
#' "alphabetDesc", this drops the same category as R when encoding strings.
#' @param ... additional arguments passed to the method.
#' @aliases spark.glm,SparkDataFrame,formula-method
#' @return \code{spark.glm} returns a fitted generalized linear model.
......@@ -79,7 +85,7 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#' @examples
#' \dontrun{
#' sparkR.session()
#' t <- as.data.frame(Titanic)
#' t <- as.data.frame(Titanic, stringsAsFactors = FALSE)
#' df <- createDataFrame(t)
#' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
#' summary(model)
......@@ -96,6 +102,15 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#' savedModel <- read.ml(path)
#' summary(savedModel)
#'
#' # note that the default string encoding is different from R's glm
#' model2 <- glm(Freq ~ Sex + Age, family = "gaussian", data = t)
#' summary(model2)
#' # use stringIndexerOrderType = "alphabetDesc" to force string encoding
#' # to be consistent with R
#' model3 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian",
#' stringIndexerOrderType = "alphabetDesc")
#' summary(model3)
#'
#' # fit tweedie model
#' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie",
#' var.power = 1.2, link.power = 0)
......@@ -110,8 +125,11 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#' @seealso \link{glm}, \link{read.ml}
setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power) {
regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power,
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
"alphabetDesc", "alphabetAsc")) {
stringIndexerOrderType <- match.arg(stringIndexerOrderType)
if (is.character(family)) {
# Handle when family = "tweedie"
if (tolower(family) == "tweedie") {
......@@ -145,7 +163,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
"fit", formula, data@sdf, tolower(family$family), family$link,
tol, as.integer(maxIter), weightCol, regParam,
as.double(var.power), as.double(link.power))
as.double(var.power), as.double(link.power),
stringIndexerOrderType)
new("GeneralizedLinearRegressionModel", jobj = jobj)
})
......@@ -167,6 +186,12 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
#' @param maxit integer giving the maximal number of IRLS iterations.
#' @param var.power the index of the power variance function in the Tweedie family.
#' @param link.power the index of the power link function in the Tweedie family.
#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
#' decide the base level of a string feature as the last category after
#' ordering is dropped when encoding strings. Supported options are
#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
#' The default value is "frequencyDesc". When the ordering is set to
#' "alphabetDesc", this drops the same category as R when encoding strings.
#' @return \code{glm} returns a fitted generalized linear model.
#' @rdname glm
#' @export
......@@ -182,9 +207,12 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
#' @seealso \link{spark.glm}
setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
var.power = 0.0, link.power = 1.0 - var.power) {
var.power = 0.0, link.power = 1.0 - var.power,
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
"alphabetDesc", "alphabetAsc")) {
spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
var.power = var.power, link.power = link.power)
var.power = var.power, link.power = link.power,
stringIndexerOrderType = stringIndexerOrderType)
})
# Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
......@@ -418,6 +446,12 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char
#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
#' or the number of partitions are large, this param could be adjusted to a larger size.
#' This is an expert parameter. Default value should be good for most cases.
#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
#' decide the base level of a string feature as the last category after
#' ordering is dropped when encoding strings. Supported options are
#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
#' The default value is "frequencyDesc". When the ordering is set to
#' "alphabetDesc", this drops the same category as R when encoding strings.
#' @param ... additional arguments passed to the method.
#' @return \code{spark.survreg} returns a fitted AFT survival regression model.
#' @rdname spark.survreg
......@@ -443,10 +477,14 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char
#' }
#' @note spark.survreg since 2.0.0
setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, aggregationDepth = 2) {
function(data, formula, aggregationDepth = 2,
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
"alphabetDesc", "alphabetAsc")) {
stringIndexerOrderType <- match.arg(stringIndexerOrderType)
formula <- paste(deparse(formula), collapse = "")
jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper",
"fit", formula, data@sdf, as.integer(aggregationDepth))
"fit", formula, data@sdf, as.integer(aggregationDepth),
stringIndexerOrderType)
new("AFTSurvivalRegressionModel", jobj = jobj)
})
......
......@@ -367,6 +367,49 @@ test_that("glm save/load", {
unlink(modelPath)
})
test_that("spark.glm and glm with string encoding", {
t <- as.data.frame(Titanic, stringsAsFactors = FALSE)
df <- createDataFrame(t)
# base R
rm <- stats::glm(Freq ~ Sex + Age, family = "gaussian", data = t)
# spark.glm with default stringIndexerOrderType = "frequencyDesc"
sm0 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
# spark.glm with stringIndexerOrderType = "alphabetDesc"
sm1 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian",
stringIndexerOrderType = "alphabetDesc")
# glm with stringIndexerOrderType = "alphabetDesc"
sm2 <- glm(Freq ~ Sex + Age, family = "gaussian", data = df,
stringIndexerOrderType = "alphabetDesc")
rStats <- summary(rm)
rCoefs <- rStats$coefficients
sStats <- lapply(list(sm0, sm1, sm2), summary)
# order by coefficient size since column rendering may be different
o <- order(rCoefs[, 1])
# default encoding does not produce same results as R
expect_false(all(abs(rCoefs[o, ] - sStats[[1]]$coefficients[o, ]) < 1e-4))
# all estimates should be the same as R with stringIndexerOrderType = "alphabetDesc"
test <- lapply(sStats[2:3], function(stats) {
expect_true(all(abs(rCoefs[o, ] - stats$coefficients[o, ]) < 1e-4))
expect_equal(stats$dispersion, rStats$dispersion)
expect_equal(stats$null.deviance, rStats$null.deviance)
expect_equal(stats$deviance, rStats$deviance)
expect_equal(stats$df.null, rStats$df.null)
expect_equal(stats$df.residual, rStats$df.residual)
expect_equal(stats$aic, rStats$aic)
})
# fitted values should be equal regardless of string encoding
rVals <- predict(rm, t)
test <- lapply(list(sm0, sm1, sm2), function(sm) {
vals <- collect(select(predict(sm, df), "prediction"))
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
})
})
test_that("spark.isoreg", {
label <- c(7.0, 5.0, 3.0, 5.0, 1.0)
feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
......@@ -462,6 +505,25 @@ test_that("spark.survreg", {
model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData),
NA)
expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
# Test stringIndexerOrderType
rData <- as.data.frame(rData)
rData$sex2 <- c("female", "male")[rData$sex + 1]
df <- createDataFrame(rData)
expect_error(
rModel <- survival::survreg(survival::Surv(time, status) ~ x + sex2, rData), NA)
rCoefs <- as.numeric(summary(rModel)$table[, 1])
model <- spark.survreg(df, Surv(time, status) ~ x + sex2)
coefs <- as.vector(summary(model)$coefficients[, 1])
o <- order(rCoefs)
# stringIndexerOrderType = "frequencyDesc" produces different estimates from R
expect_false(all(abs(rCoefs[o] - coefs[o]) < 1e-4))
# stringIndexerOrderType = "alphabetDesc" produces the same estimates as R
model <- spark.survreg(df, Surv(time, status) ~ x + sex2,
stringIndexerOrderType = "alphabetDesc")
coefs <- as.vector(summary(model)$coefficients[, 1])
expect_true(all(abs(rCoefs[o] - coefs[o]) < 1e-4))
}
})
......
......@@ -85,11 +85,13 @@ private[r] object AFTSurvivalRegressionWrapper extends MLReadable[AFTSurvivalReg
def fit(
formula: String,
data: DataFrame,
aggregationDepth: Int): AFTSurvivalRegressionWrapper = {
aggregationDepth: Int,
stringIndexerOrderType: String): AFTSurvivalRegressionWrapper = {
val (rewritedFormula, censorCol) = formulaRewrite(formula)
val rFormula = new RFormula().setFormula(rewritedFormula)
.setStringIndexerOrderType(stringIndexerOrderType)
RWrapperUtils.checkDataColumns(rFormula, data)
val rFormulaModel = rFormula.fit(data)
......
......@@ -65,6 +65,7 @@ private[r] class GeneralizedLinearRegressionWrapper private (
private[r] object GeneralizedLinearRegressionWrapper
extends MLReadable[GeneralizedLinearRegressionWrapper] {
// scalastyle:off
def fit(
formula: String,
data: DataFrame,
......@@ -75,8 +76,11 @@ private[r] object GeneralizedLinearRegressionWrapper
weightCol: String,
regParam: Double,
variancePower: Double,
linkPower: Double): GeneralizedLinearRegressionWrapper = {
linkPower: Double,
stringIndexerOrderType: String): GeneralizedLinearRegressionWrapper = {
// scalastyle:on
val rFormula = new RFormula().setFormula(formula)
.setStringIndexerOrderType(stringIndexerOrderType)
checkDataColumns(rFormula, data)
val rFormulaModel = rFormula.fit(data)
// get labels and feature names from output schema
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment