[SPARK-20917][ML][SPARKR] SparkR supports string encoding consistent with R

## What changes were proposed in this pull request? Add `stringIndexerOrderType` to `spark.glm` and `spark.survreg` to support string encoding that is consistent with default R. ## How was this patch tested? new tests Author: actuaryzhang <actuaryzhang10@gmail.com> Closes #18140 from actuaryzhang/sparkRFormula.

[SPARK-20917][ML][SPARKR] SparkR supports string encoding consistent with R
ad459cfb · actuaryzhang · Felix Cheung · cad88f17 · ad459cfb · ad459cfb
Commit ad459cfb authored 7 years ago by actuaryzhang Committed by Felix Cheung 7 years ago
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -70,6 +70,12 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'                      the relationship between the variance and mean of the distribution. Only
 #'                      applicable to the Tweedie family.
 #' @param link.power the index in the power link function. Only applicable to the Tweedie family.
+#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
+#'                               decide the base level of a string feature as the last category after
+#'                               ordering is dropped when encoding strings. Supported options are
+#'                               "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
+#'                               The default value is "frequencyDesc". When the ordering is set to
+#'                               "alphabetDesc", this drops the same category as R when encoding strings.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -79,7 +85,7 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' t <- as.data.frame(Titanic)
+#' t <- as.data.frame(Titanic, stringsAsFactors = FALSE)
 #' df <- createDataFrame(t)
 #' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
 #' summary(model)
@@ -96,6 +102,15 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' savedModel <- read.ml(path)
 #' summary(savedModel)
 #'
+#' # note that the default string encoding is different from R's glm
+#' model2 <- glm(Freq ~ Sex + Age, family = "gaussian", data = t)
+#' summary(model2)
+#' # use stringIndexerOrderType = "alphabetDesc" to force string encoding
+#' # to be consistent with R
+#' model3 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian",
+#'                    stringIndexerOrderType = "alphabetDesc")
+#' summary(model3)
+#'
 #' # fit tweedie model
 #' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie",
 #'                    var.power = 1.2, link.power = 0)
@@ -110,8 +125,11 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' @seealso \link{glm}, \link{read.ml}
 setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
-                   regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power) {
+                   regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power,
+                   stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
+                                              "alphabetDesc", "alphabetAsc")) {

+            stringIndexerOrderType <- match.arg(stringIndexerOrderType)
            if (is.character(family)) {
              # Handle when family = "tweedie"
              if (tolower(family) == "tweedie") {
@@ -145,7 +163,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
            jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                "fit", formula, data@sdf, tolower(family$family), family$link,
                                tol, as.integer(maxIter), weightCol, regParam,
-                                as.double(var.power), as.double(link.power))
+                                as.double(var.power), as.double(link.power),
+                                stringIndexerOrderType)
            new("GeneralizedLinearRegressionModel", jobj = jobj)
          })

@@ -167,6 +186,12 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #' @param maxit integer giving the maximal number of IRLS iterations.
 #' @param var.power the index of the power variance function in the Tweedie family.
 #' @param link.power the index of the power link function in the Tweedie family.
+#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
+#'                               decide the base level of a string feature as the last category after
+#'                               ordering is dropped when encoding strings. Supported options are
+#'                               "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
+#'                               The default value is "frequencyDesc". When the ordering is set to
+#'                               "alphabetDesc", this drops the same category as R when encoding strings.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
 #' @export
@@ -182,9 +207,12 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #' @seealso \link{spark.glm}
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
          function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
-                   var.power = 0.0, link.power = 1.0 - var.power) {
+                   var.power = 0.0, link.power = 1.0 - var.power,
+                   stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
+                                              "alphabetDesc", "alphabetAsc")) {
            spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
-                      var.power = var.power, link.power = link.power)
+                      var.power = var.power, link.power = link.power,
+                      stringIndexerOrderType = stringIndexerOrderType)
          })

 #  Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
@@ -418,6 +446,12 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char
 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
 #'                         or the number of partitions are large, this param could be adjusted to a larger size.
 #'                         This is an expert parameter. Default value should be good for most cases.
+#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
+#'                               decide the base level of a string feature as the last category after
+#'                               ordering is dropped when encoding strings. Supported options are
+#'                               "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
+#'                               The default value is "frequencyDesc". When the ordering is set to
+#'                               "alphabetDesc", this drops the same category as R when encoding strings.
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.survreg} returns a fitted AFT survival regression model.
 #' @rdname spark.survreg
@@ -443,10 +477,14 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char
 #' }
 #' @note spark.survreg since 2.0.0
 setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, aggregationDepth = 2) {
+          function(data, formula, aggregationDepth = 2,
+                   stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
+                                              "alphabetDesc", "alphabetAsc")) {
+            stringIndexerOrderType <- match.arg(stringIndexerOrderType)
            formula <- paste(deparse(formula), collapse = "")
            jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper",
-                                "fit", formula, data@sdf, as.integer(aggregationDepth))
+                                "fit", formula, data@sdf, as.integer(aggregationDepth),
+                                stringIndexerOrderType)
            new("AFTSurvivalRegressionModel", jobj = jobj)
          })


--- a/R/pkg/tests/fulltests/test_mllib_regression.R
+++ b/R/pkg/tests/fulltests/test_mllib_regression.R
@@ -367,6 +367,49 @@ test_that("glm save/load", {
  unlink(modelPath)
 })

+test_that("spark.glm and glm with string encoding", {
+  t <- as.data.frame(Titanic, stringsAsFactors = FALSE)
+  df <- createDataFrame(t)
+
+  # base R
+  rm <- stats::glm(Freq ~ Sex + Age, family = "gaussian", data = t)
+  # spark.glm with default stringIndexerOrderType = "frequencyDesc"
+  sm0 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
+  # spark.glm with stringIndexerOrderType = "alphabetDesc"
+  sm1 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian",
+                   stringIndexerOrderType = "alphabetDesc")
+  # glm with stringIndexerOrderType = "alphabetDesc"
+  sm2 <- glm(Freq ~ Sex + Age, family = "gaussian", data = df,
+                stringIndexerOrderType = "alphabetDesc")
+
+  rStats <- summary(rm)
+  rCoefs <- rStats$coefficients
+  sStats <- lapply(list(sm0, sm1, sm2), summary)
+  # order by coefficient size since column rendering may be different
+  o <- order(rCoefs[, 1])
+
+  # default encoding does not produce same results as R
+  expect_false(all(abs(rCoefs[o, ] - sStats[[1]]$coefficients[o, ]) < 1e-4))
+
+  # all estimates should be the same as R with stringIndexerOrderType = "alphabetDesc"
+  test <- lapply(sStats[2:3], function(stats) {
+    expect_true(all(abs(rCoefs[o, ] - stats$coefficients[o, ]) < 1e-4))
+    expect_equal(stats$dispersion, rStats$dispersion)
+    expect_equal(stats$null.deviance, rStats$null.deviance)
+    expect_equal(stats$deviance, rStats$deviance)
+    expect_equal(stats$df.null, rStats$df.null)
+    expect_equal(stats$df.residual, rStats$df.residual)
+    expect_equal(stats$aic, rStats$aic)
+  })
+
+  # fitted values should be equal regardless of string encoding
+  rVals <- predict(rm, t)
+  test <- lapply(list(sm0, sm1, sm2), function(sm) {
+    vals <- collect(select(predict(sm, df), "prediction"))
+    expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+  })
+})
+
 test_that("spark.isoreg", {
  label <- c(7.0, 5.0, 3.0, 5.0, 1.0)
  feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
@@ -462,6 +505,25 @@ test_that("spark.survreg", {
      model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData),
                                 NA)
    expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
+
+    # Test stringIndexerOrderType
+    rData <- as.data.frame(rData)
+    rData$sex2 <- c("female", "male")[rData$sex + 1]
+    df <- createDataFrame(rData)
+    expect_error(
+      rModel <- survival::survreg(survival::Surv(time, status) ~ x + sex2, rData), NA)
+    rCoefs <- as.numeric(summary(rModel)$table[, 1])
+    model <- spark.survreg(df, Surv(time, status) ~ x + sex2)
+    coefs <- as.vector(summary(model)$coefficients[, 1])
+    o <- order(rCoefs)
+    # stringIndexerOrderType = "frequencyDesc" produces different estimates from R
+    expect_false(all(abs(rCoefs[o] - coefs[o]) < 1e-4))
+
+    # stringIndexerOrderType = "alphabetDesc" produces the same estimates as R
+    model <- spark.survreg(df, Surv(time, status) ~ x + sex2,
+                           stringIndexerOrderType = "alphabetDesc")
+    coefs <- as.vector(summary(model)$coefficients[, 1])
+    expect_true(all(abs(rCoefs[o] - coefs[o]) < 1e-4))
  }
 })


--- a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
@@ -85,11 +85,13 @@ private[r] object AFTSurvivalRegressionWrapper extends MLReadable[AFTSurvivalReg
  def fit(
      formula: String,
      data: DataFrame,
-      aggregationDepth: Int): AFTSurvivalRegressionWrapper = {
+      aggregationDepth: Int,
+      stringIndexerOrderType: String): AFTSurvivalRegressionWrapper = {

    val (rewritedFormula, censorCol) = formulaRewrite(formula)

    val rFormula = new RFormula().setFormula(rewritedFormula)
+      .setStringIndexerOrderType(stringIndexerOrderType)
    RWrapperUtils.checkDataColumns(rFormula, data)
    val rFormulaModel = rFormula.fit(data)


--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -65,6 +65,7 @@ private[r] class GeneralizedLinearRegressionWrapper private (
 private[r] object GeneralizedLinearRegressionWrapper
  extends MLReadable[GeneralizedLinearRegressionWrapper] {

+  // scalastyle:off
  def fit(
      formula: String,
      data: DataFrame,
@@ -75,8 +76,11 @@ private[r] object GeneralizedLinearRegressionWrapper
      weightCol: String,
      regParam: Double,
      variancePower: Double,
-      linkPower: Double): GeneralizedLinearRegressionWrapper = {
+      linkPower: Double,
+      stringIndexerOrderType: String): GeneralizedLinearRegressionWrapper = {
+  // scalastyle:on
    val rFormula = new RFormula().setFormula(formula)
+      .setStringIndexerOrderType(stringIndexerOrderType)
    checkDataColumns(rFormula, data)
    val rFormulaModel = rFormula.fit(data)
    // get labels and feature names from output schema