From fc5fee83e363bc6df22459a9b1ba2ba11bfdfa20 Mon Sep 17 00:00:00 2001 From: Yanbo Liang <ybliang8@gmail.com> Date: Tue, 22 Nov 2016 19:17:48 -0800 Subject: [PATCH] [SPARK-18501][ML][SPARKR] Fix spark.glm errors when fitting on collinear data ## What changes were proposed in this pull request? * Fix SparkR ```spark.glm``` errors when fitting on collinear data, since ```standard error of coefficients, t value and p value``` are not available in this condition. * Scala/Python GLM summary should throw exception if users get ```standard error of coefficients, t value and p value``` but the underlying WLS was solved by local "l-bfgs". ## How was this patch tested? Add unit tests. Author: Yanbo Liang <ybliang8@gmail.com> Closes #15930 from yanboliang/spark-18501. (cherry picked from commit 982b82e32e0fc7d30c5d557944a79eb3e6d2da59) Signed-off-by: Yanbo Liang <ybliang8@gmail.com> --- R/pkg/R/mllib.R | 21 ++++++-- R/pkg/inst/tests/testthat/test_mllib.R | 9 ++++ .../GeneralizedLinearRegressionWrapper.scala | 54 +++++++++++-------- .../GeneralizedLinearRegression.scala | 46 +++++++++++++--- .../GeneralizedLinearRegressionSuite.scala | 21 ++++++++ 5 files changed, 115 insertions(+), 36 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 265e64e746..02bc6456de 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -278,8 +278,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat #' @param object a fitted generalized linear model. #' @return \code{summary} returns a summary object of the fitted model, a list of components -#' including at least the coefficients, null/residual deviance, null/residual degrees -#' of freedom, AIC and number of iterations IRLS takes. +#' including at least the coefficients matrix (which includes coefficients, standard error +#' of coefficients, t value and p value), null/residual deviance, null/residual degrees of +#' freedom, AIC and number of iterations IRLS takes. 
If there are collinear columns +#' in your data, the coefficients matrix only provides coefficients. #' #' @rdname spark.glm #' @export @@ -303,9 +305,18 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), } else { dataFrame(callJMethod(jobj, "rDevianceResiduals")) } - coefficients <- matrix(coefficients, ncol = 4) - colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)") - rownames(coefficients) <- unlist(features) + # If the underlying WeightedLeastSquares uses the "normal" solver, we can provide + # coefficients, standard error of coefficients, t value and p value. Otherwise, + # it is fitted by local "l-bfgs" and we can only provide coefficients. + if (length(features) == length(coefficients)) { + coefficients <- matrix(coefficients, ncol = 1) + colnames(coefficients) <- c("Estimate") + rownames(coefficients) <- unlist(features) + } else { + coefficients <- matrix(coefficients, ncol = 4) + colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)") + rownames(coefficients) <- unlist(features) + } ans <- list(deviance.resid = deviance.resid, coefficients = coefficients, dispersion = dispersion, null.deviance = null.deviance, deviance = deviance, df.null = df.null, df.residual = df.residual, diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 70a033de53..b05be476a3 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -169,6 +169,15 @@ test_that("spark.glm summary", { df <- suppressWarnings(createDataFrame(data)) regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0)) expect_equal(regStats$aic, 14.00976, tolerance = 1e-4) # 14.00976 is from summary() result + + # Test spark.glm works on collinear data + A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2) + b <- c(1, 2, 3, 4) + data <- as.data.frame(cbind(A, b)) + df <- createDataFrame(data) + stats <- summary(spark.glm(df, b ~ . 
- 1)) + coefs <- unlist(stats$coefficients) + expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4)) }) test_that("spark.glm save/load", { diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index add4d49110..8bcc9fe5d1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -144,30 +144,38 @@ private[r] object GeneralizedLinearRegressionWrapper features } - val rCoefficientStandardErrors = if (glm.getFitIntercept) { - Array(summary.coefficientStandardErrors.last) ++ - summary.coefficientStandardErrors.dropRight(1) + val rCoefficients: Array[Double] = if (summary.isNormalSolver) { + val rCoefficientStandardErrors = if (glm.getFitIntercept) { + Array(summary.coefficientStandardErrors.last) ++ + summary.coefficientStandardErrors.dropRight(1) + } else { + summary.coefficientStandardErrors + } + + val rTValues = if (glm.getFitIntercept) { + Array(summary.tValues.last) ++ summary.tValues.dropRight(1) + } else { + summary.tValues + } + + val rPValues = if (glm.getFitIntercept) { + Array(summary.pValues.last) ++ summary.pValues.dropRight(1) + } else { + summary.pValues + } + + if (glm.getFitIntercept) { + Array(glm.intercept) ++ glm.coefficients.toArray ++ + rCoefficientStandardErrors ++ rTValues ++ rPValues + } else { + glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues + } } else { - summary.coefficientStandardErrors - } - - val rTValues = if (glm.getFitIntercept) { - Array(summary.tValues.last) ++ summary.tValues.dropRight(1) - } else { - summary.tValues - } - - val rPValues = if (glm.getFitIntercept) { - Array(summary.pValues.last) ++ summary.pValues.dropRight(1) - } else { - summary.pValues - } - - val rCoefficients: Array[Double] = if (glm.getFitIntercept) { - Array(glm.intercept) 
++ glm.coefficients.toArray ++ - rCoefficientStandardErrors ++ rTValues ++ rPValues - } else { - glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues + if (glm.getFitIntercept) { + Array(glm.intercept) ++ glm.coefficients.toArray + } else { + glm.coefficients.toArray + } } val rDispersion: Double = summary.dispersion diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 3f9de1fe74..f33dd0fd29 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1063,45 +1063,75 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( import GeneralizedLinearRegression._ + /** + * Whether the underlying [[WeightedLeastSquares]] uses the "normal" solver. + */ + private[ml] val isNormalSolver: Boolean = { + diagInvAtWA.length != 1 || diagInvAtWA(0) != 0 + } + /** * Standard error of estimated coefficients and intercept. + * This value is only available when the underlying [[WeightedLeastSquares]] + * uses the "normal" solver. * * If [[GeneralizedLinearRegression.fitIntercept]] is set to true, * then the last element returned corresponds to the intercept. */ @Since("2.0.0") lazy val coefficientStandardErrors: Array[Double] = { - diagInvAtWA.map(_ * dispersion).map(math.sqrt) + if (isNormalSolver) { + diagInvAtWA.map(_ * dispersion).map(math.sqrt) + } else { + throw new UnsupportedOperationException( + "No Std. Error of coefficients available for this GeneralizedLinearRegressionModel") + } } /** * T-statistic of estimated coefficients and intercept. + * This value is only available when the underlying [[WeightedLeastSquares]] + * uses the "normal" solver. 
* * If [[GeneralizedLinearRegression.fitIntercept]] is set to true, * then the last element returned corresponds to the intercept. */ @Since("2.0.0") lazy val tValues: Array[Double] = { - val estimate = if (model.getFitIntercept) { - Array.concat(model.coefficients.toArray, Array(model.intercept)) + if (isNormalSolver) { + val estimate = if (model.getFitIntercept) { + Array.concat(model.coefficients.toArray, Array(model.intercept)) + } else { + model.coefficients.toArray + } + estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 } } else { - model.coefficients.toArray + throw new UnsupportedOperationException( + "No t-statistic available for this GeneralizedLinearRegressionModel") } - estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 } } /** * Two-sided p-value of estimated coefficients and intercept. + * This value is only available when the underlying [[WeightedLeastSquares]] + * uses the "normal" solver. * * If [[GeneralizedLinearRegression.fitIntercept]] is set to true, * then the last element returned corresponds to the intercept. 
*/ @Since("2.0.0") lazy val pValues: Array[Double] = { - if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) { - tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) } + if (isNormalSolver) { + if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) { + tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) } + } else { + tValues.map { x => + 2.0 * (1.0 - dist.StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x))) + } + } } else { - tValues.map { x => 2.0 * (1.0 - dist.StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x))) } + throw new UnsupportedOperationException( + "No p-value available for this GeneralizedLinearRegressionModel") } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 9b0fa67630..4fab216033 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1048,6 +1048,27 @@ class GeneralizedLinearRegressionSuite assert(summary.solver === "irls") } + test("glm handle collinear features") { + val collinearInstances = Seq( + Instance(1.0, 1.0, Vectors.dense(1.0, 2.0)), + Instance(2.0, 1.0, Vectors.dense(2.0, 4.0)), + Instance(3.0, 1.0, Vectors.dense(3.0, 6.0)), + Instance(4.0, 1.0, Vectors.dense(4.0, 8.0)) + ).toDF() + val trainer = new GeneralizedLinearRegression() + val model = trainer.fit(collinearInstances) + // to make it clear that underlying WLS did not solve analytically + intercept[UnsupportedOperationException] { + model.summary.coefficientStandardErrors + } + intercept[UnsupportedOperationException] { + model.summary.pValues + } + intercept[UnsupportedOperationException] { + model.summary.tValues + } + } + test("read/write") { def checkModelData( model: 
GeneralizedLinearRegressionModel, -- GitLab