diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index e612a2122ed6293238c8b9cc8f44aedee3d2fbe7..8617722ae542f7dc5fab85626b042542abcb66e5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -75,7 +75,7 @@ private[ml] class WeightedLeastSquares( val summary = instances.treeAggregate(new Aggregator)(_.add(_), _.merge(_)) summary.validate() logInfo(s"Number of instances: ${summary.count}.") - val k = summary.k + val k = if (fitIntercept) summary.k + 1 else summary.k val triK = summary.triK val wSum = summary.wSum val bBar = summary.bBar @@ -86,14 +86,6 @@ private[ml] class WeightedLeastSquares( val aaBar = summary.aaBar val aaValues = aaBar.values - if (fitIntercept) { - // shift centers - // A^T A - aBar aBar^T - BLAS.spr(-1.0, aBar, aaValues) - // A^T b - bBar aBar - BLAS.axpy(-bBar, aBar, abBar) - } - // add regularization to diagonals var i = 0 var j = 2 @@ -111,21 +103,32 @@ private[ml] class WeightedLeastSquares( j += 1 } - val x = new DenseVector(CholeskyDecomposition.solve(aaBar.values, abBar.values)) + val aa = if (fitIntercept) { + Array.concat(aaBar.values, aBar.values, Array(1.0)) + } else { + aaBar.values + } + val ab = if (fitIntercept) { + Array.concat(abBar.values, Array(bBar)) + } else { + abBar.values + } + + val x = CholeskyDecomposition.solve(aa, ab) + + val aaInv = CholeskyDecomposition.inverse(aa, k) - val aaInv = CholeskyDecomposition.inverse(aaBar.values, k) // aaInv is a packed upper triangular matrix, here we get all elements on diagonal val diagInvAtWA = new DenseVector((1 to k).map { i => aaInv(i + (i - 1) * i / 2 - 1) / wSum }.toArray) - // compute intercept - val intercept = if (fitIntercept) { - bBar - BLAS.dot(aBar, x) + val (coefficients, intercept) = if (fitIntercept) { + (new DenseVector(x.slice(0, x.length - 1)), x.last) } else { - 0.0 + (new DenseVector(x), 0.0) } - new WeightedLeastSquaresModel(x, intercept, diagInvAtWA) + new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index c51e30483ab3dac79592d3a66ed16e59a6f503fc..6638313818703fdbfcd229a477b364dacebee401 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -511,8 +511,7 @@ class LinearRegressionSummary private[regression] ( } /** - * Standard error of estimated coefficients. - * Note that standard error of estimated intercept is not supported currently. + * Standard error of estimated coefficients and intercept. */ lazy val coefficientStandardErrors: Array[Double] = { if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) { @@ -532,21 +531,26 @@ class LinearRegressionSummary private[regression] ( } } - /** T-statistic of estimated coefficients. - * Note that t-statistic of estimated intercept is not supported currently. - */ + /** + * T-statistic of estimated coefficients and intercept. + */ lazy val tValues: Array[Double] = { if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) { throw new UnsupportedOperationException( "No t-statistic available for this LinearRegressionModel") } else { - model.coefficients.toArray.zip(coefficientStandardErrors).map { x => x._1 / x._2 } + val estimate = if (model.getFitIntercept) { + Array.concat(model.coefficients.toArray, Array(model.intercept)) + } else { + model.coefficients.toArray + } + estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 } } } - /** Two-sided p-value of estimated coefficients. - * Note that p-value of estimated intercept is not supported currently. - */ + /** + * Two-sided p-value of estimated coefficients and intercept. + */ lazy val pValues: Array[Double] = { if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) { throw new UnsupportedOperationException( diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index fbf83e892286124500a5587055421ce1987afe10..a1d86fe8fedadb878d0792ff5c0b1aee58daebc4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -621,13 +621,13 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.summary.objectiveHistory.length == 1) assert(model.summary.objectiveHistory(0) == 0.0) val devianceResidualsR = Array(-0.35566, 0.34504) - val seCoefR = Array(0.0011756, 0.0009032) - val tValsR = Array(3998, 7971) - val pValsR = Array(0, 0) + val seCoefR = Array(0.0011756, 0.0009032, 0.0018489) + val tValsR = Array(3998, 7971, 3407) + val pValsR = Array(0, 0, 0) model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x => - assert(x._1 ~== x._2 absTol 1E-3) } + assert(x._1 ~== x._2 absTol 1E-5) } model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => - assert(x._1 ~== x._2 absTol 1E-3) } + assert(x._1 ~== x._2 absTol 1E-5) } model.summary.tValues.map(_.round).zip(tValsR).foreach{ x => assert(x._1 === x._2) } model.summary.pValues.map(_.round).zip(pValsR).foreach{ x => assert(x._1 === x._2) } } @@ -789,9 +789,9 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val coefficientsR = Vectors.dense(Array(6.080, -0.600)) val interceptR = 18.080 val devianceResidualsR = Array(-1.358, 1.920) - val seCoefR = Array(5.556, 1.960) - val tValsR = Array(1.094, -0.306) - val pValsR = Array(0.471, 0.811) + val seCoefR = Array(5.556, 1.960, 9.608) + val tValsR = Array(1.094, -0.306, 1.882) + val pValsR = Array(0.471, 0.811, 0.311) assert(model.coefficients ~== coefficientsR absTol 1E-3) assert(model.intercept ~== interceptR absTol 1E-3) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index d7b4fd92c3817e32c090a59d04ac0deeacddcbb6..7648bf13266bfb6cf4d9a31afb10b41fd15a7855 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -55,15 +55,15 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction >>> lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal") >>> model = lr.fit(df) >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> model.transform(test0).head().prediction - -1.0 - >>> model.weights - DenseVector([1.0]) - >>> model.intercept - 0.0 + >>> abs(model.transform(test0).head().prediction - (-1.0)) < 0.001 + True + >>> abs(model.coefficients[0] - 1.0) < 0.001 + True + >>> abs(model.intercept - 0.0) < 0.001 + True >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) - >>> model.transform(test1).head().prediction - 1.0 + >>> abs(model.transform(test1).head().prediction - 1.0) < 0.001 + True >>> lr.setParams("vector") Traceback (most recent call last): ...