From be85245a98d58f636ff54956cdfde15ea5cd6122 Mon Sep 17 00:00:00 2001 From: sethah <seth.hendrickson16@gmail.com> Date: Sat, 25 Mar 2017 17:41:59 +0000 Subject: [PATCH] [SPARK-17137][ML][WIP] Compress logistic regression coefficients ## What changes were proposed in this pull request? Use the new `compressed` method on matrices to store the logistic regression coefficients as sparse or dense - whichever is requires less memory. Marked as WIP so we can add some performance test results. Basically, we should see if prediction is slower because of using a sparse matrix over a dense one. This can happen since sparse matrices do not use native BLAS operations when computing the margins. ## How was this patch tested? Unit tests added. Author: sethah <seth.hendrickson16@gmail.com> Closes #17426 from sethah/SPARK-17137. --- .../classification/LogisticRegression.scala | 28 ++------- .../LogisticRegressionSuite.scala | 58 ++++++++++++++----- 2 files changed, 49 insertions(+), 37 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 1a78187d4f..7b56bce41c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -399,14 +399,9 @@ class LogisticRegression @Since("1.2.0") ( logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " + s"will be zeros. Training is not needed.") val constantLabelIndex = Vectors.dense(histogram).argmax - // TODO: use `compressed` after SPARK-17471 - val coefMatrix = if (numFeatures < numCoefficientSets) { - new SparseMatrix(numCoefficientSets, numFeatures, - Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double]) - } else { - new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numCoefficientSets + 1)(0), - Array.empty[Int], Array.empty[Double], isTransposed = true) - } + val coefMatrix = new SparseMatrix(numCoefficientSets, numFeatures, + new Array[Int](numCoefficientSets + 1), Array.empty[Int], Array.empty[Double], + isTransposed = true).compressed val interceptVec = if (isMultinomial) { Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity))) } else { @@ -617,26 +612,13 @@ class LogisticRegression @Since("1.2.0") ( denseCoefficientMatrix.update(_ - coefficientMean) } - // TODO: use `denseCoefficientMatrix.compressed` after SPARK-17471 - val compressedCoefficientMatrix = if (isMultinomial) { - denseCoefficientMatrix - } else { - val compressedVector = Vectors.dense(denseCoefficientMatrix.values).compressed - compressedVector match { - case dv: DenseVector => denseCoefficientMatrix - case sv: SparseVector => - new SparseMatrix(1, numFeatures, Array(0, sv.indices.length), sv.indices, sv.values, - isTransposed = true) - } - } - // center the intercepts when using multinomial algorithm if ($(fitIntercept) && isMultinomial) { val interceptArray = interceptVec.toArray val interceptMean = interceptArray.sum / interceptArray.length (0 until interceptVec.size).foreach { i => interceptArray(i) -= interceptMean } } - (compressedCoefficientMatrix, interceptVec.compressed, arrayBuilder.result()) + (denseCoefficientMatrix.compressed, interceptVec.compressed, arrayBuilder.result()) } } @@ -713,7 +695,7 @@ class LogisticRegressionModel private[spark] ( // convert to appropriate vector representation without replicating data private lazy val _coefficients: Vector = { require(coefficientMatrix.isTransposed, - "LogisticRegressionModel coefficients should be row major.") + "LogisticRegressionModel coefficients should be row major for binomial model.") coefficientMatrix match { case dm: DenseMatrix => Vectors.dense(dm.values) case sm: SparseMatrix => Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index affaa57374..1b64480373 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.{Instance, LabeledPoint} -import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, SparseVector, Vector, Vectors} +import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vector, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamsSuite} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ @@ -713,8 +713,6 @@ class LogisticRegressionSuite assert(model2.intercept ~== interceptR relTol 1E-2) assert(model2.coefficients ~== coefficientsR absTol 1E-3) - // TODO: move this to a standalone test of compression after SPARK-17471 - assert(model2.coefficients.isInstanceOf[SparseVector]) } test("binary logistic regression without intercept with L1 regularization") { @@ -2031,29 +2029,61 @@ class LogisticRegressionSuite // TODO: check num iters is zero when it become available in the model } - test("compressed storage") { + test("compressed storage for constant label") { + /* + When the label is constant and fit intercept is true, all the coefficients will be + zeros, and so the model coefficients should be stored as sparse data structures, except + when the matrix dimensions are very small. + */ val moreClassesThanFeatures = Seq( - LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)), - LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)), - LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0))).toDF() - val mlr = new LogisticRegression().setFamily("multinomial") + LabeledPoint(4.0, Vectors.dense(Array.fill(5)(0.0))), + LabeledPoint(4.0, Vectors.dense(Array.fill(5)(1.0))), + LabeledPoint(4.0, Vectors.dense(Array.fill(5)(2.0)))).toDF() + val mlr = new LogisticRegression().setFamily("multinomial").setFitIntercept(true) val model = mlr.fit(moreClassesThanFeatures) assert(model.coefficientMatrix.isInstanceOf[SparseMatrix]) - assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4) + assert(model.coefficientMatrix.isColMajor) + + // in this case, it should be stored as row major val moreFeaturesThanClasses = Seq( - LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)), - LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)), - LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0))).toDF() + LabeledPoint(1.0, Vectors.dense(Array.fill(5)(0.0))), + LabeledPoint(1.0, Vectors.dense(Array.fill(5)(1.0))), + LabeledPoint(1.0, Vectors.dense(Array.fill(5)(2.0)))).toDF() val model2 = mlr.fit(moreFeaturesThanClasses) assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix]) - assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3) + assert(model2.coefficientMatrix.isRowMajor) - val blr = new LogisticRegression().setFamily("binomial") + val blr = new LogisticRegression().setFamily("binomial").setFitIntercept(true) val blrModel = blr.fit(moreFeaturesThanClasses) assert(blrModel.coefficientMatrix.isInstanceOf[SparseMatrix]) assert(blrModel.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 2) } + test("compressed coefficients") { + + val trainer1 = new LogisticRegression() + .setRegParam(0.1) + .setElasticNetParam(1.0) + + // compressed row major is optimal + val model1 = trainer1.fit(multinomialDataset.limit(100)) + assert(model1.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(model1.coefficientMatrix.isRowMajor) + + // compressed column major is optimal since there are more classes than features + val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata() + val model2 = trainer1.fit(multinomialDataset + .withColumn("label", col("label").as("label", labelMeta)).limit(100)) + assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(model2.coefficientMatrix.isColMajor) + + // coefficients are dense without L1 regularization + val trainer2 = new LogisticRegression() + .setElasticNetParam(0.0) + val model3 = trainer2.fit(multinomialDataset.limit(100)) + assert(model3.coefficientMatrix.isInstanceOf[DenseMatrix]) + } + test("numClasses specified in metadata/inferred") { val lr = new LogisticRegression().setMaxIter(1).setFamily("multinomial") -- GitLab