From be85245a98d58f636ff54956cdfde15ea5cd6122 Mon Sep 17 00:00:00 2001
From: sethah <seth.hendrickson16@gmail.com>
Date: Sat, 25 Mar 2017 17:41:59 +0000
Subject: [PATCH] [SPARK-17137][ML][WIP] Compress logistic regression
 coefficients

## What changes were proposed in this pull request?

Use the new `compressed` method on matrices to store the logistic regression coefficients as sparse or dense - whichever requires less memory.

Marked as WIP so we can add some performance test results. Basically, we should see if prediction is slower because of using a sparse matrix over a dense one. This can happen since sparse matrices do not use native BLAS operations when computing the margins.

## How was this patch tested?

Unit tests added.

Author: sethah <seth.hendrickson16@gmail.com>

Closes #17426 from sethah/SPARK-17137.
---
 .../classification/LogisticRegression.scala   | 28 ++-------
 .../LogisticRegressionSuite.scala             | 58 ++++++++++++++-----
 2 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 1a78187d4f..7b56bce41c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -399,14 +399,9 @@ class LogisticRegression @Since("1.2.0") (
         logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
           s"will be zeros. Training is not needed.")
         val constantLabelIndex = Vectors.dense(histogram).argmax
-        // TODO: use `compressed` after SPARK-17471
-        val coefMatrix = if (numFeatures < numCoefficientSets) {
-          new SparseMatrix(numCoefficientSets, numFeatures,
-            Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double])
-        } else {
-          new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numCoefficientSets + 1)(0),
-            Array.empty[Int], Array.empty[Double], isTransposed = true)
-        }
+        val coefMatrix = new SparseMatrix(numCoefficientSets, numFeatures,
+          new Array[Int](numCoefficientSets + 1), Array.empty[Int], Array.empty[Double],
+          isTransposed = true).compressed
         val interceptVec = if (isMultinomial) {
           Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity)))
         } else {
@@ -617,26 +612,13 @@ class LogisticRegression @Since("1.2.0") (
           denseCoefficientMatrix.update(_ - coefficientMean)
         }
 
-        // TODO: use `denseCoefficientMatrix.compressed` after SPARK-17471
-        val compressedCoefficientMatrix = if (isMultinomial) {
-          denseCoefficientMatrix
-        } else {
-          val compressedVector = Vectors.dense(denseCoefficientMatrix.values).compressed
-          compressedVector match {
-            case dv: DenseVector => denseCoefficientMatrix
-            case sv: SparseVector =>
-              new SparseMatrix(1, numFeatures, Array(0, sv.indices.length), sv.indices, sv.values,
-                isTransposed = true)
-          }
-        }
-
         // center the intercepts when using multinomial algorithm
         if ($(fitIntercept) && isMultinomial) {
           val interceptArray = interceptVec.toArray
           val interceptMean = interceptArray.sum / interceptArray.length
           (0 until interceptVec.size).foreach { i => interceptArray(i) -= interceptMean }
         }
-        (compressedCoefficientMatrix, interceptVec.compressed, arrayBuilder.result())
+        (denseCoefficientMatrix.compressed, interceptVec.compressed, arrayBuilder.result())
       }
     }
 
@@ -713,7 +695,7 @@ class LogisticRegressionModel private[spark] (
   // convert to appropriate vector representation without replicating data
   private lazy val _coefficients: Vector = {
     require(coefficientMatrix.isTransposed,
-      "LogisticRegressionModel coefficients should be row major.")
+      "LogisticRegressionModel coefficients should be row major for binomial model.")
     coefficientMatrix match {
       case dm: DenseMatrix => Vectors.dense(dm.values)
       case sm: SparseMatrix => Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index affaa57374..1b64480373 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.classification.LogisticRegressionSuite._
 import org.apache.spark.ml.feature.{Instance, LabeledPoint}
-import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, SparseVector, Vector, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vector, Vectors}
 import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.ml.util.TestingUtils._
@@ -713,8 +713,6 @@ class LogisticRegressionSuite
 
     assert(model2.intercept ~== interceptR relTol 1E-2)
     assert(model2.coefficients ~== coefficientsR absTol 1E-3)
-    // TODO: move this to a standalone test of compression after SPARK-17471
-    assert(model2.coefficients.isInstanceOf[SparseVector])
   }
 
   test("binary logistic regression without intercept with L1 regularization") {
@@ -2031,29 +2029,61 @@ class LogisticRegressionSuite
     // TODO: check num iters is zero when it become available in the model
   }
 
-  test("compressed storage") {
+  test("compressed storage for constant label") {
+    /*
+      When the label is constant and fit intercept is true, all the coefficients will be
+      zeros, and so the model coefficients should be stored as sparse data structures, except
+      when the matrix dimensions are very small.
+     */
     val moreClassesThanFeatures = Seq(
-      LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)),
-      LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)),
-      LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0))).toDF()
-    val mlr = new LogisticRegression().setFamily("multinomial")
+      LabeledPoint(4.0, Vectors.dense(Array.fill(5)(0.0))),
+      LabeledPoint(4.0, Vectors.dense(Array.fill(5)(1.0))),
+      LabeledPoint(4.0, Vectors.dense(Array.fill(5)(2.0)))).toDF()
+    val mlr = new LogisticRegression().setFamily("multinomial").setFitIntercept(true)
     val model = mlr.fit(moreClassesThanFeatures)
     assert(model.coefficientMatrix.isInstanceOf[SparseMatrix])
-    assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4)
+    assert(model.coefficientMatrix.isColMajor)
+
+    // in this case, it should be stored as row major
     val moreFeaturesThanClasses = Seq(
-      LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)),
-      LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)),
-      LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0))).toDF()
+      LabeledPoint(1.0, Vectors.dense(Array.fill(5)(0.0))),
+      LabeledPoint(1.0, Vectors.dense(Array.fill(5)(1.0))),
+      LabeledPoint(1.0, Vectors.dense(Array.fill(5)(2.0)))).toDF()
     val model2 = mlr.fit(moreFeaturesThanClasses)
     assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
-    assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3)
+    assert(model2.coefficientMatrix.isRowMajor)
 
-    val blr = new LogisticRegression().setFamily("binomial")
+    val blr = new LogisticRegression().setFamily("binomial").setFitIntercept(true)
     val blrModel = blr.fit(moreFeaturesThanClasses)
     assert(blrModel.coefficientMatrix.isInstanceOf[SparseMatrix])
     assert(blrModel.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 2)
   }
 
+  test("compressed coefficients") {
+
+    val trainer1 = new LogisticRegression()
+      .setRegParam(0.1)
+      .setElasticNetParam(1.0)
+
+    // compressed row major is optimal
+    val model1 = trainer1.fit(multinomialDataset.limit(100))
+    assert(model1.coefficientMatrix.isInstanceOf[SparseMatrix])
+    assert(model1.coefficientMatrix.isRowMajor)
+
+    // compressed column major is optimal since there are more classes than features
+    val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata()
+    val model2 = trainer1.fit(multinomialDataset
+      .withColumn("label", col("label").as("label", labelMeta)).limit(100))
+    assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
+    assert(model2.coefficientMatrix.isColMajor)
+
+    // coefficients are dense without L1 regularization
+    val trainer2 = new LogisticRegression()
+      .setElasticNetParam(0.0)
+    val model3 = trainer2.fit(multinomialDataset.limit(100))
+    assert(model3.coefficientMatrix.isInstanceOf[DenseMatrix])
+  }
+
   test("numClasses specified in metadata/inferred") {
     val lr = new LogisticRegression().setMaxIter(1).setFamily("multinomial")
 
-- 
GitLab