diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala index 3fbc0958a0f11ae02df764763b68538e32c6efb1..941b6eca568d321fe435925ec2a28fa60de2dcca 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala @@ -136,7 +136,21 @@ sealed trait Vector extends Serializable { * Converts this vector to a sparse vector with all explicit zeros removed. */ @Since("2.0.0") - def toSparse: SparseVector + def toSparse: SparseVector = toSparseWithSize(numNonzeros) + + /** + * Converts this vector to a sparse vector with all explicit zeros removed when the size is known. + * This method is used to avoid re-computing the number of non-zero elements when it is + * already known. This method should only be called after computing the number of non-zero + * elements via [[numNonzeros]]. e.g. + * {{{ + * val nnz = numNonzeros + * val sv = toSparseWithSize(nnz) + * }}} + * + * If `nnz` is under-specified, a [[java.lang.ArrayIndexOutOfBoundsException]] is thrown. + */ + private[linalg] def toSparseWithSize(nnz: Int): SparseVector /** * Converts this vector to a dense vector. @@ -152,7 +166,7 @@ sealed trait Vector extends Serializable { val nnz = numNonzeros // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes. 
if (1.5 * (nnz + 1.0) < size) { - toSparse + toSparseWithSize(nnz) } else { toDense } @@ -495,8 +509,7 @@ class DenseVector @Since("2.0.0") ( @Since("2.0.0") val values: Array[Double]) e nnz } - override def toSparse: SparseVector = { - val nnz = numNonzeros + private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = { val ii = new Array[Int](nnz) val vv = new Array[Double](nnz) var k = 0 @@ -635,8 +648,7 @@ class SparseVector @Since("2.0.0") ( nnz } - override def toSparse: SparseVector = { - val nnz = numNonzeros + private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = { if (nnz == numActives) { this } else { diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala index 4cd91afd6d7fc141aa8d494a19eec80c4b7cc1c6..79acef8214d88bcecf6b42a09369b5e55b3a474f 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala @@ -318,11 +318,21 @@ class VectorsSuite extends SparkMLFunSuite { assert(dv0s.numActives === 2) assert(dv0s === dv0) + assert(dv0.toSparseWithSize(dv0.numNonzeros) === dv0) + val dv0s2 = dv0.toSparseWithSize(dv0.numNonzeros) + assert(dv0s2.numActives === 2) + assert(dv0s2 === dv0s) + val sv0 = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0)) assert(sv0.toDense === sv0) val sv0s = sv0.toSparse assert(sv0s.numActives === 2) assert(sv0s === sv0) + + assert(sv0.toSparseWithSize(sv0.numNonzeros) === sv0) + val sv0s2 = sv0.toSparseWithSize(sv0.numNonzeros) + assert(sv0s2.numActives === 2) + assert(sv0s2 === sv0s) } test("Vector.compressed") { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index f063420bec1437088ee8019b38a32943027b890e..fd9605c013625329123c3f0d04d888ba2694c806 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -149,7 +149,21 @@ sealed trait Vector extends Serializable { * Converts this vector to a sparse vector with all explicit zeros removed. */ @Since("1.4.0") - def toSparse: SparseVector + def toSparse: SparseVector = toSparseWithSize(numNonzeros) + + /** + * Converts this vector to a sparse vector with all explicit zeros removed when the size is known. + * This method is used to avoid re-computing the number of non-zero elements when it is + * already known. This method should only be called after computing the number of non-zero + * elements via [[numNonzeros]]. e.g. + * {{{ + * val nnz = numNonzeros + * val sv = toSparseWithSize(nnz) + * }}} + * + * If `nnz` is under-specified, a [[java.lang.ArrayIndexOutOfBoundsException]] is thrown. + */ + private[linalg] def toSparseWithSize(nnz: Int): SparseVector /** * Converts this vector to a dense vector. @@ -165,7 +179,7 @@ sealed trait Vector extends Serializable { val nnz = numNonzeros // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes. 
if (1.5 * (nnz + 1.0) < size) { - toSparse + toSparseWithSize(nnz) } else { toDense } @@ -669,9 +683,7 @@ class DenseVector @Since("1.0.0") ( nnz } - @Since("1.4.0") - override def toSparse: SparseVector = { - val nnz = numNonzeros + private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = { val ii = new Array[Int](nnz) val vv = new Array[Double](nnz) var k = 0 @@ -822,9 +834,7 @@ class SparseVector @Since("1.0.0") ( nnz } - @Since("1.4.0") - override def toSparse: SparseVector = { - val nnz = numNonzeros + private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = { if (nnz == numActives) { this } else { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 6172cffee861c786ba0e64c41b8566c5a3f3f65b..a1e3ee54b49ff22f0de519cc5724f8e287063c46 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -351,11 +351,21 @@ class VectorsSuite extends SparkFunSuite with Logging { assert(dv0s.numActives === 2) assert(dv0s === dv0) + assert(dv0.toSparseWithSize(dv0.numNonzeros) === dv0) + val dv0s2 = dv0.toSparseWithSize(dv0.numNonzeros) + assert(dv0s2.numActives === 2) + assert(dv0s2 === dv0s) + val sv0 = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0)) assert(sv0.toDense === sv0) val sv0s = sv0.toSparse assert(sv0s.numActives === 2) assert(sv0s === sv0) + + assert(sv0.toSparseWithSize(sv0.numNonzeros) === sv0) + val sv0s2 = sv0.toSparseWithSize(sv0.numNonzeros) + assert(sv0s2.numActives === 2) + assert(sv0s2 === sv0s) } test("Vector.compressed") { diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 7ba85bda0cd7a0bdac4306f3a2ceb24239bfebfe..9bda917377c2e7ac5f614e469b163301ffa85fe8 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -1015,6 +1015,10 @@ object MimaExcludes { 
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.setFeatureSubsetStrategy"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy") + ) ++ Seq( + // [SPARK-21680][ML][MLLIB] optimize Vector compress + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.toSparseWithSize"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Vector.toSparseWithSize") ) }