Commit 5ef006fc authored by Xiangrui Meng

[SPARK-6756] [MLLIB] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector

Add `compressed` to `Vector`, along with a few supporting methods: `numActives`, `numNonzeros`, `toSparse`, and `toDense`. jkbradley
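A quick sketch of the new API in action (method names as added in this commit; `Vectors.dense` and `Vectors.sparse` are the existing MLlib factory methods):

import org.apache.spark.mllib.linalg.Vectors

val dv = Vectors.dense(0.0, 2.0, 3.0, 0.0)
dv.numActives        // 4: a dense vector stores every slot explicitly
dv.numNonzeros       // 2: only the nonzero values are counted
val sv = dv.toSparse // SparseVector(4, [1,2], [2.0,3.0]), explicit zeros dropped
sv.toDense           // back to DenseVector(0.0, 2.0, 3.0, 0.0)
dv.compressed        // dense or sparse, whichever needs less storage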

Author: Xiangrui Meng <meng@databricks.com>

Closes #5756 from mengxr/SPARK-6756 and squashes the following commits:

8d4ecbd [Xiangrui Meng] address comment and add mima excludes
da54179 [Xiangrui Meng] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
parent a8aeadb7
mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -116,6 +116,40 @@ sealed trait Vector extends Serializable {
   * with type `Double`.
   */
  private[spark] def foreachActive(f: (Int, Double) => Unit)

  /**
   * Number of active entries. An "active entry" is an element which is explicitly stored,
   * regardless of its value. Note that inactive entries have value 0.
   */
  def numActives: Int

  /**
   * Number of nonzero elements. This scans all active values and counts nonzeros.
   */
  def numNonzeros: Int

  /**
   * Converts this vector to a sparse vector with all explicit zeros removed.
   */
  def toSparse: SparseVector

  /**
   * Converts this vector to a dense vector.
   */
  def toDense: DenseVector = new DenseVector(this.toArray)

  /**
   * Returns a vector in either dense or sparse format, whichever uses less storage.
   */
  def compressed: Vector = {
    val nnz = numNonzeros
    // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
    if (1.5 * (nnz + 1.0) < size) {
      toSparse
    } else {
      toDense
    }
  }
}
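The `compressed` heuristic is just the storage comparison from the comment, rearranged: sparse wins when 12 * nnz + 20 < 8 * size + 8, which simplifies to 1.5 * (nnz + 1) < size. A minimal sketch of that arithmetic (the byte counts follow the comment's cost model, not measured JVM object sizes):

// dense: 8 bytes per double plus array overhead
def denseBytes(size: Int): Long = 8L * size + 8L
// sparse: a 4-byte index and an 8-byte value per active entry, plus overhead
def sparseBytes(nnz: Int): Long = 12L * nnz + 20L

// 12 * nnz + 20 < 8 * size + 8  <=>  1.5 * (nnz + 1) < size
def sparseWins(nnz: Int, size: Int): Boolean = 1.5 * (nnz + 1.0) < size

sparseWins(1, 4) // true:  1.5 * 2 = 3.0 < 4, a 4-element vector with 1 nonzero goes sparse
sparseWins(2, 4) // false: 1.5 * 3 = 4.5 >= 4, so it stays dense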
@@ -525,6 +559,34 @@ class DenseVector(val values: Array[Double]) extends Vector {
    }
    result
  }

  override def numActives: Int = size

  override def numNonzeros: Int = {
    // same as values.count(_ != 0.0) but faster
    var nnz = 0
    values.foreach { v =>
      if (v != 0.0) {
        nnz += 1
      }
    }
    nnz
  }

  override def toSparse: SparseVector = {
    val nnz = numNonzeros
    val ii = new Array[Int](nnz)
    val vv = new Array[Double](nnz)
    var k = 0
    foreachActive { (i, v) =>
      if (v != 0.0) {
        ii(k) = i
        vv(k) = v
        k += 1
      }
    }
    new SparseVector(size, ii, vv)
  }
}
object DenseVector {
@@ -602,6 +664,37 @@ class SparseVector(
    }
    result
  }

  override def numActives: Int = values.length

  override def numNonzeros: Int = {
    var nnz = 0
    values.foreach { v =>
      if (v != 0.0) {
        nnz += 1
      }
    }
    nnz
  }

  override def toSparse: SparseVector = {
    val nnz = numNonzeros
    if (nnz == numActives) {
      this
    } else {
      val ii = new Array[Int](nnz)
      val vv = new Array[Double](nnz)
      var k = 0
      foreachActive { (i, v) =>
        if (v != 0.0) {
          ii(k) = i
          vv(k) = v
          k += 1
        }
      }
      new SparseVector(size, ii, vv)
    }
  }
}
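Worth noting in the sparse override above: when the vector has no explicit zeros (nnz == numActives), toSparse returns this instead of allocating a copy, so the call is free for already-compact vectors. A small sketch of the observable difference (eq is reference equality):

val compact = Vectors.sparse(4, Array(0, 2), Array(1.0, 3.0))
compact.toSparse eq compact // true: no explicit zeros, the same instance comes back

val padded = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 0.0, 3.0))
padded.toSparse eq padded   // false: the explicit zero at index 1 forces a copy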
object SparseVector {
mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -270,4 +270,48 @@ class VectorsSuite extends FunSuite {
    assert(Vectors.norm(sv, 3.7) ~== math.pow(sv.toArray.foldLeft(0.0)((a, v) =>
      a + math.pow(math.abs(v), 3.7)), 1.0 / 3.7) relTol 1E-8)
  }
test("Vector numActive and numNonzeros") {
val dv = Vectors.dense(0.0, 2.0, 3.0, 0.0)
assert(dv.numActives === 4)
assert(dv.numNonzeros === 2)
val sv = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
assert(sv.numActives === 3)
assert(sv.numNonzeros === 2)
}
test("Vector toSparse and toDense") {
val dv0 = Vectors.dense(0.0, 2.0, 3.0, 0.0)
assert(dv0.toDense === dv0)
val dv0s = dv0.toSparse
assert(dv0s.numActives === 2)
assert(dv0s === dv0)
val sv0 = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
assert(sv0.toDense === sv0)
val sv0s = sv0.toSparse
assert(sv0s.numActives === 2)
assert(sv0s === sv0)
}
test("Vector.compressed") {
val dv0 = Vectors.dense(1.0, 2.0, 3.0, 0.0)
val dv0c = dv0.compressed.asInstanceOf[DenseVector]
assert(dv0c === dv0)
val dv1 = Vectors.dense(0.0, 2.0, 0.0, 0.0)
val dv1c = dv1.compressed.asInstanceOf[SparseVector]
assert(dv1 === dv1c)
assert(dv1c.numActives === 1)
val sv0 = Vectors.sparse(4, Array(1, 2), Array(2.0, 0.0))
val sv0c = sv0.compressed.asInstanceOf[SparseVector]
assert(sv0 === sv0c)
assert(sv0c.numActives === 1)
val sv1 = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 2.0, 3.0))
val sv1c = sv1.compressed.asInstanceOf[DenseVector]
assert(sv1 === sv1c)
}
}
project/MimaExcludes.scala
@@ -76,6 +76,18 @@ object MimaExcludes {
          // SPARK-7090 Introduce LDAOptimizer to LDA to further improve extensibility
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.mllib.clustering.LDA$EMOptimizer")
        ) ++ Seq(
          // SPARK-6756 add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.compressed"),
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.toDense"),
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.numNonzeros"),
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.toSparse"),
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.numActives")
        )
case v if v.startsWith("1.3") =>
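For context on the excludes above: the new methods were added to the sealed trait `Vector`, so MiMa flags them as `MissingMethodProblem` against the previous release. Because `Vector` is sealed, no third-party subclasses can exist, which makes suppressing these warnings safe.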