diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index e6af0c0ec7ec2eb6b9861d9b8efed73ca8c25a2e..976299124cedd66f520474ab0af39de145e7addb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -68,6 +68,19 @@ class IndexedRowMatrix @Since("1.0.0") ( nRows } + + /** + * Compute all cosine similarities between columns of this matrix using the brute-force + * approach of computing normalized dot products. + * + * @return An n x n sparse upper-triangular matrix of cosine similarities between + * columns of this matrix. + */ + @Since("1.6.0") + def columnSimilarities(): CoordinateMatrix = { + toRowMatrix().columnSimilarities() + } + /** * Drops row indices and converts this matrix to a * [[org.apache.spark.mllib.linalg.distributed.RowMatrix]]. diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala index 0ecb7a221a50374111838eb8d2198328572314ae..6de6cf2fa86341c0ede887576a69583f7874877d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala @@ -153,6 +153,18 @@ class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("similar columns") { + val A = new IndexedRowMatrix(indexedRows) + val gram = A.computeGramianMatrix().toBreeze.toDenseMatrix + + val G = A.columnSimilarities().toBreeze() + + for (i <- 0 until n; j <- i + 1 until n) { + val trueResult = gram(i, j) / scala.math.sqrt(gram(i, i) * gram(j, j)) + assert(math.abs(G(i, j) - trueResult) < 1e-6) + } + } + def closeToZero(G: BDM[Double]): Boolean = { G.valuesIterator.map(math.abs).sum < 1e-6 }