From fb81a412eea1e60bd503cb5bb879ae468be24e56 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Wed, 5 Apr 2017 17:46:44 -0700
Subject: [PATCH] [SPARK-20214][ML] Make sure converted csc matrix has sorted
 indices

## What changes were proposed in this pull request?

`_convert_to_vector` converts a scipy sparse matrix to csc matrix for initializing `SparseVector`. However, it doesn't guarantee the converted csc matrix has sorted indices and so a failure happens when you do something like that:

    from scipy.sparse import lil_matrix
    lil = lil_matrix((4, 1))
    lil[1, 0] = 1
    lil[3, 0] = 2
    _convert_to_vector(lil.todok())

    File "/home/jenkins/workspace/python/pyspark/mllib/linalg/__init__.py", line 78, in _convert_to_vector
      return SparseVector(l.shape[0], csc.indices, csc.data)
    File "/home/jenkins/workspace/python/pyspark/mllib/linalg/__init__.py", line 556, in __init__
      % (self.indices[i], self.indices[i + 1]))
    TypeError: Indices 3 and 1 are not strictly increasing

A simple test can confirm that `dok_matrix.tocsc()` won't guarantee sorted indices:

    >>> from scipy.sparse import lil_matrix
    >>> lil = lil_matrix((4, 1))
    >>> lil[1, 0] = 1
    >>> lil[3, 0] = 2
    >>> dok = lil.todok()
    >>> csc = dok.tocsc()
    >>> csc.has_sorted_indices
    0
    >>> csc.indices
    array([3, 1], dtype=int32)

I checked the source codes of scipy. The only way to guarantee it is `csc_matrix.tocsr()` and `csr_matrix.tocsc()`.

## How was this patch tested?

Existing tests.

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #17532 from viirya/make-sure-sorted-indices.

(cherry picked from commit 12206058e8780e202c208b92774df3773eff36ae)
Signed-off-by: Joseph K. Bradley <joseph@databricks.com>
---
 python/pyspark/ml/linalg/__init__.py    |  3 +++
 python/pyspark/mllib/linalg/__init__.py |  3 +++
 python/pyspark/mllib/tests.py           | 11 +++++++++++
 3 files changed, 17 insertions(+)

diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index 1705c156ce..eed9946aba 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -72,7 +72,10 @@ def _convert_to_vector(l):
         return DenseVector(l)
     elif _have_scipy and scipy.sparse.issparse(l):
         assert l.shape[1] == 1, "Expected column vector"
+        # Make sure the converted csc_matrix has sorted indices.
         csc = l.tocsc()
+        if not csc.has_sorted_indices:
+            csc.sort_indices()
         return SparseVector(l.shape[0], csc.indices, csc.data)
     else:
         raise TypeError("Cannot convert type %s into Vector" % type(l))
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 031f22c020..7b24b3c74a 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -74,7 +74,10 @@ def _convert_to_vector(l):
         return DenseVector(l)
     elif _have_scipy and scipy.sparse.issparse(l):
         assert l.shape[1] == 1, "Expected column vector"
+        # Make sure the converted csc_matrix has sorted indices.
         csc = l.tocsc()
+        if not csc.has_sorted_indices:
+            csc.sort_indices()
         return SparseVector(l.shape[0], csc.indices, csc.data)
     else:
         raise TypeError("Cannot convert type %s into Vector" % type(l))
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index c519883cdd..523b3f1113 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -853,6 +853,17 @@ class SciPyTests(MLlibTestCase):
         self.assertEqual(sv, serialize(lil.tocsr()))
         self.assertEqual(sv, serialize(lil.todok()))
 
+    def test_convert_to_vector(self):
+        from scipy.sparse import csc_matrix
+        # Create a CSC matrix with non-sorted indices
+        indptr = array([0, 2])
+        indices = array([3, 1])
+        data = array([2.0, 1.0])
+        csc = csc_matrix((data, indices, indptr))
+        self.assertFalse(csc.has_sorted_indices)
+        sv = SparseVector(4, {1: 1, 3: 2})
+        self.assertEqual(sv, _convert_to_vector(csc))
+
     def test_dot(self):
         from scipy.sparse import lil_matrix
         lil = lil_matrix((4, 1))
-- 
GitLab