From 6c6f32574023b8e43a24f2081ff17e6e446de2f3 Mon Sep 17 00:00:00 2001
From: freeman <the.freeman.lab@gmail.com>
Date: Mon, 5 Jan 2015 13:10:59 -0800
Subject: [PATCH] [SPARK-5089][PYSPARK][MLLIB] Fix vector convert

This is a small change addressing a potentially significant bug in how PySpark + MLlib handles non-float64 numpy arrays. The automatic conversion to `DenseVector` that occurs when passing RDDs to MLlib algorithms in PySpark should automatically upcast to float64s, but currently this wasn't actually happening. As a result, non-float64 would be silently parsed inappropriately during SerDe, yielding erroneous results when running, for example, KMeans.

The PR includes the fix, as well as a new test for the correct conversion behavior.

davies

Author: freeman <the.freeman.lab@gmail.com>

Closes #3902 from freeman-lab/fix-vector-convert and squashes the following commits:

764db47 [freeman] Add a test for proper conversion behavior
704f97e [freeman] Return array after changing type
---
 python/pyspark/mllib/linalg.py |  2 +-
 python/pyspark/mllib/tests.py  | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index f7aa2b0cb0..4f8491f43e 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -178,7 +178,7 @@ class DenseVector(Vector):
         elif not isinstance(ar, np.ndarray):
             ar = np.array(ar, dtype=np.float64)
         if ar.dtype != np.float64:
-            ar.astype(np.float64)
+            ar = ar.astype(np.float64)
         self.array = ar
 
     def __reduce__(self):
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 5034f229e8..1f48bc1219 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -110,6 +110,16 @@ class VectorTests(PySparkTestCase):
         self.assertEquals(0.0, _squared_distance(dv, dv))
         self.assertEquals(0.0, _squared_distance(lst, lst))
 
+    def test_conversion(self):
+        # numpy arrays should be automatically upcast to float64
+        # tests for fix of [SPARK-5089]
+        v = array([1, 2, 3, 4], dtype='float64')
+        dv = DenseVector(v)
+        self.assertTrue(dv.array.dtype == 'float64')
+        v = array([1, 2, 3, 4], dtype='float32')
+        dv = DenseVector(v)
+        self.assertTrue(dv.array.dtype == 'float64')
+
 
 class ListTests(PySparkTestCase):
 
-- 
GitLab