Skip to content
Snippets Groups Projects
Commit d75496b1 authored by Xiangrui Meng
Browse files

[SPARK-3701][MLLIB] update python linalg api and small fixes

1. doc updates
2. simple checks on vector dimensions
3. use column major for matrices

davies jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #2548 from mengxr/mllib-py-clean and squashes the following commits:

6dce2df [Xiangrui Meng] address comments
116b5db [Xiangrui Meng] use np.dot instead of array.dot
75f2fcc [Xiangrui Meng] fix python style
fefce00 [Xiangrui Meng] better check of vector size with more tests
067ef71 [Xiangrui Meng] majored -> major
ef853f9 [Xiangrui Meng] update python linalg api and small fixes
parent 6c696d7d
No related branches found
No related tags found
No related merge requests found
......@@ -85,7 +85,7 @@ sealed trait Matrix extends Serializable {
}
/**
* Column-majored dense matrix.
* Column-major dense matrix.
* The entry values are stored in a single array of doubles with columns listed in sequence.
* For example, the following matrix
* {{{
......@@ -128,7 +128,7 @@ class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double])
}
/**
* Column-majored sparse matrix.
* Column-major sparse matrix.
* The entry values are stored in Compressed Sparse Column (CSC) format.
* For example, the following matrix
* {{{
......@@ -207,7 +207,7 @@ class SparseMatrix(
object Matrices {
/**
* Creates a column-majored dense matrix.
* Creates a column-major dense matrix.
*
* @param numRows number of rows
* @param numCols number of columns
......@@ -218,7 +218,7 @@ object Matrices {
}
/**
* Creates a column-majored sparse matrix in Compressed Sparse Column (CSC) format.
* Creates a column-major sparse matrix in Compressed Sparse Column (CSC) format.
*
* @param numRows number of rows
* @param numCols number of columns
......
......@@ -63,6 +63,41 @@ def _convert_to_vector(l):
raise TypeError("Cannot convert type %s into Vector" % type(l))
def _vector_size(v):
    """
    Returns the size of the vector.

    Supported inputs are Vector instances, array.array / list / tuple
    values, NumPy ndarrays that are either 1-dimensional or 2-dimensional
    with a single column, and (when SciPy is available) SciPy sparse
    matrices with exactly one column.

    >>> _vector_size([1., 2., 3.])
    3
    >>> _vector_size((1., 2., 3.))
    3
    >>> _vector_size(array.array('d', [1., 2., 3.]))
    3
    >>> _vector_size(np.zeros(3))
    3
    >>> _vector_size(np.zeros((3, 1)))
    3
    >>> _vector_size(np.zeros((1, 3)))
    Traceback (most recent call last):
        ...
    ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
    """
    # Vectors and the plain Python sequence types all report their size via len().
    if isinstance(v, Vector) or type(v) in (array.array, list, tuple):
        return len(v)

    if type(v) == np.ndarray:
        # Only a flat array or an (n, 1) column counts as a vector; the
        # short-circuit keeps shape[1] from being read on non-2-D arrays.
        single_column = v.ndim == 2 and v.shape[1] == 1
        if v.ndim != 1 and not single_column:
            raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
        return len(v)

    if _have_scipy and scipy.sparse.issparse(v):
        # SciPy sparse inputs are always 2-D, so the size is the row count.
        assert v.shape[1] == 1, "Expected column vector"
        return v.shape[0]

    raise TypeError("Cannot treat type %s as a vector" % type(v))
class Vector(object):
"""
Abstract class for DenseVector and SparseVector
......@@ -76,6 +111,9 @@ class Vector(object):
class DenseVector(Vector):
"""
A dense vector represented by a value array.
"""
def __init__(self, ar):
if not isinstance(ar, array.array):
ar = array.array('d', ar)
......@@ -100,15 +138,31 @@ class DenseVector(Vector):
5.0
>>> dense.dot(np.array(range(1, 3)))
5.0
>>> dense.dot([1.,])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
array([ 5., 11.])
>>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
if isinstance(other, SparseVector):
return other.dot(self)
if type(other) == np.ndarray and other.ndim > 1:
assert len(self) == other.shape[0], "dimension mismatch"
return np.dot(self.toArray(), other)
elif _have_scipy and scipy.sparse.issparse(other):
return other.transpose().dot(self.toArray())[0]
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
assert len(self) == other.shape[0], "dimension mismatch"
return other.transpose().dot(self.toArray())
else:
return np.dot(self.toArray(), other)
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
return other.dot(self)
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
else:
return np.dot(self.toArray(), other)
def squared_distance(self, other):
"""
......@@ -126,7 +180,16 @@ class DenseVector(Vector):
>>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
>>> dense1.squared_distance(sparse1)
2.0
>>> dense1.squared_distance([1.,])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
return other.squared_distance(self)
elif _have_scipy and scipy.sparse.issparse(other):
......@@ -165,12 +228,10 @@ class DenseVector(Vector):
class SparseVector(Vector):
"""
A simple sparse vector class for passing data to MLlib. Users may
alternatively pass SciPy's {scipy.sparse} data types.
"""
def __init__(self, size, *args):
"""
Create a sparse vector, using either a dictionary, a list of
......@@ -222,20 +283,33 @@ class SparseVector(Vector):
0.0
>>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
array([ 22., 22.])
>>> a.dot([1., 2., 3.])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> a.dot(np.array([1., 2.]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> a.dot(DenseVector([1., 2.]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> a.dot(np.zeros((3, 2)))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
if type(other) == np.ndarray:
if other.ndim == 1:
result = 0.0
for i in xrange(len(self.indices)):
result += self.values[i] * other[self.indices[i]]
return result
elif other.ndim == 2:
if other.ndim == 2:
results = [self.dot(other[:, i]) for i in xrange(other.shape[1])]
return np.array(results)
else:
raise Exception("Cannot call dot with %d-dimensional array" % other.ndim)
elif other.ndim > 2:
raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
assert len(self) == _vector_size(other), "dimension mismatch"
elif type(other) in (array.array, DenseVector):
if type(other) in (np.ndarray, array.array, DenseVector):
result = 0.0
for i in xrange(len(self.indices)):
result += self.values[i] * other[self.indices[i]]
......@@ -254,6 +328,7 @@ class SparseVector(Vector):
else:
j += 1
return result
else:
return self.dot(_convert_to_vector(other))
......@@ -273,7 +348,16 @@ class SparseVector(Vector):
30.0
>>> b.squared_distance(a)
30.0
>>> b.squared_distance([1., 2.])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
if type(other) is np.array and other.ndim != 1:
raise Exception("Cannot call squared_distance with %d-dimensional array" %
......@@ -348,7 +432,6 @@ class SparseVector(Vector):
>>> v1 != v2
False
"""
return (isinstance(other, self.__class__)
and other.size == self.size
and other.indices == self.indices
......@@ -414,23 +497,32 @@ class Vectors(object):
class Matrix(object):
""" the Matrix """
def __init__(self, nRow, nCol):
self.nRow = nRow
self.nCol = nCol
"""
Represents a local matrix.
"""
def __init__(self, numRows, numCols):
self.numRows = numRows
self.numCols = numCols
def toArray(self):
"""
Returns its elements in a NumPy ndarray.
"""
raise NotImplementedError
class DenseMatrix(Matrix):
def __init__(self, nRow, nCol, values):
Matrix.__init__(self, nRow, nCol)
assert len(values) == nRow * nCol
"""
Column-major dense matrix.
"""
def __init__(self, numRows, numCols, values):
Matrix.__init__(self, numRows, numCols)
assert len(values) == numRows * numCols
self.values = values
def __reduce__(self):
return DenseMatrix, (self.nRow, self.nCol, self.values)
return DenseMatrix, (self.numRows, self.numCols, self.values)
def toArray(self):
"""
......@@ -439,10 +531,10 @@ class DenseMatrix(Matrix):
>>> arr = array.array('d', [float(i) for i in range(4)])
>>> m = DenseMatrix(2, 2, arr)
>>> m.toArray()
array([[ 0., 1.],
[ 2., 3.]])
array([[ 0., 2.],
[ 1., 3.]])
"""
return np.ndarray((self.nRow, self.nCol), np.float64, buffer=self.values.tostring())
return np.reshape(self.values, (self.numRows, self.numCols), order='F')
def _test():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment