From 6ed7e2cd01955adfbb3960e2986b6d19eaee8717 Mon Sep 17 00:00:00 2001
From: Evan Sparks <evan.sparks@gmail.com>
Date: Thu, 8 May 2014 00:24:36 -0400
Subject: [PATCH] Use numpy directly for matrix multiply.

Using matrix multiply to compute XtX and XtY yields a 5-20x speedup depending on problem size.

For example - the following takes 19s locally after this change vs. 5m21s before the change. (16x speedup).
bin/pyspark examples/src/main/python/als.py local[8] 1000 1000 50 10 10

Author: Evan Sparks <evan.sparks@gmail.com>

Closes #687 from etrain/patch-1 and squashes the following commits:

e094dbc [Evan Sparks] Touching only diaganols on update.
d1ab9b6 [Evan Sparks] Use numpy directly for matrix multiply.
---
 examples/src/main/python/als.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py
index a77dfb2577..33700ab4f8 100755
--- a/examples/src/main/python/als.py
+++ b/examples/src/main/python/als.py
@@ -36,14 +36,13 @@ def rmse(R, ms, us):
 def update(i, vec, mat, ratings):
     uu = mat.shape[0]
     ff = mat.shape[1]
-    XtX = matrix(np.zeros((ff, ff)))
-    Xty = np.zeros((ff, 1))
-
-    for j in range(uu):
-        v = mat[j, :]
-        XtX += v.T * v
-        Xty += v.T * ratings[i, j]
-    XtX += np.eye(ff, ff) * LAMBDA * uu
+    
+    XtX = mat.T * mat
+    XtY = mat.T * ratings[i, :].T
+    
+    for j in range(ff):
+        XtX[j,j] += LAMBDA * uu
+    
     return np.linalg.solve(XtX, Xty)
 
 if __name__ == "__main__":
-- 
GitLab