From 32e3cdaa647722671adcb5068bd5ffbf2f157806 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Sun, 21 Jun 2015 12:04:20 -0700
Subject: [PATCH] [SPARK-7604] [MLLIB] Python API for PCA and PCAModel

Python API for PCA and PCAModel

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #6315 from yanboliang/spark-7604 and squashes the following commits:

1d58734 [Yanbo Liang] remove transform() in PCAModel, use default behavior
4d9d121 [Yanbo Liang] Python API for PCA and PCAModel
---
 .../mllib/api/python/PythonMLLibAPI.scala     | 10 ++++++
 python/pyspark/mllib/feature.py               | 35 +++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 2897865af6..634d56d08d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -519,6 +519,16 @@ private[python] class PythonMLLibAPI extends Serializable {
     new ChiSqSelector(numTopFeatures).fit(data.rdd)
   }
 
+  /**
+   * Java stub for PCA.fit(): fits a PCA with `k` components on `data`. This stub
+   * returns a handle to the Java PCAModel instead of the content of the object.
+   * Extra care needs to be taken in the Python code to ensure it gets freed on
+   * exit; see the Py4J documentation.
+   */
+  def fitPCA(k: Int, data: JavaRDD[Vector]): PCAModel = {
+    new PCA(k).fit(data.rdd)
+  }
+
   /**
    * Java stub for IDF.fit(). This stub returns a
    * handle to the Java object instead of the content of the Java object.
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index cf5fdf2cf9..334f5b86cd 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -252,6 +252,41 @@ class ChiSqSelector(object):
         return ChiSqSelectorModel(jmodel)
 
 
+class PCAModel(JavaVectorTransformer):
+    """
+    Model fitted by :class:`PCA` that can project vectors to a low-dimensional space using PCA.
+    """
+
+
+class PCA(object):
+    """
+    A feature transformer that projects vectors to a low-dimensional space using PCA.
+
+    >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
+    ...     Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
+    ...     Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
+    >>> model = PCA(2).fit(sc.parallelize(data))
+    >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
+    >>> pcArray[0]
+    1.648...
+    >>> pcArray[1]
+    -4.013...
+    """
+    def __init__(self, k):
+        """
+        :param k: number of principal components.
+        """
+        self.k = int(k)  # coerced to int; the Scala fitPCA stub declares k: Int
+
+    def fit(self, data):
+        """
+        Computes a :class:`PCAModel` that contains the principal components of the input vectors.
+        :param data: RDD of source vectors
+        """
+        jmodel = callMLlibFunc("fitPCA", self.k, data)
+        return PCAModel(jmodel)
+
+
 class HashingTF(object):
     """
     .. note:: Experimental
-- 
GitLab