From 5aa804f3c6485670937a658ce8207c2317c6a506 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Sat, 6 Jun 2015 14:52:14 -0700
Subject: [PATCH] [SPARK-7639] [PYSPARK] [MLLIB] Python API for KernelDensity

Python API for KernelDensity

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6387 from MechCoder/spark-7639 and squashes the following commits:

17abc62 [MechCoder] add tests
2de6540 [MechCoder] style tests
bf4acc0 [MechCoder] Added doctests
84359d5 [MechCoder] [SPARK-7639] Python API for KernelDensity
---
 .../mllib/api/python/PythonMLLibAPI.scala     | 12 +++-
 python/pyspark/mllib/stat/KernelDensity.py    | 61 +++++++++++++++++++
 python/pyspark/mllib/stat/__init__.py         |  3 +-
 python/run-tests                              |  1 +
 4 files changed, 75 insertions(+), 2 deletions(-)
 create mode 100644 python/pyspark/mllib/stat/KernelDensity.py

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 16f3131796..8f66bc808a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -43,7 +43,8 @@ import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.stat.correlation.CorrelationNames
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
 import org.apache.spark.mllib.stat.test.ChiSqTestResult
-import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
+import org.apache.spark.mllib.stat.{
+  KernelDensity, MultivariateStatisticalSummary, Statistics}
 import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy, Strategy}
 import org.apache.spark.mllib.tree.impurity._
 import org.apache.spark.mllib.tree.loss.Losses
@@ -945,6 +946,15 @@ private[python] class PythonMLLibAPI extends Serializable {
       r => (r.getSeq(0).toArray[Any], r.getSeq(1).toArray[Any])))
   }
 
+  /**
+   * Java stub for the estimate method of KernelDensity
+   */
+  def estimateKernelDensity(
+      sample: JavaRDD[Double],
+      bandwidth: Double, points: java.util.ArrayList[Double]): Array[Double] = {
+    return new KernelDensity().setSample(sample).setBandwidth(bandwidth).estimate(
+      points.asScala.toArray)
+  }
 
 }
 
diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py
new file mode 100644
index 0000000000..7da921976d
--- /dev/null
+++ b/python/pyspark/mllib/stat/KernelDensity.py
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import sys
+
+if sys.version > '3':
+    xrange = range
+
+import numpy as np
+
+from pyspark.mllib.common import callMLlibFunc
+from pyspark.rdd import RDD
+
+
+class KernelDensity(object):
+    """
+    .. note:: Experimental
+
+    Estimate probability density at required points given a RDD of samples
+    from the population.
+
+    >>> kd = KernelDensity()
+    >>> sample = sc.parallelize([0.0, 1.0])
+    >>> kd.setSample(sample)
+    >>> kd.estimate([0.0, 1.0])
+    array([ 0.12938758,  0.12938758])
+    """
+    def __init__(self):
+        self._bandwidth = 1.0
+        self._sample = None
+
+    def setBandwidth(self, bandwidth):
+        """Set bandwidth of each sample. Defaults to 1.0"""
+        self._bandwidth = bandwidth
+
+    def setSample(self, sample):
+        """Set sample points from the population. Should be a RDD"""
+        if not isinstance(sample, RDD):
+            raise TypeError("samples should be a RDD, received %s" % type(sample))
+        self._sample = sample
+
+    def estimate(self, points):
+        """Estimate the probability density at points"""
+        points = list(points)
+        densities = callMLlibFunc(
+            "estimateKernelDensity", self._sample, self._bandwidth, points)
+        return np.asarray(densities)
diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py
index e3e128513e..c8a721d3fe 100644
--- a/python/pyspark/mllib/stat/__init__.py
+++ b/python/pyspark/mllib/stat/__init__.py
@@ -22,6 +22,7 @@ Python package for statistical functions in MLlib.
 from pyspark.mllib.stat._statistics import *
 from pyspark.mllib.stat.distribution import MultivariateGaussian
 from pyspark.mllib.stat.test import ChiSqTestResult
+from pyspark.mllib.stat.KernelDensity import KernelDensity
 
 __all__ = ["Statistics", "MultivariateStatisticalSummary", "ChiSqTestResult",
-           "MultivariateGaussian"]
+           "MultivariateGaussian", "KernelDensity"]
diff --git a/python/run-tests b/python/run-tests
index 17dda3eada..4468fdb3f2 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -93,6 +93,7 @@ function run_mllib_tests() {
     run_test "pyspark.mllib.recommendation"
     run_test "pyspark.mllib.regression"
     run_test "pyspark.mllib.stat._statistics"
+    run_test "pyspark.mllib.stat.KernelDensity"
     run_test "pyspark.mllib.tree"
     run_test "pyspark.mllib.util"
     run_test "pyspark.mllib.tests"
-- 
GitLab