Skip to content
Snippets Groups Projects
Commit 8e935b0a authored by Burak Yavuz's avatar Burak Yavuz Committed by Xiangrui Meng
Browse files

[SPARK-7487] [ML] Feature Parity in PySpark for ml.regression

Added LinearRegression Python API

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #6016 from brkyvz/ml-reg and squashes the following commits:

11c9ef9 [Burak Yavuz] address comments
1027a40 [Burak Yavuz] fix typo
4c699ad [Burak Yavuz] added tree regressor api
8afead2 [Burak Yavuz] made mixin for DT
fa51c74 [Burak Yavuz] save additions
0640d48 [Burak Yavuz] added ml.regression
82aac48 [Burak Yavuz] added linear regression
parent b9b01f44
No related branches found
No related tags found
No related merge requests found
......@@ -25,6 +25,22 @@ pyspark.ml.classification module
:undoc-members:
:inherited-members:
pyspark.ml.recommendation module
--------------------------------
.. automodule:: pyspark.ml.recommendation
:members:
:undoc-members:
:inherited-members:
pyspark.ml.regression module
----------------------------
.. automodule:: pyspark.ml.regression
:members:
:undoc-members:
:inherited-members:
pyspark.ml.tuning module
--------------------------------
......
......@@ -91,7 +91,7 @@ if __name__ == "__main__":
globs = globals().copy()
# The small batch size here ensures that we see multiple batches,
# even in these small test examples:
sc = SparkContext("local[2]", "ml.feature tests")
sc = SparkContext("local[2]", "ml.classification tests")
sqlContext = SQLContext(sc)
globs['sc'] = sc
globs['sqlContext'] = sqlContext
......
......@@ -38,16 +38,13 @@ header = """#
# python _shared_params_code_gen.py > shared.py
def _gen_param_code(name, doc, defaultValueStr):
def _gen_param_header(name, doc, defaultValueStr):
"""
Generates Python code for a shared param class.
Generates the header part for shared variables
:param name: param name
:param doc: param doc
:param defaultValueStr: string representation of the default value
:return: code string
"""
# TODO: How to correctly inherit instance attributes?
template = '''class Has$Name(Params):
"""
Mixin for param $name: $doc.
......@@ -61,8 +58,27 @@ def _gen_param_code(name, doc, defaultValueStr):
#: param for $doc
self.$name = Param(self, "$name", "$doc")
if $defaultValueStr is not None:
self._setDefault($name=$defaultValueStr)
self._setDefault($name=$defaultValueStr)'''
Name = name[0].upper() + name[1:]
return template \
.replace("$name", name) \
.replace("$Name", Name) \
.replace("$doc", doc) \
.replace("$defaultValueStr", str(defaultValueStr))
def _gen_param_code(name, doc, defaultValueStr):
"""
Generates Python code for a shared param class.
:param name: param name
:param doc: param doc
:param defaultValueStr: string representation of the default value
:return: code string
"""
# TODO: How to correctly inherit instance attributes?
template = '''
def set$Name(self, value):
"""
Sets the value of :py:attr:`$name`.
......@@ -104,5 +120,44 @@ if __name__ == "__main__":
("stepSize", "Step size to be used for each iteration of optimization.", None)]
code = []
for name, doc, defaultValueStr in shared:
code.append(_gen_param_code(name, doc, defaultValueStr))
param_code = _gen_param_header(name, doc, defaultValueStr)
code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr))
decisionTreeParams = [
("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " +
"depth 1 means 1 internal node + 2 leaf nodes."),
("maxBins", "Max number of bins for" +
" discretizing continuous features. Must be >=2 and >= number of categories for any" +
" categorical feature."),
("minInstancesPerNode", "Minimum number of instances each child must have after split. " +
"If a split causes the left or right child to have fewer than minInstancesPerNode, the " +
"split will be discarded as invalid. Should be >= 1."),
("minInfoGain", "Minimum information gain for a split to be considered at a tree node."),
("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."),
("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " +
"instances with nodes. If true, the algorithm will cache node IDs for each instance. " +
"Caching can speed up training of deeper trees.")]
decisionTreeCode = '''class DecisionTreeParams(Params):
"""
Mixin for Decision Tree parameters.
"""
# a placeholder to make it appear in the generated doc
$dummyPlaceHolders
def __init__(self):
super(DecisionTreeParams, self).__init__()
$realParams'''
dtParamMethods = ""
dummyPlaceholders = ""
realParams = ""
paramTemplate = """$name = Param($owner, "$name", "$doc")"""
for name, doc in decisionTreeParams:
variable = paramTemplate.replace("$name", name).replace("$doc", doc)
dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n "
realParams += "self." + variable.replace("$owner", "self") + "\n "
dtParamMethods += _gen_param_code(name, doc, None) + "\n"
code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders)
.replace("$realParams", realParams) + dtParamMethods)
print("\n\n\n".join(code))
......@@ -424,3 +424,111 @@ class HasStepSize(Params):
Gets the value of stepSize or its default value.
"""
return self.getOrDefault(self.stepSize)
class DecisionTreeParams(Params):
    """
    Mixin for Decision Tree parameters.

    NOTE(review): this class appears to be generated by
    ``_shared_params_code_gen.py`` (see the ``decisionTreeCode`` template
    there) — change the generator rather than editing this by hand.
    """

    # a placeholder to make it appear in the generated doc
    # Class-level Params use Params._dummy() as owner so Sphinx picks them up;
    # they are shadowed per-instance in __init__ below.
    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")

    def __init__(self):
        super(DecisionTreeParams, self).__init__()
        # Rebind each shared param to this instance as owner, shadowing the
        # class-level placeholders above.
        #: param for Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
        self.maxDepth = Param(self, "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
        #: param for Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.
        self.maxBins = Param(self, "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
        #: param for Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.
        self.minInstancesPerNode = Param(self, "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
        #: param for Minimum information gain for a split to be considered at a tree node.
        self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
        #: param for Maximum memory in MB allocated to histogram aggregation.
        self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
        #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
        self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")

    def setMaxDepth(self, value):
        """
        Sets the value of :py:attr:`maxDepth`.
        """
        # Setters return self to allow call chaining.
        self.paramMap[self.maxDepth] = value
        return self

    def getMaxDepth(self):
        """
        Gets the value of maxDepth or its default value.
        """
        return self.getOrDefault(self.maxDepth)

    def setMaxBins(self, value):
        """
        Sets the value of :py:attr:`maxBins`.
        """
        self.paramMap[self.maxBins] = value
        return self

    def getMaxBins(self):
        """
        Gets the value of maxBins or its default value.
        """
        return self.getOrDefault(self.maxBins)

    def setMinInstancesPerNode(self, value):
        """
        Sets the value of :py:attr:`minInstancesPerNode`.
        """
        self.paramMap[self.minInstancesPerNode] = value
        return self

    def getMinInstancesPerNode(self):
        """
        Gets the value of minInstancesPerNode or its default value.
        """
        return self.getOrDefault(self.minInstancesPerNode)

    def setMinInfoGain(self, value):
        """
        Sets the value of :py:attr:`minInfoGain`.
        """
        self.paramMap[self.minInfoGain] = value
        return self

    def getMinInfoGain(self):
        """
        Gets the value of minInfoGain or its default value.
        """
        return self.getOrDefault(self.minInfoGain)

    def setMaxMemoryInMB(self, value):
        """
        Sets the value of :py:attr:`maxMemoryInMB`.
        """
        self.paramMap[self.maxMemoryInMB] = value
        return self

    def getMaxMemoryInMB(self):
        """
        Gets the value of maxMemoryInMB or its default value.
        """
        return self.getOrDefault(self.maxMemoryInMB)

    def setCacheNodeIds(self, value):
        """
        Sets the value of :py:attr:`cacheNodeIds`.
        """
        self.paramMap[self.cacheNodeIds] = value
        return self

    def getCacheNodeIds(self):
        """
        Gets the value of cacheNodeIds or its default value.
        """
        return self.getOrDefault(self.cacheNodeIds)
This diff is collapsed.
......@@ -98,6 +98,8 @@ function run_ml_tests() {
echo "Run ml tests ..."
run_test "pyspark/ml/feature.py"
run_test "pyspark/ml/classification.py"
run_test "pyspark/ml/recommendation.py"
run_test "pyspark/ml/regression.py"
run_test "pyspark/ml/tuning.py"
run_test "pyspark/ml/tests.py"
run_test "pyspark/ml/evaluation.py"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment