Skip to content
Snippets Groups Projects
Commit 8e935b0a authored by Burak Yavuz's avatar Burak Yavuz Committed by Xiangrui Meng
Browse files

[SPARK-7487] [ML] Feature Parity in PySpark for ml.regression

Added LinearRegression Python API

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #6016 from brkyvz/ml-reg and squashes the following commits:

11c9ef9 [Burak Yavuz] address comments
1027a40 [Burak Yavuz] fix typo
4c699ad [Burak Yavuz] added tree regressor api
8afead2 [Burak Yavuz] made mixin for DT
fa51c74 [Burak Yavuz] save additions
0640d48 [Burak Yavuz] added ml.regression
82aac48 [Burak Yavuz] added linear regression
parent b9b01f44
No related branches found
No related tags found
No related merge requests found
......@@ -25,6 +25,22 @@ pyspark.ml.classification module
:undoc-members:
:inherited-members:
pyspark.ml.recommendation module
--------------------------------
.. automodule:: pyspark.ml.recommendation
:members:
:undoc-members:
:inherited-members:
pyspark.ml.regression module
----------------------------
.. automodule:: pyspark.ml.regression
:members:
:undoc-members:
:inherited-members:
pyspark.ml.tuning module
--------------------------------
......
......@@ -91,7 +91,7 @@ if __name__ == "__main__":
globs = globals().copy()
# The small batch size here ensures that we see multiple batches,
# even in these small test examples:
sc = SparkContext("local[2]", "ml.feature tests")
sc = SparkContext("local[2]", "ml.classification tests")
sqlContext = SQLContext(sc)
globs['sc'] = sc
globs['sqlContext'] = sqlContext
......
......@@ -38,16 +38,13 @@ header = """#
# python _shared_params_code_gen.py > shared.py
def _gen_param_code(name, doc, defaultValueStr):
def _gen_param_header(name, doc, defaultValueStr):
"""
Generates Python code for a shared param class.
Generates the header part for shared variables
:param name: param name
:param doc: param doc
:param defaultValueStr: string representation of the default value
:return: code string
"""
# TODO: How to correctly inherit instance attributes?
template = '''class Has$Name(Params):
"""
Mixin for param $name: $doc.
......@@ -61,8 +58,27 @@ def _gen_param_code(name, doc, defaultValueStr):
#: param for $doc
self.$name = Param(self, "$name", "$doc")
if $defaultValueStr is not None:
self._setDefault($name=$defaultValueStr)
self._setDefault($name=$defaultValueStr)'''
Name = name[0].upper() + name[1:]
return template \
.replace("$name", name) \
.replace("$Name", Name) \
.replace("$doc", doc) \
.replace("$defaultValueStr", str(defaultValueStr))
def _gen_param_code(name, doc, defaultValueStr):
"""
Generates Python code for a shared param class.
:param name: param name
:param doc: param doc
:param defaultValueStr: string representation of the default value
:return: code string
"""
# TODO: How to correctly inherit instance attributes?
template = '''
def set$Name(self, value):
"""
Sets the value of :py:attr:`$name`.
......@@ -104,5 +120,44 @@ if __name__ == "__main__":
("stepSize", "Step size to be used for each iteration of optimization.", None)]
code = []
for name, doc, defaultValueStr in shared:
code.append(_gen_param_code(name, doc, defaultValueStr))
param_code = _gen_param_header(name, doc, defaultValueStr)
code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr))
decisionTreeParams = [
("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " +
"depth 1 means 1 internal node + 2 leaf nodes."),
("maxBins", "Max number of bins for" +
" discretizing continuous features. Must be >=2 and >= number of categories for any" +
" categorical feature."),
("minInstancesPerNode", "Minimum number of instances each child must have after split. " +
"If a split causes the left or right child to have fewer than minInstancesPerNode, the " +
"split will be discarded as invalid. Should be >= 1."),
("minInfoGain", "Minimum information gain for a split to be considered at a tree node."),
("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."),
("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " +
"instances with nodes. If true, the algorithm will cache node IDs for each instance. " +
"Caching can speed up training of deeper trees.")]
decisionTreeCode = '''class DecisionTreeParams(Params):
"""
Mixin for Decision Tree parameters.
"""
# a placeholder to make it appear in the generated doc
$dummyPlaceHolders
def __init__(self):
super(DecisionTreeParams, self).__init__()
$realParams'''
dtParamMethods = ""
dummyPlaceholders = ""
realParams = ""
paramTemplate = """$name = Param($owner, "$name", "$doc")"""
for name, doc in decisionTreeParams:
variable = paramTemplate.replace("$name", name).replace("$doc", doc)
dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n "
realParams += "self." + variable.replace("$owner", "self") + "\n "
dtParamMethods += _gen_param_code(name, doc, None) + "\n"
code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders)
.replace("$realParams", realParams) + dtParamMethods)
print("\n\n\n".join(code))
......@@ -424,3 +424,111 @@ class HasStepSize(Params):
Gets the value of stepSize or its default value.
"""
return self.getOrDefault(self.stepSize)
class DecisionTreeParams(Params):
    """
    Mixin for Decision Tree parameters.

    NOTE(review): this class appears to be generated by
    ``_shared_params_code_gen.py`` (see the ``decisionTreeCode`` template
    there) — change the generator rather than editing this by hand.
    """

    # a placeholder to make it appear in the generated doc
    # Class-level Params use Params._dummy() as owner so Sphinx picks them up;
    # they are shadowed per-instance in __init__ below.
    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")

    def __init__(self):
        super(DecisionTreeParams, self).__init__()
        # Rebind each shared param to this instance as owner, shadowing the
        # class-level placeholders above.
        #: param for Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
        self.maxDepth = Param(self, "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
        #: param for Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.
        self.maxBins = Param(self, "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.")
        #: param for Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.
        self.minInstancesPerNode = Param(self, "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
        #: param for Minimum information gain for a split to be considered at a tree node.
        self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
        #: param for Maximum memory in MB allocated to histogram aggregation.
        self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
        #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
        self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")

    def setMaxDepth(self, value):
        """
        Sets the value of :py:attr:`maxDepth`.
        """
        # Setters return self to allow call chaining.
        self.paramMap[self.maxDepth] = value
        return self

    def getMaxDepth(self):
        """
        Gets the value of maxDepth or its default value.
        """
        return self.getOrDefault(self.maxDepth)

    def setMaxBins(self, value):
        """
        Sets the value of :py:attr:`maxBins`.
        """
        self.paramMap[self.maxBins] = value
        return self

    def getMaxBins(self):
        """
        Gets the value of maxBins or its default value.
        """
        return self.getOrDefault(self.maxBins)

    def setMinInstancesPerNode(self, value):
        """
        Sets the value of :py:attr:`minInstancesPerNode`.
        """
        self.paramMap[self.minInstancesPerNode] = value
        return self

    def getMinInstancesPerNode(self):
        """
        Gets the value of minInstancesPerNode or its default value.
        """
        return self.getOrDefault(self.minInstancesPerNode)

    def setMinInfoGain(self, value):
        """
        Sets the value of :py:attr:`minInfoGain`.
        """
        self.paramMap[self.minInfoGain] = value
        return self

    def getMinInfoGain(self):
        """
        Gets the value of minInfoGain or its default value.
        """
        return self.getOrDefault(self.minInfoGain)

    def setMaxMemoryInMB(self, value):
        """
        Sets the value of :py:attr:`maxMemoryInMB`.
        """
        self.paramMap[self.maxMemoryInMB] = value
        return self

    def getMaxMemoryInMB(self):
        """
        Gets the value of maxMemoryInMB or its default value.
        """
        return self.getOrDefault(self.maxMemoryInMB)

    def setCacheNodeIds(self, value):
        """
        Sets the value of :py:attr:`cacheNodeIds`.
        """
        self.paramMap[self.cacheNodeIds] = value
        return self

    def getCacheNodeIds(self):
        """
        Gets the value of cacheNodeIds or its default value.
        """
        return self.getOrDefault(self.cacheNodeIds)
This diff is collapsed.
......@@ -98,6 +98,8 @@ function run_ml_tests() {
echo "Run ml tests ..."
run_test "pyspark/ml/feature.py"
run_test "pyspark/ml/classification.py"
run_test "pyspark/ml/recommendation.py"
run_test "pyspark/ml/regression.py"
run_test "pyspark/ml/tuning.py"
run_test "pyspark/ml/tests.py"
run_test "pyspark/ml/evaluation.py"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment