Commit 164fe2aa authored by Holden Karau's avatar Holden Karau Committed by Joseph K. Bradley

[SPARK-7781] [MLLIB] gradient boosted trees.train regressor missing max bins

Author: Holden Karau <holden@pigscanfly.ca>

Closes #6331 from holdenk/SPARK-7781-GradientBoostedTrees.trainRegressor-missing-max-bins and squashes the following commits:

2894695 [Holden Karau] remove extra blank line
2573e8d [Holden Karau] Update the Scala side of the PythonMLLibAPI and make the test a bit nicer too
3a09170 [Holden Karau] Add maxBins to the train method as well
af7f274 [Holden Karau] Add maxBins to GradientBoostedTrees.trainRegressor and correctly mention the default of 32 in other places where it mentioned 100
parent 44fa7df6
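
A minimal usage sketch (not part of the commit) of the newly exposed maxBins parameter on GradientBoostedTrees.trainRegressor. The local SparkContext, app name, and toy dataset are illustrative assumptions only.

# Hedged example: train a small gradient-boosted trees regressor and pass
# maxBins explicitly. Everything below except the trainRegressor signature
# (shown in the diff) is an assumption for illustration.
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees

sc = SparkContext("local[2]", "gbt-maxbins-example")

# Tiny toy dataset: one continuous feature, no categorical features.
data = sc.parallelize([
    LabeledPoint(0.0, [0.0]),
    LabeledPoint(0.0, [1.0]),
    LabeledPoint(1.0, [2.0]),
    LabeledPoint(1.0, [3.0]),
])

# maxBins is forwarded through PythonMLLibAPI to treeStrategy.setMaxBins;
# it must be at least the number of categories of any categorical feature.
model = GradientBoostedTrees.trainRegressor(
    data, categoricalFeaturesInfo={}, numIterations=4, maxDepth=3, maxBins=32)

print(model.predict([2.5]))
sc.stop()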
mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -696,12 +696,14 @@ private[python] class PythonMLLibAPI extends Serializable {
       lossStr: String,
       numIterations: Int,
       learningRate: Double,
-      maxDepth: Int): GradientBoostedTreesModel = {
+      maxDepth: Int,
+      maxBins: Int): GradientBoostedTreesModel = {
     val boostingStrategy = BoostingStrategy.defaultParams(algoStr)
     boostingStrategy.setLoss(Losses.fromString(lossStr))
     boostingStrategy.setNumIterations(numIterations)
     boostingStrategy.setLearningRate(learningRate)
     boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
+    boostingStrategy.treeStrategy.setMaxBins(maxBins)
     boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap
     val cached = data.rdd.persist(StorageLevel.MEMORY_AND_DISK)
python/pyspark/mllib/tests.py
@@ -463,6 +463,13 @@ class ListTests(MLlibTestCase):
         except ValueError:
             self.fail()

+        # Verify that maxBins is being passed through
+        GradientBoostedTrees.trainRegressor(
+            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32)
+        with self.assertRaises(Exception) as cm:
+            GradientBoostedTrees.trainRegressor(
+                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1)
+

 class StatTests(MLlibTestCase):
     # SPARK-4023
python/pyspark/mllib/tree.py
@@ -299,7 +299,7 @@ class RandomForest(object):
                           1 internal node + 2 leaf nodes. (default: 4)
         :param maxBins: maximum number of bins used for splitting
                         features
-                        (default: 100)
+                        (default: 32)
         :param seed: Random seed for bootstrapping and choosing feature
                      subsets.
         :return: RandomForestModel that can be used for prediction
@@ -377,7 +377,7 @@ class RandomForest(object):
                           1 leaf node; depth 1 means 1 internal node + 2 leaf
                           nodes. (default: 4)
         :param maxBins: maximum number of bins used for splitting
-                        features (default: 100)
+                        features (default: 32)
         :param seed: Random seed for bootstrapping and choosing feature
                      subsets.
         :return: RandomForestModel that can be used for prediction
@@ -435,16 +435,17 @@ class GradientBoostedTrees(object):
     @classmethod
     def _train(cls, data, algo, categoricalFeaturesInfo,
-               loss, numIterations, learningRate, maxDepth):
+               loss, numIterations, learningRate, maxDepth, maxBins):
         first = data.first()
         assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
         model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
-                              loss, numIterations, learningRate, maxDepth)
+                              loss, numIterations, learningRate, maxDepth, maxBins)
         return GradientBoostedTreesModel(model)

     @classmethod
     def trainClassifier(cls, data, categoricalFeaturesInfo,
-                        loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3):
+                        loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3,
+                        maxBins=32):
         """
         Method to train a gradient-boosted trees model for
         classification.
@@ -467,6 +468,8 @@ class GradientBoostedTrees(object):
         :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
                          1 leaf node; depth 1 means 1 internal node + 2 leaf
                          nodes. (default: 3)
+        :param maxBins: maximum number of bins used for splitting
+                        features (default: 32) DecisionTree requires maxBins >= max categories
         :return: GradientBoostedTreesModel that can be used for
                  prediction
@@ -499,11 +502,12 @@ class GradientBoostedTrees(object):
         [1.0, 0.0]
         """
         return cls._train(data, "classification", categoricalFeaturesInfo,
-                          loss, numIterations, learningRate, maxDepth)
+                          loss, numIterations, learningRate, maxDepth, maxBins)

     @classmethod
     def trainRegressor(cls, data, categoricalFeaturesInfo,
-                       loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3):
+                       loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
+                       maxBins=32):
         """
         Method to train a gradient-boosted trees model for regression.
@@ -522,6 +526,8 @@ class GradientBoostedTrees(object):
                        contribution of each estimator. The learning rate
                        should be between in the interval (0, 1].
                        (default: 0.1)
+        :param maxBins: maximum number of bins used for splitting
+                        features (default: 32) DecisionTree requires maxBins >= max categories
         :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
                          1 leaf node; depth 1 means 1 internal node + 2 leaf
                          nodes. (default: 3)
@@ -556,7 +562,7 @@ class GradientBoostedTrees(object):
         [1.0, 0.0]
         """
         return cls._train(data, "regression", categoricalFeaturesInfo,
-                          loss, numIterations, learningRate, maxDepth)
+                          loss, numIterations, learningRate, maxDepth, maxBins)


 def _test():