Skip to content
Snippets Groups Projects
Commit b656e613 authored by Yanbo Liang's avatar Yanbo Liang Committed by Xiangrui Meng
Browse files

[SPARK-10026] [ML] [PySpark] Implement some common Params for regression in PySpark

LinearRegression and LogisticRegression lack some Params on the Python side, and some Params are not in shared classes, which means we need to write them for each class. These kinds of Params are listed here:
```scala
HasElasticNetParam
HasFitIntercept
HasStandardization
HasThresholds
```
Here we implement them in shared params on the Python side and make the LinearRegression/LogisticRegression parameters on par with the Scala ones.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #8508 from yanboliang/spark-10026.
parent c268ca4d
No related branches found
No related tags found
No related merge requests found
...@@ -31,7 +31,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassif ...@@ -31,7 +31,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassif
@inherit_doc @inherit_doc
class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol): HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol,
HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds):
""" """
Logistic regression. Logistic regression.
Currently, this class only supports binary classification. Currently, this class only supports binary classification.
...@@ -65,17 +66,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti ...@@ -65,17 +66,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
""" """
# a placeholder to make it appear in the generated doc # a placeholder to make it appear in the generated doc
elasticNetParam = \
Param(Params._dummy(), "elasticNetParam",
"the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
"the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.")
thresholds = Param(Params._dummy(), "thresholds",
"Thresholds in multi-class classification" +
" to adjust the probability of predicting each class." +
" Array must have length equal to the number of classes, with values >= 0." +
" The class with largest value p/t is predicted, where p is the original" +
" probability of that class and t is the class' threshold.")
threshold = Param(Params._dummy(), "threshold", threshold = Param(Params._dummy(), "threshold",
"Threshold in binary classification prediction, in range [0, 1]." + "Threshold in binary classification prediction, in range [0, 1]." +
" If threshold and thresholds are both set, they must match.") " If threshold and thresholds are both set, they must match.")
...@@ -83,40 +73,23 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti ...@@ -83,40 +73,23 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
@keyword_only @keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, threshold=0.5, thresholds=None, probabilityCol="probability",
probabilityCol="probability", rawPredictionCol="rawPrediction"): rawPredictionCol="rawPrediction", standardization=True):
""" """
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, \ threshold=0.5, thresholds=None, probabilityCol="probability", \
probabilityCol="probability", rawPredictionCol="rawPrediction") rawPredictionCol="rawPrediction", standardization=True)
If the threshold and thresholds Params are both set, they must be equivalent. If the threshold and thresholds Params are both set, they must be equivalent.
""" """
super(LogisticRegression, self).__init__() super(LogisticRegression, self).__init__()
self._java_obj = self._new_java_obj( self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.LogisticRegression", self.uid) "org.apache.spark.ml.classification.LogisticRegression", self.uid)
#: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty
# is an L2 penalty. For alpha = 1, it is an L1 penalty.
self.elasticNetParam = \
Param(self, "elasticNetParam",
"the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
"the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
#: param for whether to fit an intercept term.
self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.")
#: param for threshold in binary classification, in range [0, 1]. #: param for threshold in binary classification, in range [0, 1].
self.threshold = Param(self, "threshold", self.threshold = Param(self, "threshold",
"Threshold in binary classification prediction, in range [0, 1]." + "Threshold in binary classification prediction, in range [0, 1]." +
" If threshold and thresholds are both set, they must match.") " If threshold and thresholds are both set, they must match.")
#: param for thresholds or cutoffs in binary or multiclass classification self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5)
self.thresholds = \
Param(self, "thresholds",
"Thresholds in multi-class classification" +
" to adjust the probability of predicting each class." +
" Array must have length equal to the number of classes, with values >= 0." +
" The class with largest value p/t is predicted, where p is the original" +
" probability of that class and t is the class' threshold.")
self._setDefault(maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1E-6,
fitIntercept=True, threshold=0.5)
kwargs = self.__init__._input_kwargs kwargs = self.__init__._input_kwargs
self.setParams(**kwargs) self.setParams(**kwargs)
self._checkThresholdConsistency() self._checkThresholdConsistency()
...@@ -124,13 +97,13 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti ...@@ -124,13 +97,13 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
@keyword_only @keyword_only
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, threshold=0.5, thresholds=None, probabilityCol="probability",
probabilityCol="probability", rawPredictionCol="rawPrediction"): rawPredictionCol="rawPrediction", standardization=True):
""" """
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, \ threshold=0.5, thresholds=None, probabilityCol="probability", \
probabilityCol="probability", rawPredictionCol="rawPrediction") rawPredictionCol="rawPrediction", standardization=True)
Sets params for logistic regression. Sets params for logistic regression.
If the threshold and thresholds Params are both set, they must be equivalent. If the threshold and thresholds Params are both set, they must be equivalent.
""" """
...@@ -142,32 +115,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti ...@@ -142,32 +115,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
def _create_model(self, java_model): def _create_model(self, java_model):
return LogisticRegressionModel(java_model) return LogisticRegressionModel(java_model)
def setElasticNetParam(self, value):
"""
Sets the value of :py:attr:`elasticNetParam`.
"""
self._paramMap[self.elasticNetParam] = value
return self
def getElasticNetParam(self):
"""
Gets the value of elasticNetParam or its default value.
"""
return self.getOrDefault(self.elasticNetParam)
def setFitIntercept(self, value):
"""
Sets the value of :py:attr:`fitIntercept`.
"""
self._paramMap[self.fitIntercept] = value
return self
def getFitIntercept(self):
"""
Gets the value of fitIntercept or its default value.
"""
return self.getOrDefault(self.fitIntercept)
def setThreshold(self, value): def setThreshold(self, value):
""" """
Sets the value of :py:attr:`threshold`. Sets the value of :py:attr:`threshold`.
......
...@@ -124,7 +124,16 @@ if __name__ == "__main__": ...@@ -124,7 +124,16 @@ if __name__ == "__main__":
("stepSize", "Step size to be used for each iteration of optimization.", None), ("stepSize", "Step size to be used for each iteration of optimization.", None),
("handleInvalid", "how to handle invalid entries. Options are skip (which will filter " + ("handleInvalid", "how to handle invalid entries. Options are skip (which will filter " +
"out rows with bad values), or error (which will throw an errror). More options may be " + "out rows with bad values), or error (which will throw an errror). More options may be " +
"added later.", None)] "added later.", None),
("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
"the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0"),
("fitIntercept", "whether to fit an intercept term.", "True"),
("standardization", "whether to standardize the training features before fitting the " +
"model.", "True"),
("thresholds", "Thresholds in multi-class classification to adjust the probability of " +
"predicting each class. Array must have length equal to the number of classes, with " +
"values >= 0. The class with largest value p/t is predicted, where p is the original " +
"probability of that class and t is the class' threshold.", None)]
code = [] code = []
for name, doc, defaultValueStr in shared: for name, doc, defaultValueStr in shared:
param_code = _gen_param_header(name, doc, defaultValueStr) param_code = _gen_param_header(name, doc, defaultValueStr)
......
...@@ -459,6 +459,117 @@ class HasHandleInvalid(Params): ...@@ -459,6 +459,117 @@ class HasHandleInvalid(Params):
return self.getOrDefault(self.handleInvalid) return self.getOrDefault(self.handleInvalid)
class HasElasticNetParam(Params):
    """
    Mixin providing the ``elasticNetParam`` Param: the ElasticNet mixing
    parameter, in range [0, 1] (alpha = 0 gives a pure L2 penalty,
    alpha = 1 a pure L1 penalty).
    """

    # Class-level placeholder so the Param shows up in the generated docs.
    elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")

    def __init__(self):
        super(HasElasticNetParam, self).__init__()
        doc = ("the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, "
               "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
        #: instance-level Param for the ElasticNet mixing parameter
        self.elasticNetParam = Param(self, "elasticNetParam", doc)
        self._setDefault(elasticNetParam=0.0)

    def setElasticNetParam(self, value):
        """
        Set the value of :py:attr:`elasticNetParam` and return ``self``.
        """
        self._paramMap[self.elasticNetParam] = value
        return self

    def getElasticNetParam(self):
        """
        Return the value of elasticNetParam, or its default if unset.
        """
        return self.getOrDefault(self.elasticNetParam)
class HasFitIntercept(Params):
    """
    Mixin providing the ``fitIntercept`` Param: whether an intercept term
    should be fitted by the estimator.
    """

    # Class-level placeholder so the Param shows up in the generated docs.
    fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.")

    def __init__(self):
        super(HasFitIntercept, self).__init__()
        #: instance-level Param for whether to fit an intercept term
        self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.")
        self._setDefault(fitIntercept=True)

    def setFitIntercept(self, value):
        """
        Set the value of :py:attr:`fitIntercept` and return ``self``.
        """
        self._paramMap[self.fitIntercept] = value
        return self

    def getFitIntercept(self):
        """
        Return the value of fitIntercept, or its default if unset.
        """
        return self.getOrDefault(self.fitIntercept)
class HasStandardization(Params):
    """
    Mixin providing the ``standardization`` Param: whether the training
    features are standardized before the model is fitted.
    """

    # Class-level placeholder so the Param shows up in the generated docs.
    standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.")

    def __init__(self):
        super(HasStandardization, self).__init__()
        doc = ("whether to standardize the training features before fitting the "
               "model.")
        #: instance-level Param for whether to standardize the training features
        self.standardization = Param(self, "standardization", doc)
        self._setDefault(standardization=True)

    def setStandardization(self, value):
        """
        Set the value of :py:attr:`standardization` and return ``self``.
        """
        self._paramMap[self.standardization] = value
        return self

    def getStandardization(self):
        """
        Return the value of standardization, or its default if unset.
        """
        return self.getOrDefault(self.standardization)
class HasThresholds(Params):
    """
    Mixin providing the ``thresholds`` Param: per-class thresholds used to
    adjust the probability of predicting each class in multi-class
    classification. The array must have one non-negative entry per class;
    the class with the largest p/t wins, where p is the original
    probability and t is that class's threshold.

    Note: intentionally no default is set for this Param.
    """

    # Class-level placeholder so the Param shows up in the generated docs.
    thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.")

    def __init__(self):
        super(HasThresholds, self).__init__()
        doc = ("Thresholds in multi-class classification to adjust the probability of "
               "predicting each class. Array must have length equal to the number of "
               "classes, with values >= 0. The class with largest value p/t is "
               "predicted, where p is the original probability of that class and t is "
               "the class' threshold.")
        #: instance-level Param for the per-class prediction thresholds
        self.thresholds = Param(self, "thresholds", doc)

    def setThresholds(self, value):
        """
        Set the value of :py:attr:`thresholds` and return ``self``.
        """
        self._paramMap[self.thresholds] = value
        return self

    def getThresholds(self):
        """
        Return the value of thresholds, or its default if unset.
        """
        return self.getOrDefault(self.thresholds)
class DecisionTreeParams(Params): class DecisionTreeParams(Params):
""" """
Mixin for Decision Tree parameters. Mixin for Decision Tree parameters.
......
...@@ -28,7 +28,8 @@ __all__ = ['DecisionTreeRegressor', 'DecisionTreeRegressionModel', 'GBTRegressor ...@@ -28,7 +28,8 @@ __all__ = ['DecisionTreeRegressor', 'DecisionTreeRegressionModel', 'GBTRegressor
@inherit_doc @inherit_doc
class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
HasRegParam, HasTol): HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept,
HasStandardization):
""" """
Linear regression. Linear regression.
...@@ -63,38 +64,30 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction ...@@ -63,38 +64,30 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction
TypeError: Method setParams forces keyword arguments. TypeError: Method setParams forces keyword arguments.
""" """
# a placeholder to make it appear in the generated doc
elasticNetParam = \
Param(Params._dummy(), "elasticNetParam",
"the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
"the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
@keyword_only @keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6): maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
standardization=True):
""" """
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6) maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
standardization=True)
""" """
super(LinearRegression, self).__init__() super(LinearRegression, self).__init__()
self._java_obj = self._new_java_obj( self._java_obj = self._new_java_obj(
"org.apache.spark.ml.regression.LinearRegression", self.uid) "org.apache.spark.ml.regression.LinearRegression", self.uid)
#: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty self._setDefault(maxIter=100, regParam=0.0, tol=1e-6)
# is an L2 penalty. For alpha = 1, it is an L1 penalty.
self.elasticNetParam = \
Param(self, "elasticNetParam",
"the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty " +
"is an L2 penalty. For alpha = 1, it is an L1 penalty.")
self._setDefault(maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
kwargs = self.__init__._input_kwargs kwargs = self.__init__._input_kwargs
self.setParams(**kwargs) self.setParams(**kwargs)
@keyword_only @keyword_only
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6): maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
standardization=True):
""" """
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6) maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
standardization=True)
Sets params for linear regression. Sets params for linear regression.
""" """
kwargs = self.setParams._input_kwargs kwargs = self.setParams._input_kwargs
...@@ -103,19 +96,6 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction ...@@ -103,19 +96,6 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction
def _create_model(self, java_model): def _create_model(self, java_model):
return LinearRegressionModel(java_model) return LinearRegressionModel(java_model)
def setElasticNetParam(self, value):
"""
Sets the value of :py:attr:`elasticNetParam`.
"""
self._paramMap[self.elasticNetParam] = value
return self
def getElasticNetParam(self):
"""
Gets the value of elasticNetParam or its default value.
"""
return self.getOrDefault(self.elasticNetParam)
class LinearRegressionModel(JavaModel): class LinearRegressionModel(JavaModel):
""" """
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment