Commit b656e613 authored by Yanbo Liang, committed by Xiangrui Meng

[SPARK-10026] [ML] [PySpark] Implement some common Params for regression in PySpark

LinearRegression and LogisticRegression are missing some Params on the Python side, and some Params are not defined as shared classes, which forces us to re-implement them in each class. The affected Params are listed here:
```scala
HasElasticNetParam
HasFitIntercept
HasStandardization
HasThresholds
```
Here we implement them as shared params on the Python side and bring the LinearRegression/LogisticRegression parameters to parity with their Scala counterparts.
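
With these shared params in place, both estimators accept the new parameters directly from Python, matching the Scala API. A minimal usage sketch (assuming a 1.5-era PySpark session; the `training` DataFrame with "label" and "features" columns is illustrative, not part of this patch):

```python
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression

# The shared params can now be passed as keyword arguments, as in Scala.
logreg = LogisticRegression(maxIter=100, regParam=0.1, elasticNetParam=0.5,
                            fitIntercept=True, standardization=True)
logreg_model = logreg.fit(training)

# LinearRegression gains fitIntercept and standardization as well.
linreg = LinearRegression(maxIter=100, regParam=0.0, elasticNetParam=0.8,
                          fitIntercept=True, standardization=False)
linreg_model = linreg.fit(training)
```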

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #8508 from yanboliang/spark-10026.
parent c268ca4d
python/pyspark/ml/classification.py
@@ -31,7 +31,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassif
@inherit_doc
class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol):
HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol,
HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds):
"""
Logistic regression.
Currently, this class only supports binary classification.
@@ -65,17 +66,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
"""
# a placeholder to make it appear in the generated doc
elasticNetParam = \
Param(Params._dummy(), "elasticNetParam",
"the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
"the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.")
thresholds = Param(Params._dummy(), "thresholds",
"Thresholds in multi-class classification" +
" to adjust the probability of predicting each class." +
" Array must have length equal to the number of classes, with values >= 0." +
" The class with largest value p/t is predicted, where p is the original" +
" probability of that class and t is the class' threshold.")
threshold = Param(Params._dummy(), "threshold",
"Threshold in binary classification prediction, in range [0, 1]." +
" If threshold and thresholds are both set, they must match.")
@@ -83,40 +73,23 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None,
probabilityCol="probability", rawPredictionCol="rawPrediction"):
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, \
probabilityCol="probability", rawPredictionCol="rawPrediction")
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True)
If the threshold and thresholds Params are both set, they must be equivalent.
"""
super(LogisticRegression, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.LogisticRegression", self.uid)
#: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty
# is an L2 penalty. For alpha = 1, it is an L1 penalty.
self.elasticNetParam = \
Param(self, "elasticNetParam",
"the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
"the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
#: param for whether to fit an intercept term.
self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.")
#: param for threshold in binary classification, in range [0, 1].
self.threshold = Param(self, "threshold",
"Threshold in binary classification prediction, in range [0, 1]." +
" If threshold and thresholds are both set, they must match.")
#: param for thresholds or cutoffs in binary or multiclass classification
self.thresholds = \
Param(self, "thresholds",
"Thresholds in multi-class classification" +
" to adjust the probability of predicting each class." +
" Array must have length equal to the number of classes, with values >= 0." +
" The class with largest value p/t is predicted, where p is the original" +
" probability of that class and t is the class' threshold.")
self._setDefault(maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1E-6,
fitIntercept=True, threshold=0.5)
self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
self._checkThresholdConsistency()
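
The constructor ends by calling `_checkThresholdConsistency`, whose body is not part of this hunk. Assuming it enforces the same rule as the Scala side (for binary classification, `threshold` must equal the value derived from a length-2 `thresholds` array), a hypothetical standalone sketch of the check:

```python
def check_threshold_consistency(threshold, thresholds):
    # Hypothetical helper mirroring what _checkThresholdConsistency
    # presumably verifies: when both params are set, `threshold` must
    # match the binary threshold derived from `thresholds`, i.e.
    # 1 / (1 + thresholds[0] / thresholds[1]).
    if threshold is None or thresholds is None:
        return  # only one of them is set; nothing to compare
    if len(thresholds) != 2:
        raise ValueError("threshold is only defined for binary classification, "
                         "but thresholds has length %d" % len(thresholds))
    derived = 1.0 / (1.0 + thresholds[0] / thresholds[1])
    if abs(threshold - derived) >= 1e-5:
        raise ValueError("threshold (%g) and thresholds (%s) do not match"
                         % (threshold, thresholds))
```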
@@ -124,13 +97,13 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
@keyword_only
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None,
probabilityCol="probability", rawPredictionCol="rawPrediction"):
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, \
probabilityCol="probability", rawPredictionCol="rawPrediction")
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True)
Sets params for logistic regression.
If the threshold and thresholds Params are both set, they must be equivalent.
"""
@@ -142,32 +115,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
def _create_model(self, java_model):
return LogisticRegressionModel(java_model)
def setElasticNetParam(self, value):
"""
Sets the value of :py:attr:`elasticNetParam`.
"""
self._paramMap[self.elasticNetParam] = value
return self
def getElasticNetParam(self):
"""
Gets the value of elasticNetParam or its default value.
"""
return self.getOrDefault(self.elasticNetParam)
def setFitIntercept(self, value):
"""
Sets the value of :py:attr:`fitIntercept`.
"""
self._paramMap[self.fitIntercept] = value
return self
def getFitIntercept(self):
"""
Gets the value of fitIntercept or its default value.
"""
return self.getOrDefault(self.fitIntercept)
def setThreshold(self, value):
"""
Sets the value of :py:attr:`threshold`.
......
python/pyspark/ml/param/_shared_params_code_gen.py
@@ -124,7 +124,16 @@ if __name__ == "__main__":
("stepSize", "Step size to be used for each iteration of optimization.", None),
("handleInvalid", "how to handle invalid entries. Options are skip (which will filter " +
"out rows with bad values), or error (which will throw an errror). More options may be " +
"added later.", None)]
"added later.", None),
("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
"the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0"),
("fitIntercept", "whether to fit an intercept term.", "True"),
("standardization", "whether to standardize the training features before fitting the " +
"model.", "True"),
("thresholds", "Thresholds in multi-class classification to adjust the probability of " +
"predicting each class. Array must have length equal to the number of classes, with " +
"values >= 0. The class with largest value p/t is predicted, where p is the original " +
"probability of that class and t is the class' threshold.", None)]
code = []
for name, doc, defaultValueStr in shared:
param_code = _gen_param_header(name, doc, defaultValueStr)
......
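
Each `(name, doc, defaultValueStr)` tuple in the `shared` list is expanded by `_gen_param_header` into a `Has<Name>` mixin like the ones added to shared.py below. A simplified, hypothetical sketch of that expansion (the real generator lives in `_shared_params_code_gen.py` and also emits the setter/getter pair):

```python
def gen_param_header(name, doc, defaultValueStr):
    # Simplified stand-in for _gen_param_header: fill a class template
    # with the param's name, doc string, and optional default value.
    template = '''class Has$Name(Params):
    """
    Mixin for param $name: $doc.
    """

    # a placeholder to make it appear in the generated doc
    $name = Param(Params._dummy(), "$name", "$doc")

    def __init__(self):
        super(Has$Name, self).__init__()
        #: param for $doc
        self.$name = Param(self, "$name", "$doc")'''
    if defaultValueStr is not None:
        template += '''
        self._setDefault($name=$defaultValueStr)'''
    Name = name[0].upper() + name[1:]  # elasticNetParam -> ElasticNetParam
    return (template.replace("$Name", Name).replace("$name", name)
            .replace("$doc", doc).replace("$defaultValueStr", str(defaultValueStr)))
```

Note that the template appends a period to `doc`, which is why the generated mixin docstrings below end with a doubled period when the doc string already ends in one.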
python/pyspark/ml/param/shared.py
@@ -459,6 +459,117 @@ class HasHandleInvalid(Params):
return self.getOrDefault(self.handleInvalid)
class HasElasticNetParam(Params):
"""
Mixin for param elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty..
"""
# a placeholder to make it appear in the generated doc
elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
def __init__(self):
super(HasElasticNetParam, self).__init__()
#: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
self.elasticNetParam = Param(self, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
self._setDefault(elasticNetParam=0.0)
def setElasticNetParam(self, value):
"""
Sets the value of :py:attr:`elasticNetParam`.
"""
self._paramMap[self.elasticNetParam] = value
return self
def getElasticNetParam(self):
"""
Gets the value of elasticNetParam or its default value.
"""
return self.getOrDefault(self.elasticNetParam)
class HasFitIntercept(Params):
"""
Mixin for param fitIntercept: whether to fit an intercept term..
"""
# a placeholder to make it appear in the generated doc
fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.")
def __init__(self):
super(HasFitIntercept, self).__init__()
#: param for whether to fit an intercept term.
self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.")
self._setDefault(fitIntercept=True)
def setFitIntercept(self, value):
"""
Sets the value of :py:attr:`fitIntercept`.
"""
self._paramMap[self.fitIntercept] = value
return self
def getFitIntercept(self):
"""
Gets the value of fitIntercept or its default value.
"""
return self.getOrDefault(self.fitIntercept)
class HasStandardization(Params):
"""
Mixin for param standardization: whether to standardize the training features before fitting the model..
"""
# a placeholder to make it appear in the generated doc
standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.")
def __init__(self):
super(HasStandardization, self).__init__()
#: param for whether to standardize the training features before fitting the model.
self.standardization = Param(self, "standardization", "whether to standardize the training features before fitting the model.")
self._setDefault(standardization=True)
def setStandardization(self, value):
"""
Sets the value of :py:attr:`standardization`.
"""
self._paramMap[self.standardization] = value
return self
def getStandardization(self):
"""
Gets the value of standardization or its default value.
"""
return self.getOrDefault(self.standardization)
class HasThresholds(Params):
"""
Mixin for param thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold..
"""
# a placeholder to make it appear in the generated doc
thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.")
def __init__(self):
super(HasThresholds, self).__init__()
#: param for Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.
self.thresholds = Param(self, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.")
def setThresholds(self, value):
"""
Sets the value of :py:attr:`thresholds`.
"""
self._paramMap[self.thresholds] = value
return self
def getThresholds(self):
"""
Gets the value of thresholds or its default value.
"""
return self.getOrDefault(self.thresholds)
class DecisionTreeParams(Params):
"""
Mixin for Decision Tree parameters.
......
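
Because each generated mixin owns its `Param` placeholder, default, setter, and getter, an estimator only needs to list the traits among its bases; the mixins' `__init__` methods register their params through cooperative `super()` calls. A small illustrative sketch (`MyEstimator` is made up for this example):

```python
from pyspark.ml.param.shared import HasElasticNetParam, HasFitIntercept

class MyEstimator(HasElasticNetParam, HasFitIntercept):
    """Toy class that inherits the shared params and their defaults."""
    def __init__(self):
        super(MyEstimator, self).__init__()

est = MyEstimator()
est.getElasticNetParam()        # 0.0, the default set by the mixin
est.getFitIntercept()           # True
est.setElasticNetParam(0.5)     # setters return self, so calls chain
est.getElasticNetParam()       # 0.5
```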
python/pyspark/ml/regression.py
@@ -28,7 +28,8 @@ __all__ = ['DecisionTreeRegressor', 'DecisionTreeRegressionModel', 'GBTRegressor
@inherit_doc
class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
HasRegParam, HasTol):
HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept,
HasStandardization):
"""
Linear regression.
......@@ -63,38 +64,30 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction
TypeError: Method setParams forces keyword arguments.
"""
# a placeholder to make it appear in the generated doc
elasticNetParam = \
Param(Params._dummy(), "elasticNetParam",
"the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
"the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6):
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
standardization=True):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
standardization=True)
"""
super(LinearRegression, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.regression.LinearRegression", self.uid)
#: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty
# is an L2 penalty. For alpha = 1, it is an L1 penalty.
self.elasticNetParam = \
Param(self, "elasticNetParam",
"the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty " +
"is an L2 penalty. For alpha = 1, it is an L1 penalty.")
self._setDefault(maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
self._setDefault(maxIter=100, regParam=0.0, tol=1e-6)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@keyword_only
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6):
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
standardization=True):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
standardization=True)
Sets params for linear regression.
"""
kwargs = self.setParams._input_kwargs
......@@ -103,19 +96,6 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction
def _create_model(self, java_model):
return LinearRegressionModel(java_model)
def setElasticNetParam(self, value):
"""
Sets the value of :py:attr:`elasticNetParam`.
"""
self._paramMap[self.elasticNetParam] = value
return self
def getElasticNetParam(self):
"""
Gets the value of elasticNetParam or its default value.
"""
return self.getOrDefault(self.elasticNetParam)
class LinearRegressionModel(JavaModel):
"""
......
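
With the duplicated per-class definitions gone, LinearRegression now inherits `setElasticNetParam`/`getElasticNetParam` from `HasElasticNetParam`, alongside the new `fitIntercept` and `standardization` accessors. A quick sketch of the resulting fluent API (values are illustrative):

```python
from pyspark.ml.regression import LinearRegression

linreg = (LinearRegression()
          .setMaxIter(50)               # from HasMaxIter
          .setElasticNetParam(0.8)      # inherited from HasElasticNetParam
          .setFitIntercept(False)       # from HasFitIntercept
          .setStandardization(False))   # from HasStandardization
linreg.getElasticNetParam()             # 0.8
```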