Skip to content
Snippets Groups Projects
Commit 43533738 authored by Yanbo Liang's avatar Yanbo Liang Committed by Joseph K. Bradley
Browse files

[SPARK-6256] [MLlib] MLlib Python API parity check for regression

MLlib Python API parity check for regression. The major disparities that need to be added to the Python API are listed below:
```scala
LinearRegressionWithSGD
    setValidateData
LassoWithSGD
    setIntercept
    setValidateData
RidgeRegressionWithSGD
    setIntercept
    setValidateData
```
setFeatureScaling is an MLlib-private function, so it does not need to be exposed in PySpark.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #4997 from yanboliang/spark-6256 and squashes the following commits:

102f498 [Yanbo Liang] fix intercept issue & add doc test
1fb7b4f [Yanbo Liang] change 'intercept' to 'addIntercept'
de5ecbc [Yanbo Liang] MLlib Python API parity check for regression
parent c1b74df6
No related branches found
No related tags found
No related merge requests found
......@@ -111,9 +111,11 @@ private[python] class PythonMLLibAPI extends Serializable {
initialWeights: Vector,
regParam: Double,
regType: String,
intercept: Boolean): JList[Object] = {
intercept: Boolean,
validateData: Boolean): JList[Object] = {
val lrAlg = new LinearRegressionWithSGD()
lrAlg.setIntercept(intercept)
.setValidateData(validateData)
lrAlg.optimizer
.setNumIterations(numIterations)
.setRegParam(regParam)
......@@ -135,8 +137,12 @@ private[python] class PythonMLLibAPI extends Serializable {
stepSize: Double,
regParam: Double,
miniBatchFraction: Double,
initialWeights: Vector): JList[Object] = {
initialWeights: Vector,
intercept: Boolean,
validateData: Boolean): JList[Object] = {
val lassoAlg = new LassoWithSGD()
lassoAlg.setIntercept(intercept)
.setValidateData(validateData)
lassoAlg.optimizer
.setNumIterations(numIterations)
.setRegParam(regParam)
......@@ -157,8 +163,12 @@ private[python] class PythonMLLibAPI extends Serializable {
stepSize: Double,
regParam: Double,
miniBatchFraction: Double,
initialWeights: Vector): JList[Object] = {
initialWeights: Vector,
intercept: Boolean,
validateData: Boolean): JList[Object] = {
val ridgeAlg = new RidgeRegressionWithSGD()
ridgeAlg.setIntercept(intercept)
.setValidateData(validateData)
ridgeAlg.optimizer
.setNumIterations(numIterations)
.setRegParam(regParam)
......
......@@ -140,6 +140,13 @@ class LinearRegressionModel(LinearRegressionModelBase):
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
>>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
... miniBatchFraction=1.0, initialWeights=array([1.0]), regParam=0.1, regType="l2",
... intercept=True, validateData=True)
>>> abs(lrm.predict(array([0.0])) - 0) < 0.5
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
"""
def save(self, sc, path):
java_model = sc._jvm.org.apache.spark.mllib.regression.LinearRegressionModel(
......@@ -173,7 +180,8 @@ class LinearRegressionWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
initialWeights=None, regParam=0.0, regType=None, intercept=False):
initialWeights=None, regParam=0.0, regType=None, intercept=False,
validateData=True):
"""
Train a linear regression model on the given data.
......@@ -195,15 +203,18 @@ class LinearRegressionWithSGD(object):
(default: None)
@param intercept: Boolean parameter which indicates the use
:param intercept: Boolean parameter which indicates the use
or not of the augmented representation for
training data (i.e. whether bias features
are activated or not). (default: False)
:param validateData: Boolean parameter which indicates if the
algorithm should validate data before training.
(default: True)
"""
def train(rdd, i):
return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
float(step), float(miniBatchFraction), i, float(regParam),
regType, bool(intercept))
regType, bool(intercept), bool(validateData))
return _regression_train_wrapper(train, LinearRegressionModel, data, initialWeights)
......@@ -253,6 +264,13 @@ class LassoModel(LinearRegressionModelBase):
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
>>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
... validateData=True)
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
"""
def save(self, sc, path):
java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel(
......@@ -273,11 +291,13 @@ class LassoWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
miniBatchFraction=1.0, initialWeights=None):
miniBatchFraction=1.0, initialWeights=None, intercept=False,
validateData=True):
"""Train a Lasso regression model on the given data."""
def train(rdd, i):
return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step),
float(regParam), float(miniBatchFraction), i)
float(regParam), float(miniBatchFraction), i, bool(intercept),
bool(validateData))
return _regression_train_wrapper(train, LassoModel, data, initialWeights)
......@@ -327,6 +347,13 @@ class RidgeRegressionModel(LinearRegressionModelBase):
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
>>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
... validateData=True)
>>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
"""
def save(self, sc, path):
java_model = sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel(
......@@ -347,11 +374,13 @@ class RidgeRegressionWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
miniBatchFraction=1.0, initialWeights=None):
miniBatchFraction=1.0, initialWeights=None, intercept=False,
validateData=True):
"""Train a ridge regression model on the given data."""
def train(rdd, i):
return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step),
float(regParam), float(miniBatchFraction), i)
float(regParam), float(miniBatchFraction), i, bool(intercept),
bool(validateData))
return _regression_train_wrapper(train, RidgeRegressionModel, data, initialWeights)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment