Skip to content
Snippets Groups Projects
Commit 8a4ed788 authored by Yanbo Liang's avatar Yanbo Liang Committed by DB Tsai
Browse files

[SPARK-13379][MLLIB] Fix MLlib LogisticRegressionWithLBFGS set regularization incorrectly

## What changes were proposed in this pull request?
Fix the mapping from the MLlib LogisticRegressionWithLBFGS updater to the ML elastic-net parameter so that:
```SquaredL2Updater``` -> ```elasticNetParam = 0.0```
```L1Updater``` -> ```elasticNetParam = 1.0```
cc dbtsai
## How was this patch tested?
unit tests

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #11258 from yanboliang/spark-13379.
parent 9bf6a926
No related branches found
No related tags found
No related merge requests found
......@@ -444,8 +444,8 @@ class LogisticRegressionWithLBFGS
createModel(weights, mlLogisticRegresionModel.intercept)
}
optimizer.getUpdater() match {
case x: SquaredL2Updater => runWithMlLogisitcRegression(1.0)
case x: L1Updater => runWithMlLogisitcRegression(0.0)
case x: SquaredL2Updater => runWithMlLogisitcRegression(0.0)
case x: L1Updater => runWithMlLogisitcRegression(1.0)
case _ => super.run(input, initialWeights)
}
} else {
......
......@@ -29,6 +29,7 @@ import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils
......@@ -171,6 +172,37 @@ object LogisticRegressionSuite {
class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers {
// Dataset shared by the "binary logistic regression" tests; assigned in beforeAll().
@transient var binaryDataset: RDD[LabeledPoint] = _
override def beforeAll(): Unit = {
  super.beforeAll()

  /*
     The reference coefficients used by the tests come from R's glmnet package.
     To reproduce them, export the same generated data to CSV as follows:

     val nPoints = 10000
     val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
     val xMean = Array(5.843, 3.057, 3.758, 1.199)
     val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
     val data = sc.parallelize(LogisticRegressionSuite.generateMultinomialLogisticInput(
       coefficients, xMean, xVariance, true, nPoints, 42), 1)
     data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1) + ", "
       + x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
   */

  // Generator inputs; identical to the values in the export recipe above.
  val sampleCount = 10000
  val generatorCoefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
  val featureMeans = Array(5.843, 3.057, 3.758, 1.199)
  val featureVariances = Array(0.6856, 0.1899, 3.116, 0.581)

  val generatedPoints = LogisticRegressionSuite.generateMultinomialLogisticInput(
    generatorCoefficients, featureMeans, featureVariances, true, sampleCount, 42)

  // The data is distributed over two partitions.
  binaryDataset = sc.parallelize(generatedPoints, 2)
}
def validatePrediction(
predictions: Seq[Double],
input: Seq[LabeledPoint],
......@@ -555,6 +587,322 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext w
}
}
/**
* From Spark 2.0, MLlib LogisticRegressionWithLBFGS calls the LogisticRegression
* implementation in ML to train the model. We copy the test cases from ML to guarantee
* that both produce the same result.
*/
test("binary logistic regression with intercept without regularization") {
  // Fit the same unregularized model twice: with and without feature scaling.
  val scaledTrainer =
    new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
  val unscaledTrainer =
    new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)

  val scaledModel = scaledTrainer.run(binaryDataset)
  val unscaledModel = unscaledTrainer.run(binaryDataset)

  /*
     Reference values were obtained with the following R code (glmnet package).

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                         s0
     (Intercept)  2.8366423
     data.V2     -0.5895848
     data.V3      0.8931147
     data.V4     -0.3925051
     data.V5     -0.7996864
   */
  val expectedIntercept = 2.8366423
  val expectedWeights = Vectors.dense(-0.5895848, 0.8931147, -0.3925051, -0.7996864)

  // Without regularization, fitting with or without feature scaling converges to the
  // same solution, so both models are checked against the same reference.
  Seq(scaledModel, unscaledModel).foreach { fitted =>
    assert(fitted.intercept ~== expectedIntercept relTol 1E-3)
    assert(fitted.weights ~= expectedWeights relTol 1E-3)
  }
}
test("binary logistic regression without intercept without regularization") {
  // Fit the same unregularized, intercept-free model with and without feature scaling.
  val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
  val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)

  val model1 = trainer1.run(binaryDataset)
  val model2 = trainer2.run(binaryDataset)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients =
       coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                         s0
     (Intercept)   .
     data.V2     -0.3534996
     data.V3      1.2964482
     data.V4     -0.3571741
     data.V5     -0.7407946
   */
  val interceptR = 0.0
  val coefficientsR = Vectors.dense(-0.3534996, 1.2964482, -0.3571741, -0.7407946)

  // The expected intercept is exactly 0.0, so it is checked with an absolute tolerance:
  // a relative tolerance is not meaningful against zero. This matches the other
  // no-intercept tests in this suite.
  assert(model1.intercept ~== interceptR absTol 1E-3)
  assert(model1.weights ~= coefficientsR relTol 1E-2)

  // Without regularization, with or without feature scaling should converge to the same solution.
  assert(model2.intercept ~== interceptR absTol 1E-3)
  assert(model2.weights ~= coefficientsR relTol 1E-2)
}
test("binary logistic regression with intercept with L1 regularization") {
  // Builds an L1-regularized trainer with an intercept; only feature scaling varies.
  def newTrainer(featureScaling: Boolean): LogisticRegressionWithLBFGS = {
    val trainer =
      new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(featureScaling)
    trainer.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
    trainer
  }

  val scaledTrainer = newTrainer(featureScaling = true)
  val unscaledTrainer = newTrainer(featureScaling = false)

  val scaledModel = scaledTrainer.run(binaryDataset)
  val unscaledModel = unscaledTrainer.run(binaryDataset)

  /*
     Reference for the model trained with feature scaling (glmnet, default standardize).

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept) -0.05627428
     data.V2       .
     data.V3       .
     data.V4     -0.04325749
     data.V5     -0.02481551
   */
  val scaledInterceptR = -0.05627428
  val scaledWeightsR = Vectors.dense(0.0, 0.0, -0.04325749, -0.02481551)

  assert(scaledModel.intercept ~== scaledInterceptR relTol 1E-2)
  assert(scaledModel.weights ~= scaledWeightsR absTol 2E-2)

  /*
     Reference for the model trained without feature scaling (glmnet, standardize=FALSE).

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
       standardize=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                         s0
     (Intercept)  0.3722152
     data.V2       .
     data.V3       .
     data.V4     -0.1665453
     data.V5       .
   */
  val unscaledInterceptR = 0.3722152
  val unscaledWeightsR = Vectors.dense(0.0, 0.0, -0.1665453, 0.0)

  assert(unscaledModel.intercept ~== unscaledInterceptR relTol 1E-2)
  assert(unscaledModel.weights ~= unscaledWeightsR absTol 1E-3)
}
test("binary logistic regression without intercept with L1 regularization") {
  // Fit an L1-regularized, intercept-free model with and without feature scaling.
  val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
  trainer1.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
  val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)
  trainer2.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)

  val model1 = trainer1.run(binaryDataset)
  val model2 = trainer2.run(binaryDataset)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
       intercept=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)   .
     data.V2       .
     data.V3       .
     data.V4     -0.05189203
     data.V5     -0.03891782
   */
  val interceptR1 = 0.0
  val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.05189203, -0.03891782)

  // The expected intercept is exactly 0.0, so it is checked with an absolute tolerance:
  // a relative tolerance is not meaningful against zero. This also makes the check
  // agree with the model2 intercept assertion further down in this test.
  assert(model1.intercept ~== interceptR1 absTol 1E-3)
  assert(model1.weights ~= coefficientsR1 absTol 1E-3)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
       intercept=FALSE, standardize=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)   .
     data.V2       .
     data.V3       .
     data.V4     -0.08420782
     data.V5       .
   */
  val interceptR2 = 0.0
  val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.08420782, 0.0)

  assert(model2.intercept ~== interceptR2 absTol 1E-3)
  assert(model2.weights ~= coefficientsR2 absTol 1E-3)
}
test("binary logistic regression with intercept with L2 regularization") {
  // Builds an L2-regularized trainer with an intercept; only feature scaling varies.
  def newTrainer(featureScaling: Boolean): LogisticRegressionWithLBFGS = {
    val trainer =
      new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(featureScaling)
    trainer.optimizer
      .setUpdater(new SquaredL2Updater)
      .setRegParam(1.37)
      .setConvergenceTol(1E-6)
    trainer
  }

  val scaledTrainer = newTrainer(featureScaling = true)
  val unscaledTrainer = newTrainer(featureScaling = false)

  val scaledModel = scaledTrainer.run(binaryDataset)
  val unscaledModel = unscaledTrainer.run(binaryDataset)

  /*
     Reference for the model trained with feature scaling (glmnet, default standardize).

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)  0.15021751
     data.V2     -0.07251837
     data.V3      0.10724191
     data.V4     -0.04865309
     data.V5     -0.10062872
   */
  val scaledInterceptR = 0.15021751
  val scaledWeightsR = Vectors.dense(-0.07251837, 0.10724191, -0.04865309, -0.10062872)

  assert(scaledModel.intercept ~== scaledInterceptR relTol 1E-3)
  assert(scaledModel.weights ~= scaledWeightsR relTol 1E-3)

  /*
     Reference for the model trained without feature scaling (glmnet, standardize=FALSE).

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
       standardize=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)  0.48657516
     data.V2     -0.05155371
     data.V3      0.02301057
     data.V4     -0.11482896
     data.V5     -0.06266838
   */
  val unscaledInterceptR = 0.48657516
  val unscaledWeightsR = Vectors.dense(-0.05155371, 0.02301057, -0.11482896, -0.06266838)

  assert(unscaledModel.intercept ~== unscaledInterceptR relTol 1E-3)
  assert(unscaledModel.weights ~= unscaledWeightsR relTol 1E-3)
}
test("binary logistic regression without intercept with L2 regularization") {
  // Builds an L2-regularized trainer without an intercept; only feature scaling varies.
  def newTrainer(featureScaling: Boolean): LogisticRegressionWithLBFGS = {
    val trainer =
      new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(featureScaling)
    trainer.optimizer
      .setUpdater(new SquaredL2Updater)
      .setRegParam(1.37)
      .setConvergenceTol(1E-6)
    trainer
  }

  val scaledTrainer = newTrainer(featureScaling = true)
  val unscaledTrainer = newTrainer(featureScaling = false)

  val scaledModel = scaledTrainer.run(binaryDataset)
  val unscaledModel = unscaledTrainer.run(binaryDataset)

  /*
     Reference for the model trained with feature scaling (glmnet, default standardize).

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
       intercept=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)   .
     data.V2     -0.06099165
     data.V3      0.12857058
     data.V4     -0.04708770
     data.V5     -0.09799775
   */
  val scaledInterceptR = 0.0
  val scaledWeightsR = Vectors.dense(-0.06099165, 0.12857058, -0.04708770, -0.09799775)

  assert(scaledModel.intercept ~== scaledInterceptR absTol 1E-3)
  assert(scaledModel.weights ~= scaledWeightsR relTol 1E-2)

  /*
     Reference for the model trained without feature scaling (glmnet, standardize=FALSE).

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
       intercept=FALSE, standardize=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
     (Intercept)   .
     data.V2     -0.005679651
     data.V3      0.048967094
     data.V4     -0.093714016
     data.V5     -0.053314311
   */
  val unscaledInterceptR = 0.0
  val unscaledWeightsR =
    Vectors.dense(-0.005679651, 0.048967094, -0.093714016, -0.053314311)

  assert(unscaledModel.intercept ~== unscaledInterceptR absTol 1E-3)
  assert(unscaledModel.weights ~= unscaledWeightsR relTol 1E-2)
}
}
class LogisticRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment