From b8280271396eb74638da6546d76bbb2d06c7011b Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Wed, 7 Dec 2016 16:37:25 +0800
Subject: [PATCH] [SPARK-18701][ML] Fix Poisson GLM failure due to wrong
 initialization

Poisson GLM fails for many standard data sets (see example in test or JIRA). The issue is incorrect initialization leading to almost zero probability and weights. Specifically, the mean is initialized as the response, which could be zero. Applying the log link results in very negative numbers (protected against -Inf), which again leads to close to zero probability and weights in the weighted least squares. Fix and test are included in the commits.

## What changes were proposed in this pull request?
Update initialization in Poisson GLM

## How was this patch tested?
Add test in GeneralizedLinearRegressionSuite

srowen sethah yanboliang HyukjinKwon mengxr

Author: actuaryzhang <actuaryzhang10@gmail.com>

Closes #16131 from actuaryzhang/master.
---
 .../GeneralizedLinearRegression.scala         |  6 +++++-
 .../GeneralizedLinearRegressionSuite.scala    | 21 +++++++++++--------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 770a2571bb..f137c8cb41 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -505,7 +505,11 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
     override def initialize(y: Double, weight: Double): Double = {
       require(y >= 0.0, "The response variable of Poisson family " +
         s"should be non-negative, but got $y")
-      y
+      /*
+        Force Poisson mean > 0 to avoid numerical instability in IRLS.
+        R uses y + 0.1 for initialization. See poisson()$initialize.
+       */
+      math.max(y, 0.1)
     }
 
     override def variance(mu: Double): Double = mu
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 4fab216033..3e9e1fced8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -89,11 +89,14 @@ class GeneralizedLinearRegressionSuite
       xVariance = Array(0.7, 1.2), nPoints = 10000, seed, noiseLevel = 0.01,
       family = "poisson", link = "log").toDF()
 
-    datasetPoissonLogWithZero = generateGeneralizedLinearRegressionInput(
-      intercept = -1.5, coefficients = Array(0.22, 0.06), xMean = Array(2.9, 10.5),
-      xVariance = Array(0.7, 1.2), nPoints = 100, seed, noiseLevel = 0.01,
-      family = "poisson", link = "log")
-      .map{x => LabeledPoint(if (x.label < 0.7) 0.0 else x.label, x.features)}.toDF()
+    datasetPoissonLogWithZero = Seq(
+      LabeledPoint(0.0, Vectors.dense(18, 1.0)),
+      LabeledPoint(1.0, Vectors.dense(12, 0.0)),
+      LabeledPoint(0.0, Vectors.dense(15, 0.0)),
+      LabeledPoint(0.0, Vectors.dense(13, 2.0)),
+      LabeledPoint(0.0, Vectors.dense(15, 1.0)),
+      LabeledPoint(1.0, Vectors.dense(16, 1.0))
+    ).toDF()
 
     datasetPoissonIdentity = generateGeneralizedLinearRegressionInput(
       intercept = 2.5, coefficients = Array(2.2, 0.6), xMean = Array(2.9, 10.5),
@@ -480,12 +483,12 @@ class GeneralizedLinearRegressionSuite
          model <- glm(formula, family="poisson", data=data)
          print(as.vector(coef(model)))
        }
-       [1]  0.4272661 -0.1565423
-       [1] -3.6911354  0.6214301  0.1295814
+       [1] -0.0457441 -0.6833928
+       [1] 1.8121235  -0.1747493  -0.5815417
      */
     val expected = Seq(
-      Vectors.dense(0.0, 0.4272661, -0.1565423),
-      Vectors.dense(-3.6911354, 0.6214301, 0.1295814))
+      Vectors.dense(0.0, -0.0457441, -0.6833928),
+      Vectors.dense(1.8121235, -0.1747493, -0.5815417))
 
     import GeneralizedLinearRegression._
 
-- 
GitLab