From d9327192eee7f18e92381c59a42b0e1770f1f8f4 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <>
Date: Thu, 30 Oct 2014 12:00:56 -0700
Subject: [PATCH] SPARK-4111 [MLlib] add regression metrics

Add RegressionMetrics.scala, which provides regression metrics for model evaluation, and the corresponding test suite RegressionMetricsSuite.scala.

Author: Yanbo Liang <>
Author: liangyanbo <>

Closes #2978 from yanbohappy/regression_metrics and squashes the following commits:

730d0a9 [Yanbo Liang] more clearly annotation
3d0bec1 [Yanbo Liang] rename and keep code style
a8ad3e3 [Yanbo Liang] simplify code for keeping style
d454909 [Yanbo Liang] rename parameter and function names, delete unused columns, add reference
2e56282 [liangyanbo] rename r2_score() and remove unused column
43bb12b [liangyanbo] add regression metrics
 .../mllib/evaluation/RegressionMetrics.scala  | 89 +++++++++++++++++++
 .../evaluation/RegressionMetricsSuite.scala   | 52 +++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
new file mode 100644
index 0000000000..693117d820
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.mllib.evaluation
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.rdd.RDD
+import org.apache.spark.Logging
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, MultivariateOnlineSummarizer}
+/**
+ * :: Experimental ::
+ * Evaluator for regression.
+ *
+ * @param predictionAndObservations an RDD of (prediction, observation) pairs.
+ */
+@Experimental
+class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging {
+
+  /**
+   * Summary statistics gathered by folding every pair into a MultivariateOnlineSummarizer.
+   * Each pair contributes a two-column vector:
+   *   - column 0: the observation
+   *   - column 1: the error (observation - prediction)
+   * Declared lazy, so the RDD is only aggregated when a metric is first requested.
+   */
+  private lazy val summary: MultivariateStatisticalSummary = {
+    predictionAndObservations.map { case (prediction, observation) =>
+      Vectors.dense(observation, observation - prediction)
+    }.aggregate(new MultivariateOnlineSummarizer())(
+      (summarizer, v) => summarizer.add(v),
+      (left, right) => left.merge(right)
+    )
+  }
+
+  /**
+   * Returns the explained variance regression score.
+   * explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
+   * Reference: [[http://en.wikipedia.org/wiki/Explained_variation]]
+   */
+  def explainedVariance: Double = {
+    val errorVariance = summary.variance(1)
+    val observationVariance = summary.variance(0)
+    1 - errorVariance / observationVariance
+  }
+
+  /**
+   * Returns the mean absolute error, which is a risk function corresponding to the
+   * expected value of the absolute error loss or l1-norm loss.
+   */
+  def meanAbsoluteError: Double = {
+    // The L1 norm of the error column is the sum of absolute errors.
+    summary.normL1(1) / summary.count
+  }
+
+  /**
+   * Returns the mean squared error, which is a risk function corresponding to the
+   * expected value of the squared error loss or quadratic loss.
+   */
+  def meanSquaredError: Double = {
+    // Squaring the RMSE keeps the arithmetic in one place (rootMeanSquaredError).
+    val rmse = rootMeanSquaredError
+    rmse * rmse
+  }
+
+  /**
+   * Returns the root mean squared error, which is defined as the square root of
+   * the mean squared error.
+   */
+  def rootMeanSquaredError: Double = {
+    // The L2 norm of the error column is sqrt(sum of squared errors).
+    summary.normL2(1) / math.sqrt(summary.count)
+  }
+
+  /**
+   * Returns R^2^, the coefficient of determination, computed as
+   * 1 - (residual sum of squares) / (total sum of squares).
+   * Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
+   */
+  def r2: Double = {
+    val residualSumOfSquares = math.pow(summary.normL2(1), 2)
+    // variance(0) is scaled by (count - 1) to turn it back into a sum of squared
+    // deviations of the observations from their mean.
+    val totalSumOfSquares = summary.variance(0) * (summary.count - 1)
+    1 - residualSumOfSquares / totalSumOfSquares
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
new file mode 100644
index 0000000000..5396d7b2b7
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.mllib.evaluation
+import org.scalatest.FunSuite
+import org.apache.spark.mllib.util.LocalSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+/**
+ * Tests for RegressionMetrics on small hand-checkable inputs.
+ * Each input is an RDD of (prediction, observation) pairs split over 2 partitions.
+ */
+class RegressionMetricsSuite extends FunSuite with LocalSparkContext {
+
+  test("regression metrics") {
+    // Errors (observation - prediction) are 0.5, -0.5, 0.0 and -1.0, so
+    // MAE = 2.0 / 4 = 0.5 and MSE = 1.5 / 4 = 0.375.
+    val predictionAndObservations = sc.parallelize(
+      Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)), 2)
+    val metrics = new RegressionMetrics(predictionAndObservations)
+    assert(metrics.explainedVariance ~== 0.95717 absTol 1E-5,
+      "explained variance regression score mismatch")
+    assert(metrics.meanAbsoluteError ~== 0.5 absTol 1E-5, "mean absolute error mismatch")
+    assert(metrics.meanSquaredError ~== 0.375 absTol 1E-5, "mean squared error mismatch")
+    assert(metrics.rootMeanSquaredError ~== 0.61237 absTol 1E-5,
+      "root mean squared error mismatch")
+    assert(metrics.r2 ~== 0.94861 absTol 1E-5, "r2 score mismatch")
+  }
+
+  test("regression metrics with complete fitting") {
+    // Every prediction equals its observation: all error metrics are 0 and both
+    // scores are 1.
+    val predictionAndObservations = sc.parallelize(
+      Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0)), 2)
+    val metrics = new RegressionMetrics(predictionAndObservations)
+    assert(metrics.explainedVariance ~== 1.0 absTol 1E-5,
+      "explained variance regression score mismatch")
+    assert(metrics.meanAbsoluteError ~== 0.0 absTol 1E-5, "mean absolute error mismatch")
+    assert(metrics.meanSquaredError ~== 0.0 absTol 1E-5, "mean squared error mismatch")
+    assert(metrics.rootMeanSquaredError ~== 0.0 absTol 1E-5,
+      "root mean squared error mismatch")
+    assert(metrics.r2 ~== 1.0 absTol 1E-5, "r2 score mismatch")
+  }
+}