From 8ccca9170f983f74a7482f67206dae070c77b419 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng <ruifengz@foxmail.com> Date: Thu, 19 Jan 2017 03:46:37 -0800 Subject: [PATCH] [SPARK-14272][ML] Add Loglikelihood in GaussianMixtureSummary ## What changes were proposed in this pull request? add loglikelihood in GMM.summary ## How was this patch tested? added tests Author: Zheng RuiFeng <ruifengz@foxmail.com> Author: Ruifeng Zheng <ruifengz@foxmail.com> Closes #12064 from zhengruifeng/gmm_metric. --- .../apache/spark/ml/clustering/GaussianMixture.scala | 7 +++++-- .../scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- .../spark/ml/clustering/GaussianMixtureSuite.scala | 7 +++++++ project/MimaExcludes.scala | 5 ++++- python/pyspark/ml/clustering.py | 10 ++++++++++ 5 files changed, 27 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index a7bb413795..db5fff5af8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -416,7 +416,7 @@ class GaussianMixture @Since("2.0.0") ( val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this) val summary = new GaussianMixtureSummary(model.transform(dataset), - $(predictionCol), $(probabilityCol), $(featuresCol), $(k)) + $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood) model.setSummary(Some(summary)) instr.logSuccess(model) model @@ -674,6 +674,7 @@ private class ExpectationAggregator( * in `predictions`. * @param featuresCol Name for column of features in `predictions`. * @param k Number of clusters. + * @param logLikelihood Total log-likelihood for this model on the given data. */ @Since("2.0.0") @Experimental @@ -682,7 +683,9 @@ class GaussianMixtureSummary private[clustering] ( predictionCol: String, @Since("2.0.0") val probabilityCol: String, featuresCol: String, - k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) { + k: Int, + @Since("2.2.0") val logLikelihood: Double) + extends ClusteringSummary(predictions, predictionCol, featuresCol, k) { /** * Probability of each cluster. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index de66c7ca1d..95f904dac5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -34,7 +34,7 @@ import org.apache.spark.storage.StorageLevel import org.apache.spark.util.random.BernoulliCellSampler /** - * Helper methods to load, save and pre-process data used in ML Lib. + * Helper methods to load, save and pre-process data used in MLLib. */ @Since("0.8.0") object MLUtils extends Logging { diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala index a362aeea39..e54eb2750c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala @@ -207,6 +207,10 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext [,1] [,2] [1,] 0.2961543 0.160783 [2,] 0.1607830 1.008878 + + model$loglik + + [1] -46.89499 */ val weights = Array(0.5333333, 0.4666667) val means = Array(Vectors.dense(10.363673, 9.897081), Vectors.dense(0.11731091, -0.06192351)) @@ -219,6 +223,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext val expected = new GaussianMixtureModel("dummy", weights, gaussians) val actual = new GaussianMixture().setK(2).setSeed(seed).fit(rDataset) modelEquals(expected, actual) + + val llk = actual.summary.logLikelihood + assert(llk ~== -46.89499 absTol 1E-5) } test("upper triangular matrix unpacking") { diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index e0ee00e682..bf628210a1 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -46,7 +46,10 @@ object MimaExcludes { ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted"), // [SPARK-19148][SQL] do not expose the external table concept in Catalog - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable") + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable"), + + // [SPARK-14272][ML] Add logLikelihood in GaussianMixtureSummary + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this") ) // Exclude rules for 2.1.x diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 25f97f5696..c6c1a00331 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -175,6 +175,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte 3 >>> summary.clusterSizes [2, 2, 2] + >>> summary.logLikelihood + 8.14636... >>> weights = model.weights >>> len(weights) 3 @@ -281,6 +283,14 @@ class GaussianMixtureSummary(ClusteringSummary): """ return self._call_java("probability") + @property + @since("2.2.0") + def logLikelihood(self): + """ + Total log-likelihood for this model on the given data. + """ + return self._call_java("logLikelihood") + class KMeansSummary(ClusteringSummary): """ -- GitLab