diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index a7bb41379538df2ad70d75e6c6a6239687071934..db5fff5af86ef0ea4fc4d5cbf445b686aa331250 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -416,7 +416,7 @@ class GaussianMixture @Since("2.0.0") (
 
     val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this)
     val summary = new GaussianMixtureSummary(model.transform(dataset),
-      $(predictionCol), $(probabilityCol), $(featuresCol), $(k))
+      $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
     model.setSummary(Some(summary))
     instr.logSuccess(model)
     model
@@ -674,6 +674,7 @@ private class ExpectationAggregator(
  *                    in `predictions`.
  * @param featuresCol  Name for column of features in `predictions`.
  * @param k  Number of clusters.
+ * @param logLikelihood  Total log-likelihood for this model on the given data.
  */
 @Since("2.0.0")
 @Experimental
@@ -682,7 +683,9 @@ class GaussianMixtureSummary private[clustering] (
     predictionCol: String,
     @Since("2.0.0") val probabilityCol: String,
     featuresCol: String,
-    k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
+    k: Int,
+    @Since("2.2.0") val logLikelihood: Double)
+  extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
 
   /**
    * Probability of each cluster.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index de66c7ca1d13fc5ba4d6a23d9d992d51360494a7..95f904dac552cbf9c4e7eba3ee806103e1c0a789 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -34,7 +34,7 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.random.BernoulliCellSampler
 
 /**
- * Helper methods to load, save and pre-process data used in ML Lib.
+ * Helper methods to load, save and pre-process data used in MLLib.
  */
 @Since("0.8.0")
 object MLUtils extends Logging {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index a362aeea3962b4aa244fbfdba427b9e92d05f582..e54eb2750c3893527c84b3028474f2fcba5e7c20 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -207,6 +207,10 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
                [,1]     [,2]
       [1,] 0.2961543 0.160783
       [2,] 0.1607830 1.008878
+
+      model$loglik
+
+      [1] -46.89499
      */
     val weights = Array(0.5333333, 0.4666667)
     val means = Array(Vectors.dense(10.363673, 9.897081), Vectors.dense(0.11731091, -0.06192351))
@@ -219,6 +223,9 @@
     val expected = new GaussianMixtureModel("dummy", weights, gaussians)
     val actual = new GaussianMixture().setK(2).setSeed(seed).fit(rDataset)
     modelEquals(expected, actual)
+
+    val llk = actual.summary.logLikelihood
+    assert(llk ~== -46.89499 absTol 1E-5)
   }
 
   test("upper triangular matrix unpacking") {
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index e0ee00e6826abaa164ab490d4bccef5b13583db2..bf628210a16e648b3d7b392b61c04ecb379e3ba0 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -46,7 +46,10 @@ object MimaExcludes {
       ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted"),
 
       // [SPARK-19148][SQL] do not expose the external table concept in Catalog
-      ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable")
+      ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable"),
+
+      // [SPARK-14272][ML] Add logLikelihood in GaussianMixtureSummary
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
     )
 
     // Exclude rules for 2.1.x
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 25f97f5696a14501e4ccc4c72a1189889bc3129f..c6c1a0033190e8ed943c64ea13ac5b35f61ffebb 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -175,6 +175,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     3
     >>> summary.clusterSizes
     [2, 2, 2]
+    >>> summary.logLikelihood
+    8.14636...
     >>> weights = model.weights
     >>> len(weights)
     3
@@ -281,6 +283,14 @@ class GaussianMixtureSummary(ClusteringSummary):
         """
         return self._call_java("probability")
 
+    @property
+    @since("2.2.0")
+    def logLikelihood(self):
+        """
+        Total log-likelihood for this model on the given data.
+        """
+        return self._call_java("logLikelihood")
+
 
 class KMeansSummary(ClusteringSummary):
     """