From 302a18686998b8b96546526bfccec9cf5b667386 Mon Sep 17 00:00:00 2001 From: Yanbo Liang <ybliang8@gmail.com> Date: Tue, 26 Apr 2016 11:55:21 -0700 Subject: [PATCH] [SPARK-11559][MLLIB] Make `runs` no effect in mllib.KMeans ## What changes were proposed in this pull request? We deprecated ```runs``` of mllib.KMeans in Spark 1.6 (SPARK-11358). In 2.0, we will make it have no effect (with warning messages). We did not remove ```setRuns/getRuns``` for better binary compatibility. This PR changes `runs` where it appears in the public API. Usage inside ```KMeans.runAlgorithm()``` will be resolved at #10806. ## How was this patch tested? Existing unit tests. cc jkbradley Author: Yanbo Liang <ybliang8@gmail.com> Closes #12608 from yanboliang/spark-11559. --- .../mllib/api/python/PythonMLLibAPI.scala | 1 - .../spark/mllib/clustering/KMeans.scala | 42 +++++-------------- python/pyspark/ml/clustering.py | 5 +-- python/pyspark/mllib/clustering.py | 9 ++-- 4 files changed, 16 insertions(+), 41 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 32dc16de08..8daee7b3aa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -357,7 +357,6 @@ private[python] class PythonMLLibAPI extends Serializable { val kMeansAlg = new KMeans() .setK(k) .setMaxIterations(maxIterations) - .internalSetRuns(runs) .setInitializationMode(initializationMode) .setInitializationSteps(initializationSteps) .setEpsilon(epsilon) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index ff77090990..60f13d27d0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ 
-32,9 +32,8 @@ import org.apache.spark.util.Utils import org.apache.spark.util.random.XORShiftRandom /** - * K-means clustering with support for multiple parallel runs and a k-means++ like initialization - * mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested, - * they are executed together with joint passes over the data for efficiency. + * K-means clustering with a k-means++ like initialization mode + * (the k-means|| algorithm by Bahmani et al). * * This is an iterative algorithm that will make multiple passes over the data, so any RDDs given * to it should be cached by the user. @@ -109,35 +108,20 @@ class KMeans private ( } /** - * :: Experimental :: - * Number of runs of the algorithm to execute in parallel. + * This function has no effect since Spark 2.0.0. */ @Since("1.4.0") - @deprecated("Support for runs is deprecated. This param will have no effect in 2.0.0.", "1.6.0") - def getRuns: Int = runs + def getRuns: Int = { + logWarning("Getting number of runs has no effect since Spark 2.0.0.") + runs + } /** - * :: Experimental :: - * Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm - * this many times with random starting conditions (configured by the initialization mode), then - * return the best clustering found over any run. Default: 1. + * This function has no effect since Spark 2.0.0. */ @Since("0.8.0") - @deprecated("Support for runs is deprecated. This param will have no effect in 2.0.0.", "1.6.0") def setRuns(runs: Int): this.type = { - internalSetRuns(runs) - } - - // Internal version of setRuns for Python API, this should be removed at the same time as setRuns - // this is done to avoid deprecation warnings in our build. 
- private[mllib] def internalSetRuns(runs: Int): this.type = { - if (runs <= 0) { - throw new IllegalArgumentException("Number of runs must be positive") - } - if (runs != 1) { - logWarning("Setting number of runs is deprecated and will have no effect in 2.0.0") - } - this.runs = runs + logWarning("Setting number of runs has no effect since Spark 2.0.0.") this } @@ -511,8 +495,7 @@ object KMeans { * @param data Training points as an `RDD` of `Vector` types. * @param k Number of clusters to create. * @param maxIterations Maximum number of iterations allowed. - * @param runs Number of runs to execute in parallel. The best model according to the cost - * function will be returned. (default: 1) + * @param runs This param has no effect since Spark 2.0.0. * @param initializationMode The initialization algorithm. This can either be "random" or * "k-means||". (default: "k-means||") * @param seed Random seed for cluster initialization. Default is to generate seed based @@ -528,7 +511,6 @@ object KMeans { seed: Long): KMeansModel = { new KMeans().setK(k) .setMaxIterations(maxIterations) - .internalSetRuns(runs) .setInitializationMode(initializationMode) .setSeed(seed) .run(data) @@ -540,8 +522,7 @@ object KMeans { * @param data Training points as an `RDD` of `Vector` types. * @param k Number of clusters to create. * @param maxIterations Maximum number of iterations allowed. - * @param runs Number of runs to execute in parallel. The best model according to the cost - * function will be returned. (default: 1) + * @param runs This param has no effect since Spark 2.0.0. * @param initializationMode The initialization algorithm. This can either be "random" or * "k-means||". 
(default: "k-means||") */ @@ -554,7 +535,6 @@ object KMeans { initializationMode: String): KMeansModel = { new KMeans().setK(k) .setMaxIterations(maxIterations) - .internalSetRuns(runs) .setInitializationMode(initializationMode) .run(data) } diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 4ce8012754..9740ec45af 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -194,9 +194,8 @@ class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable): class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed, JavaMLWritable, JavaMLReadable): """ - K-means clustering with support for multiple parallel runs and a k-means++ like initialization - mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested, - they are executed together with joint passes over the data for efficiency. + K-means clustering with a k-means++ like initialization mode + (the k-means|| algorithm by Bahmani et al). >>> from pyspark.mllib.linalg import Vectors >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index 23d118bd40..95f7278dc6 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -179,7 +179,7 @@ class KMeansModel(Saveable, Loader): >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2) >>> model = KMeans.train( - ... sc.parallelize(data), 2, maxIterations=10, runs=30, initializationMode="random", + ... sc.parallelize(data), 2, maxIterations=10, initializationMode="random", ... seed=50, initializationSteps=5, epsilon=1e-4) >>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0])) True @@ -323,9 +323,7 @@ class KMeans(object): Maximum number of iterations allowed. (default: 100) :param runs: - Number of runs to execute in parallel. 
The best model according - to the cost function will be returned (deprecated in 1.6.0). - (default: 1) + This param has no effect since Spark 2.0.0. :param initializationMode: The initialization algorithm. This can be either "random" or "k-means||". @@ -350,8 +348,7 @@ class KMeans(object): (default: None) """ if runs != 1: - warnings.warn( - "Support for runs is deprecated in 1.6.0. This param will have no effect in 2.0.0.") + warnings.warn("The param `runs` has no effect since Spark 2.0.0.") clusterInitialModel = [] if initialModel is not None: if not isinstance(initialModel, KMeansModel): -- GitLab