diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index feb3474cf1a7d246448d947af98b19e1dfbb6a12..5d536b7c245e38f2bdaf40bdb39698859717dfd2 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -12,8 +12,8 @@ avro-1.7.7.jar
 avro-ipc-1.7.7.jar
 avro-mapred-1.7.7-hadoop2.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -147,6 +147,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 3e960358f74bd3e206f0cbb42c888aee7cab78f7..d16f42a97d37052389cc2b31b6b7ee0c6162623a 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -15,8 +15,8 @@ avro-mapred-1.7.7-hadoop2.jar
 base64-2.3.8.jar
 bcprov-jdk15on-1.51.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -154,6 +154,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 3fc14a6fbf4ec4db2c690d1c517af8deec5b8ae7..2e261cb9a543274d0a574bb41b0ffb02f638a810 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -15,8 +15,8 @@ avro-mapred-1.7.7-hadoop2.jar
 base64-2.3.8.jar
 bcprov-jdk15on-1.51.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -154,6 +154,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 909fbb885222675f4d26ccc1fa034d0a2898e024..67f38f4c220de1b5758abad17a71047faac92fa0 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -19,8 +19,8 @@ avro-mapred-1.7.7-hadoop2.jar
 base64-2.3.8.jar
 bcprov-jdk15on-1.51.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -162,6 +162,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index b986a313a0c785b7e28c962e64c61512be76cb33..07583963d913b76a12a5faf3c9d6a179d17710aa 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -19,8 +19,8 @@ avro-mapred-1.7.7-hadoop2.jar
 base64-2.3.8.jar
 bcprov-jdk15on-1.51.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -163,6 +163,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 91eee0e69d63521faa9b0766a855b3812b3a97a6..7694773c816b20183f60b31c55af5e396a31013e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -424,11 +424,6 @@ class LogisticRegression @Since("1.2.0") (
       throw new SparkException(msg)
     }
 
-    if (!state.actuallyConverged) {
-      logWarning("LogisticRegression training finished but the result " +
-        s"is not converged because: ${state.convergedReason.get.reason}")
-    }
-
     /*
        The coefficients are trained in the scaled space; we're converting them back to
        the original space.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index 700a92cc261bee474edbc3eade7479d83e828894..2b9912657f51f6afec83824d12f119f0ce75ceca 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -244,12 +244,6 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
        val msg = s"${optimizer.getClass.getName} failed."
        throw new SparkException(msg)
      }
-
-     if (!state.actuallyConverged) {
-       logWarning("AFTSurvivalRegression training finished but the result " +
-         s"is not converged because: ${state.convergedReason.get.reason}")
-     }
-
      state.x.toArray.clone()
    }
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 0a155e1844f623fcf4afcb5977829f08ce6bbb6b..a0ff7f07aa3ddd1148ba8b5dfee17bf1634c2dd5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -325,11 +325,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
       throw new SparkException(msg)
     }
 
-    if (!state.actuallyConverged) {
-      logWarning("LinearRegression training finished but the result " +
-        s"is not converged because: ${state.convergedReason.get.reason}")
-    }
-
     /*
        The coefficients are trained in the scaled space; we're converting them back to
        the original space.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 9ebba1de0dad4abe009387560420a763d45ce2f1..90d8a558f10d42989461772c9c5b4e452f87015c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -784,7 +784,13 @@ class DistributedLDAModel private[clustering] (
   @Since("1.5.0")
   def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = {
     graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) =>
-      val topIndices = argtopk(topicCounts, k)
+      // TODO: Remove work-around for the breeze bug.
+      // https://github.com/scalanlp/breeze/issues/561
+      val topIndices = if (k == topicCounts.length) {
+        Seq.range(0, k)
+      } else {
+        argtopk(topicCounts, k)
+      }
       val sumCounts = sum(topicCounts)
       val weights = if (sumCounts != 0) {
         topicCounts(topIndices) / sumCounts
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 2436efba32489094a03cb234ca1dc6d11a6394c2..e2c6aca553c1cd1be28814e586cc21b31e22a114 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -508,8 +508,9 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     val weight = rho()
     val N = gammat.rows.toDouble
     val alpha = this.alpha.asBreeze.toDenseVector
-    val logphat: BDM[Double] = sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)) / N
-    val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat.toDenseVector)
+    val logphat: BDV[Double] =
+      sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)).t / N
+    val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat)
 
     val c = N * trigamma(sum(alpha))
     val q = -N * trigamma(alpha)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
index fd09f35277a09eb289a678227cb69f4b01002a3b..e49363c2c64d93d46d53a5e4a5e164b83be6bc4d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
@@ -213,11 +213,6 @@ object LBFGS extends Logging {
     }
     lossHistory += state.value
 
-    if (!state.actuallyConverged) {
-      logWarning("LBFGS training finished but the result " +
-        s"is not converged because: ${state.convergedReason.get.reason}")
-    }
-
     val weights = Vectors.fromBreeze(state.x)
 
     val lossHistoryArray = lossHistory.result()
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
index ac479c08418cee34877c0f61487dc32b3a4806d4..8c0338e2844f089beee48ed5c4e895f1e817e017 100644
--- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
@@ -107,7 +107,11 @@ public class JavaPCASuite extends SharedSparkSession {
       .fit(df);
     List<Row> result = pca.transform(df).select("pca_features", "expected").toJavaRDD().collect();
     for (Row r : result) {
-      Assert.assertEquals(r.get(1), r.get(0));
+      Vector calculatedVector = (Vector) r.get(0);
+      Vector expectedVector = (Vector) r.get(1);
+      for (int i = 0; i < calculatedVector.size(); i++) {
+        Assert.assertEquals(calculatedVector.apply(i), expectedVector.apply(i), 1.0e-8);
+      }
     }
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index eb050158d48fee336bbfa0047fff2ac1db56c109..211e2bc026c74e2a98dbbfced89b068f14b228ff 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -118,8 +118,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(weights.length == 2)
       val bdvTopicDist = topicDistribution.asBreeze
       val top2Indices = argtopk(bdvTopicDist, 2)
-      assert(top2Indices.toArray === indices)
-      assert(bdvTopicDist(top2Indices).toArray === weights)
+      assert(top2Indices.toSet === indices.toSet)
+      assert(bdvTopicDist(top2Indices).toArray.toSet === weights.toSet)
     }
 
     // Check: log probabilities
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala
index a8d82932d3904e28a5c2eaa09c21a6719c87c235..2f90afdcee55ec3fb482212c1b5d5364bf641c02 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala
@@ -18,9 +18,10 @@
 package org.apache.spark.mllib.feature
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.linalg.distributed.RowMatrix
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
 
 class PCASuite extends SparkFunSuite with MLlibTestSparkContext {
 
@@ -42,7 +43,9 @@ class PCASuite extends SparkFunSuite with MLlibTestSparkContext {
     val pca_transform = pca.transform(dataRDD).collect()
     val mat_multiply = mat.multiply(pc).rows.collect()
 
-    assert(pca_transform.toSet === mat_multiply.toSet)
-    assert(pca.explainedVariance === explainedVariance)
+    pca_transform.zip(mat_multiply).foreach { case (calculated, expected) =>
+      assert(calculated ~== expected relTol 1e-8)
+    }
+    assert(pca.explainedVariance ~== explainedVariance relTol 1e-8)
   }
 }
diff --git a/pom.xml b/pom.xml
index 4c8671a57035f1f2830eadbf493ae7a3bf04ba83..d064cb57dd6cf3e9f13cebf8d941af875dae0a03 100644
--- a/pom.xml
+++ b/pom.xml
@@ -657,7 +657,7 @@
       <dependency>
         <groupId>org.scalanlp</groupId>
         <artifactId>breeze_${scala.binary.version}</artifactId>
-        <version>0.11.2</version>
+        <version>0.12</version>
         <exclusions>
           <!-- This is included as a compile-scoped dependency by jtransforms, which is
                a dependency of breeze. -->
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 3c4af90acac85d381f460f34bbf2a15f006a66a9..613bc8cb3e7e9347aa5d6fad5de8acd8a0b5f573 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -1299,7 +1299,7 @@ class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
     >>> [x.coefficients for x in model.models]
     [DenseVector([3.3925, 1.8785]), DenseVector([-4.3016, -6.3163]), DenseVector([-4.5855, 6.1785])]
     >>> [x.intercept for x in model.models]
-    [-3.6474708290602034, 2.5507881951814495, -1.1016513228162115]
+    [-3.64747..., 2.55078..., -1.10165...]
     >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 0.0))]).toDF()
     >>> model.transform(test0).head().prediction
     1.0
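
Note on the LDAModel hunk: it guards against scalanlp/breeze#561, where argtopk can misbehave in breeze 0.12 when k equals the vector length. A minimal standalone sketch of the same guard, outside the patch; the object and method names here are illustrative, not part of Spark:

import breeze.linalg.{argtopk, DenseVector}

object ArgtopkWorkaroundSketch {
  // When k covers the whole vector, every index is "top", so we can skip
  // argtopk entirely and avoid the breeze 0.12 edge case from issue #561.
  def safeTopIndices(v: DenseVector[Double], k: Int): Seq[Int] =
    if (k == v.length) Seq.range(0, k) else argtopk(v, k)

  def main(args: Array[String]): Unit = {
    val v = DenseVector(0.1, 0.7, 0.2)
    println(safeTopIndices(v, 2))        // indices of the two largest entries, e.g. 1 and 2
    println(safeTopIndices(v, v.length)) // all indices 0, 1, 2 without calling argtopk
  }
}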
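Note on the OnlineLDAOptimizer hunk: in breeze 0.12, summing a matrix over broadcast columns yields a transposed row vector rather than a one-row matrix, hence the BDM -> BDV type change and the trailing .t in the patch. A small sketch of that behavior, assuming breeze 0.12 on the classpath:

import breeze.linalg._

object ColumnSumSketch {
  def main(args: Array[String]): Unit = {
    val m = DenseMatrix((1.0, 2.0), (3.0, 4.0))
    // sum(m(::, *)) sums each column; breeze 0.12 returns Transpose[DenseVector],
    // and .t recovers a plain DenseVector, here the column sums 4.0 and 6.0.
    val colSums: DenseVector[Double] = sum(m(::, *)).t
    println(colSums)
  }
}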