diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 3fec1a909dfb959f58bd2484c70f886fb235a456..efc0eb935376bf0ffa5686e19240239679ba55e1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -24,7 +24,6 @@ import org.apache.spark.mllib.recommendation._ import org.apache.spark.rdd.RDD import java.nio.ByteBuffer import java.nio.ByteOrder -import java.nio.DoubleBuffer /** * The Java stubs necessary for the Python mllib bindings. @@ -81,7 +80,6 @@ class PythonMLLibAPI extends Serializable { } val db = bb.asDoubleBuffer() val ans = new Array[Array[Double]](rows.toInt) - var i = 0 for (i <- 0 until rows.toInt) { ans(i) = new Array[Double](cols.toInt) db.get(ans(i)) @@ -236,7 +234,7 @@ class PythonMLLibAPI extends Serializable { * Serialize a Rating object into an array of bytes. * It can be deserialized using RatingDeserializer(). * - * @param rate + * @param rate the Rating object to serialize * @return */ private[spark] def serializeRating(rate: Rating): Array[Byte] = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index f2964ea446ec8a52c71578fbfc91422c8656ff3a..6dff29dfb45ccae2bb497c61eab0294498ce3442 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -17,8 +17,6 @@ package org.apache.spark.mllib.classification -import scala.math.signum - import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.optimization._ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index cfc81c985aa64616fe56addf01671d58228848fe..980be931576dce875ca018c2ff6f53d78ca6d4aa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -19,8 +19,6 @@ package org.apache.spark.mllib.clustering import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ -import org.apache.spark.mllib.util.MLUtils - /** * A clustering model for K-means. Each point belongs to the cluster with the closest center. @@ -39,6 +37,6 @@ class KMeansModel(val clusterCenters: Array[Array[Double]]) extends Serializable * model on the given data. */ def computeCost(data: RDD[Array[Double]]): Double = { - data.map(p => KMeans.pointCost(clusterCenters, p)).sum + data.map(p => KMeans.pointCost(clusterCenters, p)).sum() } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala index fe5cce064bac74f1395ea516050d0891c7377091..df599fde76a868567e7cf2b5856b864eb166c4d6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.util.MLUtils diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala index c125c6797ada3b01ee3ddedb55f31be21e16174e..0c0e67fb7b1238f11805a8c8c3b9b05e52d252a6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.util.MLUtils @@ -76,7 +76,7 @@ class RidgeRegressionWithSGD private ( def createModel(weights: Array[Double], intercept: Double) = { val weightsMat = new DoubleMatrix(weights.length + 1, 1, (Array(intercept) ++ weights):_*) val weightsScaled = weightsMat.div(xColSd) - val interceptScaled = yMean - (weightsMat.transpose().mmul(xColMean.div(xColSd)).get(0)) + val interceptScaled = yMean - weightsMat.transpose().mmul(xColMean.div(xColSd)).get(0) new RidgeRegressionModel(weightsScaled.data, interceptScaled) } @@ -86,7 +86,7 @@ class RidgeRegressionWithSGD private ( initialWeights: Array[Double]) : RidgeRegressionModel = { - val nfeatures: Int = input.first.features.length + val nfeatures: Int = input.first().features.length val nexamples: Long = input.count() // To avoid penalizing the intercept, we center and scale the data. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index bc5045fb05d3bf5df0f4079a5c486320a5668e42..2e03684e6286128546d731cb23773b860df5559f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -25,7 +25,6 @@ import org.jblas.DoubleMatrix import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.regression.LabeledPoint /** * Generate sample data used for Linear Data. This class generates @@ -73,7 +72,7 @@ object LinearDataGenerator { val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(2 * rnd.nextDouble - 1.0)) val y = x.map { xi => - (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) + intercept + eps * rnd.nextGaussian() + new DoubleMatrix(1, xi.length, xi: _*).dot(weightsMat) + intercept + eps * rnd.nextGaussian() } y.zip(x).map(p => LabeledPoint(p._1, p._2)) } @@ -86,7 +85,6 @@ object LinearDataGenerator { * @param nexamples Number of examples that will be contained in the RDD. * @param nfeatures Number of features to generate for each example. * @param eps Epsilon factor by which examples are scaled. - * @param weights Weights associated with the first weights.length features. * @param nparts Number of partitions in the RDD. Default value is 2. * * @return RDD of LabeledPoint containing sample data. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala index d5f3f6b8dbeeaecaf933609d5d6198d2a42ef49f..348aba1dea5b6c7cda82b4d9fab12e5499e32093 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.mllib.recommendation +package org.apache.spark.mllib.util import scala.util.Random @@ -23,7 +23,6 @@ import org.jblas.DoubleMatrix import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.util.MLUtils /** * Generate RDD(s) containing data for Matrix Factorization. @@ -31,9 +30,9 @@ import org.apache.spark.mllib.util.MLUtils * This method samples training entries according to the oversampling factor * 'trainSampFact', which is a multiplicative factor of the number of * degrees of freedom of the matrix: rank*(m+n-rank). -* -* It optionally samples entries for a testing matrix using -* 'testSampFact', the percentage of the number of training entries +* +* It optionally samples entries for a testing matrix using +* 'testSampFact', the percentage of the number of training entries * to use for testing. * * This method takes the following inputs: @@ -73,7 +72,7 @@ object MFDataGenerator{ val A = DoubleMatrix.randn(m, rank) val B = DoubleMatrix.randn(rank, n) - val z = 1 / (scala.math.sqrt(scala.math.sqrt(rank))) + val z = 1 / scala.math.sqrt(scala.math.sqrt(rank)) A.mmuli(z) B.mmuli(z) val fullData = A.mmul(B) @@ -91,7 +90,7 @@ object MFDataGenerator{ .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1))) // optionally add gaussian noise - if (noise) { + if (noise) { trainData.map(x => (x._1, x._2, x._3 + rand.nextGaussian * sigma)) } @@ -107,8 +106,8 @@ object MFDataGenerator{ .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1))) testData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath) } - + sc.stop() - + } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index d91b74c3ac2b3de970e72bad7a7aee30b8868fcf..64c6136a8b89dc22ca965e214a0641be6ff135df 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -97,7 +97,7 @@ object MLUtils { while (col < nfeatures) { xColMean.put(col, xColSumsMap(col)._1 / nexamples) val variance = - (xColSumsMap(col)._2 - (math.pow(xColSumsMap(col)._1, 2) / nexamples)) / (nexamples) + (xColSumsMap(col)._2 - (math.pow(xColSumsMap(col)._1, 2) / nexamples)) / nexamples xColSd.put(col, math.sqrt(variance)) col += 1 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala index 07022093f300ca4edc345a50ccdcc2f2054153df..c96c94f70eef73bd8eb82d83e71c1e100309fb68 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala @@ -56,7 +56,7 @@ object SVMDataGenerator { val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } - val yD = (new DoubleMatrix(1, x.length, x:_*)).dot(trueWeights) + rnd.nextGaussian() * 0.1 + val yD = new DoubleMatrix(1, x.length, x: _*).dot(trueWeights) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, x) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala index 34c67294e9ac9942f747a06db712d52e71f5e42b..02ede711372d3fa93deb647accf6778f00aa8eda 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala @@ -80,9 +80,9 @@ class LogisticRegressionSuite extends FunSuite with BeforeAndAfterAll with Shoul } def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) { - val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) => - (prediction != expected.label) - }.size + val numOffPredictions = predictions.zip(input).count { case (prediction, expected) => + prediction != expected.label + } // At least 83% of the predictions should be on. ((input.length - numOffPredictions).toDouble / input.length) should be > 0.83 } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala index 6a957e3ddca719188677c4bd150d113a8ab98204..3357b86f9b7061cf4ef5be89c6c29a312118a5b1 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.mllib.classification import scala.util.Random -import scala.math.signum import scala.collection.JavaConversions._ import org.scalatest.BeforeAndAfterAll @@ -50,7 +49,7 @@ object SVMSuite { val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(rnd.nextDouble() * 2.0 - 1.0)) val y = x.map { xi => - val yD = (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) + + val yD = new DoubleMatrix(1, xi.length, xi: _*).dot(weightsMat) + intercept + 0.01 * rnd.nextGaussian() if (yD < 0) 0.0 else 1.0 } @@ -72,9 +71,9 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll { } def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) { - val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) => - (prediction != expected.label) - }.size + val numOffPredictions = predictions.zip(input).count { case (prediction, expected) => + prediction != expected.label + } // At least 80% of the predictions should be on. assert(numOffPredictions < input.length / 5) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala index 94245f6027b3094bba9cae70e19d359bce136b78..73657cac893ce9180e54696fe18dba6d0605397a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala @@ -17,15 +17,12 @@ package org.apache.spark.mllib.clustering -import scala.util.Random import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.jblas._ class KMeansSuite extends FunSuite with BeforeAndAfterAll { @transient private var sc: SparkContext = _ diff --git a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala index e683a90f57aba9a0229ee4e1b84a255536c5d71c..4e8dbde65801c05712fe98d1aed98c7b649ba744 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala @@ -24,7 +24,6 @@ import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ import org.jblas._ diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala index db980c7bae64f946e64555c53f04969babed25c1..b2c8df97a82a77f62a41b804d392c5f0767dc54b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.mllib.regression -import scala.collection.JavaConversions._ -import scala.util.Random import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite @@ -41,10 +39,10 @@ class LassoSuite extends FunSuite with BeforeAndAfterAll { } def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) { - val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) => + val numOffPredictions = predictions.zip(input).count { case (prediction, expected) => // A prediction is off if the prediction is more than 0.5 away from expected value. math.abs(prediction - expected.label) > 0.5 - }.size + } // At least 80% of the predictions should be on. assert(numOffPredictions < input.length / 5) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala index ef500c704c8a9949e85fb40885669b500a921f1b..406afbaa3e2c108f71404f075d856ebd4cd4add5 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala @@ -21,7 +21,6 @@ import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ import org.apache.spark.mllib.util.LinearDataGenerator class LinearRegressionSuite extends FunSuite with BeforeAndAfterAll { @@ -37,10 +36,10 @@ class LinearRegressionSuite extends FunSuite with BeforeAndAfterAll { } def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) { - val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) => + val numOffPredictions = predictions.zip(input).count { case (prediction, expected) => // A prediction is off if the prediction is more than 0.5 away from expected value. math.abs(prediction - expected.label) > 0.5 - }.size + } // At least 80% of the predictions should be on. assert(numOffPredictions < input.length / 5) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala index c18092d804fa3e17055238aeb250a99e459e2d59..1d6a10b66e89238dfe4cc9a2b95a41f63ec890ed 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala @@ -17,15 +17,12 @@ package org.apache.spark.mllib.regression -import scala.collection.JavaConversions._ -import scala.util.Random import org.jblas.DoubleMatrix import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ import org.apache.spark.mllib.util.LinearDataGenerator class RidgeRegressionSuite extends FunSuite with BeforeAndAfterAll {