diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 1a629994cc4ed8e1826d80fc06e7421f1a220b09..1583ce4a58892fc6104b68fc28902b875fd6680b 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -4,7 +4,7 @@ title: Machine Learning Library (MLlib) --- MLlib is a Spark implementation of some common machine learning (ML) -functionality, as well associated unit tests and data generators. MLlib +functionality, as well as associated tests and data generators. MLlib currently supports four common types of machine learning problem settings, namely, binary classification, regression, clustering and collaborative filtering, as well as an underlying gradient descent optimization primitive. @@ -44,22 +44,20 @@ import org.apache.spark.mllib.regression.LabeledPoint // Load and parse the data file val data = sc.textFile("mllib/data/sample_svm_data.txt") -val parsedData = data.map(line => { +val parsedData = data.map { line => val parts = line.split(' ') LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray) -}) +} // Run training algorithm val numIterations = 20 -val model = SVMWithSGD.train( - parsedData, - numIterations) +val model = SVMWithSGD.train(parsedData, numIterations) // Evaluate model on training examples and compute training error -val labelAndPreds = parsedData.map(r => { - val prediction = model.predict(r.features) - (r.label, prediction) -}) +val labelAndPreds = parsedData.map { point => + val prediction = model.predict(point.features) + (point.label, prediction) +} val trainErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / parsedData.count println("trainError = " + trainErr) {% endhighlight %} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 3511e24bce24821930060752370ea0c191521b49..3b8f8550d05275f7cba412fde2a6b08f9a2f74bc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -29,7 +29,7 @@ import org.apache.spark.mllib.util.DataValidators import org.jblas.DoubleMatrix /** - * Model built using SVM. + * Model for Support Vector Machines (SVMs). * * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. @@ -48,8 +48,8 @@ class SVMModel( } /** - * Train an SVM using Stochastic Gradient Descent. - * NOTE: Labels used in SVM should be {0, 1} + * Train a Support Vector Machine (SVM) using Stochastic Gradient Descent. + * NOTE: Labels used in SVM should be {0, 1}. */ class SVMWithSGD private ( var stepSize: Double, @@ -79,7 +79,7 @@ class SVMWithSGD private ( } /** - * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1} + * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}. */ object SVMWithSGD { @@ -88,14 +88,15 @@ object SVMWithSGD { * of iterations of gradient descent using the specified step size. Each iteration uses * `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in * gradient descent are initialized using the initial weights provided. - * NOTE: Labels used in SVM should be {0, 1} + * + * NOTE: Labels used in SVM should be {0, 1}. * * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @param stepSize Step size to be used for each iteration of gradient descent. * @param regParam Regularization parameter. * @param miniBatchFraction Fraction of data to be used per iteration. - * @param initialWeights Initial set of weights to be used. Array should be equal in size to + * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. 
*/ def train( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala index ae95ea24fcd7c84b5293d27a743e70fab92fd359..597d55e0bbdbca6222a247633a77f9f6a0e71d7e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala @@ -43,7 +43,7 @@ class LinearRegressionModel( } /** - * Train a regression model with no regularization using Stochastic Gradient Descent. + * Train a linear regression model with no regularization using Stochastic Gradient Descent. */ class LinearRegressionWithSGD private ( var stepSize: Double, @@ -83,7 +83,7 @@ object LinearRegressionWithSGD { * @param numIterations Number of iterations of gradient descent to run. * @param stepSize Step size to be used for each iteration of gradient descent. * @param miniBatchFraction Fraction of data to be used per iteration. - * @param initialWeights Initial set of weights to be used. Array should be equal in size to + * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. */ def train(