diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index aac8f7560a4f83b53ac9057b25462d9ec2da11e9..63665c49bc972b6948cf5d8f22ed4f1c4d21cd99 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -170,42 +170,7 @@ error. Refer to the [`SVMWithSGD` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.SVMWithSGD) and [`SVMModel` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.SVMModel) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics -import org.apache.spark.mllib.util.MLUtils - -// Load training data in LIBSVM format. -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") - -// Split data into training (60%) and test (40%). -val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) -val training = splits(0).cache() -val test = splits(1) - -// Run training algorithm to build the model -val numIterations = 100 -val model = SVMWithSGD.train(training, numIterations) - -// Clear the default threshold. -model.clearThreshold() - -// Compute raw scores on the test set. -val scoreAndLabels = test.map { point => - val score = model.predict(point.features) - (score, point.label) -} - -// Get evaluation metrics. -val metrics = new BinaryClassificationMetrics(scoreAndLabels) -val auROC = metrics.areaUnderROC() - -println("Area under ROC = " + auROC) - -// Save and load model -model.save(sc, "myModelPath") -val sameModel = SVMModel.load(sc, "myModelPath") -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala %} The `SVMWithSGD.train()` method by default performs L2 regularization with the regularization parameter set to 1.0. If we want to configure this algorithm, we @@ -216,6 +181,7 @@ variant of SVMs with regularization parameter set to 0.1, and runs the training algorithm for 200 iterations. {% highlight scala %} + import org.apache.spark.mllib.optimization.L1Updater val svmAlg = new SVMWithSGD() @@ -237,61 +203,7 @@ that is equivalent to the provided example in Scala is given below: Refer to the [`SVMWithSGD` Java docs](api/java/org/apache/spark/mllib/classification/SVMWithSGD.html) and [`SVMModel` Java docs](api/java/org/apache/spark/mllib/classification/SVMModel.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.classification.*; -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; - -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -public class SVMClassifier { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("SVM Classifier Example"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_libsvm_data.txt"; - JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. - JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L); - training.cache(); - JavaRDD<LabeledPoint> test = data.subtract(training); - - // Run training algorithm to build the model. 
- int numIterations = 100; - final SVMModel model = SVMWithSGD.train(training.rdd(), numIterations); - - // Clear the default threshold. - model.clearThreshold(); - - // Compute raw scores on the test set. - JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map( - new Function<LabeledPoint, Tuple2<Object, Object>>() { - public Tuple2<Object, Object> call(LabeledPoint p) { - Double score = model.predict(p.features()); - return new Tuple2<Object, Object>(score, p.label()); - } - } - ); - - // Get evaluation metrics. - BinaryClassificationMetrics metrics = - new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels)); - double auROC = metrics.areaUnderROC(); - - System.out.println("Area under ROC = " + auROC); - - // Save and load model - model.save(sc, "myModelPath"); - SVMModel sameModel = SVMModel.load(sc, "myModelPath"); - } -} -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java %} The `SVMWithSGD.train()` method by default performs L2 regularization with the regularization parameter set to 1.0. If we want to configure this algorithm, we @@ -325,30 +237,7 @@ and make predictions with the resulting model to compute the training error. Refer to the [`SVMWithSGD` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.SVMWithSGD) and [`SVMModel` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.SVMModel) for more details on the API. -{% highlight python %} -from pyspark.mllib.classification import SVMWithSGD, SVMModel -from pyspark.mllib.regression import LabeledPoint - -# Load and parse the data -def parsePoint(line): - values = [float(x) for x in line.split(' ')] - return LabeledPoint(values[0], values[1:]) - -data = sc.textFile("data/mllib/sample_svm_data.txt") -parsedData = data.map(parsePoint) - -# Build the model -model = SVMWithSGD.train(parsedData, iterations=100) - -# Evaluating the model on training data -labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) -trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) -print("Training Error = " + str(trainErr)) - -# Save and load model -model.save(sc, "myModelPath") -sameModel = SVMModel.load(sc, "myModelPath") -{% endhighlight %} +{% include_example python/mllib/svm_with_sgd_example.py %} </div> </div> @@ -406,42 +295,7 @@ Then the model is evaluated against the test dataset and saved to disk. Refer to the [`LogisticRegressionWithLBFGS` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) and [`LogisticRegressionModel` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionModel) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel} -import org.apache.spark.mllib.evaluation.MulticlassMetrics -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.util.MLUtils - -// Load training data in LIBSVM format. -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") - -// Split data into training (60%) and test (40%). 
-val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) -val training = splits(0).cache() -val test = splits(1) - -// Run training algorithm to build the model -val model = new LogisticRegressionWithLBFGS() - .setNumClasses(10) - .run(training) - -// Compute raw scores on the test set. -val predictionAndLabels = test.map { case LabeledPoint(label, features) => - val prediction = model.predict(features) - (prediction, label) -} - -// Get evaluation metrics. -val metrics = new MulticlassMetrics(predictionAndLabels) -val precision = metrics.precision -println("Precision = " + precision) - -// Save and load model -model.save(sc, "myModelPath") -val sameModel = LogisticRegressionModel.load(sc, "myModelPath") -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/LogisticRegressionWithLBFGSExample.scala %} </div> @@ -454,57 +308,7 @@ Then the model is evaluated against the test dataset and saved to disk. Refer to the [`LogisticRegressionWithLBFGS` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html) and [`LogisticRegressionModel` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionModel.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; -import org.apache.spark.mllib.evaluation.MulticlassMetrics; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -public class MultinomialLogisticRegressionExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("SVM Classifier Example"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_libsvm_data.txt"; - JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. - JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); - JavaRDD<LabeledPoint> training = splits[0].cache(); - JavaRDD<LabeledPoint> test = splits[1]; - - // Run training algorithm to build the model. - final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(10) - .run(training.rdd()); - - // Compute raw scores on the test set. - JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map( - new Function<LabeledPoint, Tuple2<Object, Object>>() { - public Tuple2<Object, Object> call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2<Object, Object>(prediction, p.label()); - } - } - ); - - // Get evaluation metrics. - MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); - double precision = metrics.precision(); - System.out.println("Precision = " + precision); - - // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); - } -} -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java %} </div> <div data-lang="python" markdown="1"> @@ -516,30 +320,7 @@ will in the future. 
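+
+For reference, the `parsePoint` helper in the example below expects each line of
+`data/mllib/sample_svm_data.txt` to be a label followed by space-separated feature values;
+a line of that shape looks like the following (the numbers here are made up):
+
+{% highlight text %}
+1 0.0 2.52 1.41 0.0 0.71
+{% endhighlight %}
+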
Refer to the [`LogisticRegressionWithLBFGS` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.LogisticRegressionWithLBFGS) and [`LogisticRegressionModel` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.LogisticRegressionModel) for more details on the API. -{% highlight python %} -from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel -from pyspark.mllib.regression import LabeledPoint - -# Load and parse the data -def parsePoint(line): - values = [float(x) for x in line.split(' ')] - return LabeledPoint(values[0], values[1:]) - -data = sc.textFile("data/mllib/sample_svm_data.txt") -parsedData = data.map(parsePoint) - -# Build the model -model = LogisticRegressionWithLBFGS.train(parsedData) - -# Evaluating the model on training data -labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) -trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) -print("Training Error = " + str(trainErr)) - -# Save and load model -model.save(sc, "myModelPath") -sameModel = LogisticRegressionModel.load(sc, "myModelPath") -{% endhighlight %} +{% include_example python/mllib/logistic_regression_with_lbfgs_example.py %} </div> </div> @@ -575,36 +356,7 @@ values. We compute the mean squared error at the end to evaluate Refer to the [`LinearRegressionWithSGD` Scala docs](api/scala/index.html#org.apache.spark.mllib.regression.LinearRegressionWithSGD) and [`LinearRegressionModel` Scala docs](api/scala/index.html#org.apache.spark.mllib.regression.LinearRegressionModel) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.regression.LinearRegressionModel -import org.apache.spark.mllib.regression.LinearRegressionWithSGD -import org.apache.spark.mllib.linalg.Vectors - -// Load and parse the data -val data = sc.textFile("data/mllib/ridge-data/lpsa.data") -val parsedData = data.map { line => - val parts = line.split(',') - LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) -}.cache() - -// Building the model -val numIterations = 100 -val stepSize = 0.00000001 -val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) - -// Evaluate model on training examples and compute training error -val valuesAndPreds = parsedData.map { point => - val prediction = model.predict(point.features) - (point.label, prediction) -} -val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean() -println("training Mean Squared Error = " + MSE) - -// Save and load model -model.save(sc, "myModelPath") -val sameModel = LinearRegressionModel.load(sc, "myModelPath") -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala %} [`RidgeRegressionWithSGD`](api/scala/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) and [`LassoWithSGD`](api/scala/index.html#org.apache.spark.mllib.regression.LassoWithSGD) can be used in a similar fashion as `LinearRegressionWithSGD`. @@ -620,70 +372,7 @@ the Scala snippet provided, is presented below: Refer to the [`LinearRegressionWithSGD` Java docs](api/java/org/apache/spark/mllib/regression/LinearRegressionWithSGD.html) and [`LinearRegressionModel` Java docs](api/java/org/apache/spark/mllib/regression/LinearRegressionModel.html) for details on the API. 
-{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.regression.LinearRegressionModel; -import org.apache.spark.mllib.regression.LinearRegressionWithSGD; -import org.apache.spark.SparkConf; - -public class LinearRegression { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); - JavaSparkContext sc = new JavaSparkContext(conf); - - // Load and parse the data - String path = "data/mllib/ridge-data/lpsa.data"; - JavaRDD<String> data = sc.textFile(path); - JavaRDD<LabeledPoint> parsedData = data.map( - new Function<String, LabeledPoint>() { - public LabeledPoint call(String line) { - String[] parts = line.split(","); - String[] features = parts[1].split(" "); - double[] v = new double[features.length]; - for (int i = 0; i < features.length - 1; i++) - v[i] = Double.parseDouble(features[i]); - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); - } - } - ); - parsedData.cache(); - - // Building the model - int numIterations = 100; - double stepSize = 0.00000001; - final LinearRegressionModel model = - LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize); - - // Evaluate model on training examples and compute training error - JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map( - new Function<LabeledPoint, Tuple2<Double, Double>>() { - public Tuple2<Double, Double> call(LabeledPoint point) { - double prediction = model.predict(point.features()); - return new Tuple2<Double, Double>(prediction, point.label()); - } - } - ); - double MSE = new JavaDoubleRDD(valuesAndPreds.map( - new Function<Tuple2<Double, Double>, Object>() { - public Object call(Tuple2<Double, Double> pair) { - return Math.pow(pair._1() - pair._2(), 2.0); - } - } - ).rdd()).mean(); - System.out.println("training Mean Squared Error = " + MSE); - - // Save and load model - model.save(sc.sc(), "myModelPath"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); - } -} -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaLinearRegressionWithSGDExample.java %} </div> <div data-lang="python" markdown="1"> @@ -696,29 +385,7 @@ Note that the Python API does not yet support model save/load but will in the fu Refer to the [`LinearRegressionWithSGD` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.regression.LinearRegressionWithSGD) and [`LinearRegressionModel` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.regression.LinearRegressionModel) for more details on the API. 
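+
+The `lpsa.data` file used below stores each record as a label, a comma, and then
+space-separated feature values, which is why `parsePoint` replaces the comma with a space
+before splitting; a line of that shape looks like the following (the numbers here are made up):
+
+{% highlight text %}
+-0.43,2.77 1.36 0.15 -1.02 2.59 -0.86 1.48 0.34
+{% endhighlight %}
+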
-{% highlight python %}
-from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
-
-# Load and parse the data
-def parsePoint(line):
-    values = [float(x) for x in line.replace(',', ' ').split(' ')]
-    return LabeledPoint(values[0], values[1:])
-
-data = sc.textFile("data/mllib/ridge-data/lpsa.data")
-parsedData = data.map(parsePoint)
-
-# Build the model
-model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)
-
-# Evaluate the model on training data
-valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
-MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
-print("Mean Squared Error = " + str(MSE))
-
-# Save and load model
-model.save(sc, "myModelPath")
-sameModel = LinearRegressionModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example python/mllib/linear_regression_with_sgd_example.py %}
 </div>
 
 </div>
 
@@ -748,108 +415,50 @@ online to the first stream, and make predictions on the second stream.
 
 First, we import the necessary classes for parsing our input data and creating the model.
 
-{% highlight scala %}
-
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
-
-{% endhighlight %}
-
 Then we make input streams for training and testing data. We assume a StreamingContext `ssc`
 has already been created, see [Spark Streaming Programming Guide](streaming-programming-guide.html#initializing)
 for more info. For this example, we use labeled points in training and testing streams,
 but in practice you will likely want to use unlabeled vectors for test data.
 
-{% highlight scala %}
-
-val trainingData = ssc.textFileStream("/training/data/dir").map(LabeledPoint.parse).cache()
-val testData = ssc.textFileStream("/testing/data/dir").map(LabeledPoint.parse)
-
-{% endhighlight %}
+We create our model by initializing the weights to zero, register the streams for training and
+testing, and then start the job. Printing predictions alongside true labels lets us easily see
+the result.
 
-We create our model by initializing the weights to 0
-
-{% highlight scala %}
-
-val numFeatures = 3
-val model = new StreamingLinearRegressionWithSGD()
-  .setInitialWeights(Vectors.zeros(numFeatures))
-
-{% endhighlight %}
-
-Now we register the streams for training and testing and start the job.
-Printing predictions alongside true labels lets us easily see the result.
-
-{% highlight scala %}
-
-model.trainOn(trainingData)
-model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
-
-ssc.start()
-ssc.awaitTermination()
-
-{% endhighlight %}
-
-We can now save text files with data to the training or testing folders.
+Finally, we can save text files with data to the training or testing folders.
 Each line should be a data point formatted as `(y,[x1,x2,x3])` where `y` is the label
-and `x1,x2,x3` are the features. Anytime a text file is placed in `/training/data/dir`
-the model will update. Anytime a text file is placed in `/testing/data/dir` you will see predictions.
+and `x1,x2,x3` are the features. Anytime a text file is placed in the directory given by `args(0)`,
+the model will update; anytime a text file is placed in the directory given by `args(1)`, you will see predictions.
 As you feed more data to the training directory, the predictions will get better!
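+
+For instance, a text file dropped into the training directory might contain lines like the
+following (the values here are made up):
+
+{% highlight text %}
+(1.0,[0.5,0.2,0.8])
+(0.0,[0.1,0.3,0.4])
+{% endhighlight %}
+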
+Here is a complete example:
+{% include_example scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala %}
+
 </div>
 
 <div data-lang="python" markdown="1">
 
 First, we import the necessary classes for parsing our input data and creating the model.
 
-{% highlight python %}
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
-{% endhighlight %}
-
 Then we make input streams for training and testing data. We assume a StreamingContext `ssc`
 has already been created, see [Spark Streaming Programming Guide](streaming-programming-guide.html#initializing)
 for more info. For this example, we use labeled points in training and testing streams,
 but in practice you will likely want to use unlabeled vectors for test data.
 
-{% highlight python %}
-def parse(lp):
-    label = float(lp[lp.find('(') + 1: lp.find(',')])
-    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
-    return LabeledPoint(label, vec)
-
-trainingData = ssc.textFileStream("/training/data/dir").map(parse).cache()
-testData = ssc.textFileStream("/testing/data/dir").map(parse)
-{% endhighlight %}
-
-We create our model by initializing the weights to 0
-
-{% highlight python %}
-numFeatures = 3
-model = StreamingLinearRegressionWithSGD()
-model.setInitialWeights([0.0, 0.0, 0.0])
-{% endhighlight %}
+We create our model by initializing the weights to zero.
 
 Now we register the streams for training and testing and start the job.
 
-{% highlight python %}
-model.trainOn(trainingData)
-print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
-
-ssc.start()
-ssc.awaitTermination()
-{% endhighlight %}
-
 We can now save text files with data to the training or testing folders.
 Each line should be a data point formatted as `(y,[x1,x2,x3])` where `y` is the label
-and `x1,x2,x3` are the features. Anytime a text file is placed in `/training/data/dir`
-the model will update. Anytime a text file is placed in `/testing/data/dir` you will see predictions.
+and `x1,x2,x3` are the features. Anytime a text file is placed in the directory given by `sys.argv[1]`,
+the model will update; anytime a text file is placed in the directory given by `sys.argv[2]`, you will see predictions.
 As you feed more data to the training directory, the predictions will get better!
 
+Here is a complete example:
+{% include_example python/mllib/streaming_linear_regression_example.py %}
+
 </div>
 
 </div>
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionWithSGDExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionWithSGDExample.java
new file mode 100644
index 0000000000000000000000000000000000000000..3e50118c0d9ec71a6c93e679500b2636fa168784
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionWithSGDExample.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+
+// $example on$
+import scala.Tuple2;
+
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.regression.LinearRegressionModel;
+import org.apache.spark.mllib.regression.LinearRegressionWithSGD;
+// $example off$
+
+/**
+ * Example for LinearRegressionWithSGD.
+ */
+public class JavaLinearRegressionWithSGDExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithSGDExample");
+    JavaSparkContext sc = new JavaSparkContext(conf);
+
+    // $example on$
+    // Load and parse the data
+    String path = "data/mllib/ridge-data/lpsa.data";
+    JavaRDD<String> data = sc.textFile(path);
+    JavaRDD<LabeledPoint> parsedData = data.map(
+      new Function<String, LabeledPoint>() {
+        public LabeledPoint call(String line) {
+          String[] parts = line.split(",");
+          String[] features = parts[1].split(" ");
+          double[] v = new double[features.length];
+          for (int i = 0; i < features.length; i++) {
+            v[i] = Double.parseDouble(features[i]);
+          }
+          return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
+        }
+      }
+    );
+    parsedData.cache();
+
+    // Building the model
+    int numIterations = 100;
+    double stepSize = 0.00000001;
+    final LinearRegressionModel model =
+      LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize);
+
+    // Evaluate model on training examples and compute training error
+    JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(
+      new Function<LabeledPoint, Tuple2<Double, Double>>() {
+        public Tuple2<Double, Double> call(LabeledPoint point) {
+          double prediction = model.predict(point.features());
+          return new Tuple2<Double, Double>(prediction, point.label());
+        }
+      }
+    );
+    double MSE = new JavaDoubleRDD(valuesAndPreds.map(
+      new Function<Tuple2<Double, Double>, Object>() {
+        public Object call(Tuple2<Double, Double> pair) {
+          return Math.pow(pair._1() - pair._2(), 2.0);
+        }
+      }
+    ).rdd()).mean();
+    System.out.println("training Mean Squared Error = " + MSE);
+
+    // Save and load model
+    model.save(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
+    LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(),
+      "target/tmp/javaLinearRegressionWithSGDModel");
+    // $example off$
+
+    sc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
new file mode 100644
index 0000000000000000000000000000000000000000..9d8e4a90dbc99497921be3142c6fdbac911a683f
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under
one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; + +// $example on$ +import scala.Tuple2; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.classification.LogisticRegressionModel; +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; +import org.apache.spark.mllib.evaluation.MulticlassMetrics; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +// $example off$ + +/** + * Example for LogisticRegressionWithLBFGS. + */ +public class JavaLogisticRegressionWithLBFGSExample { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample"); + SparkContext sc = new SparkContext(conf); + // $example on$ + String path = "data/mllib/sample_libsvm_data.txt"; + JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); + JavaRDD<LabeledPoint> training = splits[0].cache(); + JavaRDD<LabeledPoint> test = splits[1]; + + // Run training algorithm to build the model. + final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() + .setNumClasses(10) + .run(training.rdd()); + + // Compute raw scores on the test set. + JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map( + new Function<LabeledPoint, Tuple2<Object, Object>>() { + public Tuple2<Object, Object> call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2<Object, Object>(prediction, p.label()); + } + } + ); + + // Get evaluation metrics. + MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); + double precision = metrics.precision(); + System.out.println("Precision = " + precision); + + // Save and load model + model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, + "target/tmp/javaLogisticRegressionWithLBFGSModel"); + // $example off$ + + sc.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java new file mode 100644 index 0000000000000000000000000000000000000000..720b167b2cadf6f2665d5aa7642bd1d62489e583 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; + +// $example on$ +import scala.Tuple2; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.classification.SVMModel; +import org.apache.spark.mllib.classification.SVMWithSGD; +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +// $example off$ + +/** + * Example for SVMWithSGD. + */ +public class JavaSVMWithSGDExample { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("JavaSVMWithSGDExample"); + SparkContext sc = new SparkContext(conf); + // $example on$ + String path = "data/mllib/sample_libsvm_data.txt"; + JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L); + training.cache(); + JavaRDD<LabeledPoint> test = data.subtract(training); + + // Run training algorithm to build the model. + int numIterations = 100; + final SVMModel model = SVMWithSGD.train(training.rdd(), numIterations); + + // Clear the default threshold. + model.clearThreshold(); + + // Compute raw scores on the test set. + JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map( + new Function<LabeledPoint, Tuple2<Object, Object>>() { + public Tuple2<Object, Object> call(LabeledPoint p) { + Double score = model.predict(p.features()); + return new Tuple2<Object, Object>(score, p.label()); + } + } + ); + + // Get evaluation metrics. + BinaryClassificationMetrics metrics = + new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels)); + double auROC = metrics.areaUnderROC(); + + System.out.println("Area under ROC = " + auROC); + + // Save and load model + model.save(sc, "target/tmp/javaSVMWithSGDModel"); + SVMModel sameModel = SVMModel.load(sc, "target/tmp/javaSVMWithSGDModel"); + // $example off$ + + sc.stop(); + } +} diff --git a/examples/src/main/python/mllib/linear_regression_with_sgd_example.py b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py new file mode 100644 index 0000000000000000000000000000000000000000..6fbaeff0cd5a0e3bae3e2288220d596622184e5b --- /dev/null +++ b/examples/src/main/python/mllib/linear_regression_with_sgd_example.py @@ -0,0 +1,54 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Linear Regression With SGD Example. +""" +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel +# $example off$ + +if __name__ == "__main__": + + sc = SparkContext(appName="PythonLinearRegressionWithSGDExample") + + # $example on$ + # Load and parse the data + def parsePoint(line): + values = [float(x) for x in line.replace(',', ' ').split(' ')] + return LabeledPoint(values[0], values[1:]) + + data = sc.textFile("data/mllib/ridge-data/lpsa.data") + parsedData = data.map(parsePoint) + + # Build the model + model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001) + + # Evaluate the model on training data + valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) + MSE = valuesAndPreds \ + .map(lambda (v, p): (v - p)**2) \ + .reduce(lambda x, y: x + y) / valuesAndPreds.count() + print("Mean Squared Error = " + str(MSE)) + + # Save and load model + model.save(sc, "target/tmp/pythonLinearRegressionWithSGDModel") + sameModel = LinearRegressionModel.load(sc, "target/tmp/pythonLinearRegressionWithSGDModel") + # $example off$ diff --git a/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py new file mode 100644 index 0000000000000000000000000000000000000000..e030b74ba6b15e3c2954d3412f56bd3ff89386ac --- /dev/null +++ b/examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py @@ -0,0 +1,54 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Logistic Regression With LBFGS Example. 
+""" +from __future__ import print_function + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel +from pyspark.mllib.regression import LabeledPoint +# $example off$ + +if __name__ == "__main__": + + sc = SparkContext(appName="PythonLogisticRegressionWithLBFGSExample") + + # $example on$ + # Load and parse the data + def parsePoint(line): + values = [float(x) for x in line.split(' ')] + return LabeledPoint(values[0], values[1:]) + + data = sc.textFile("data/mllib/sample_svm_data.txt") + parsedData = data.map(parsePoint) + + # Build the model + model = LogisticRegressionWithLBFGS.train(parsedData) + + # Evaluating the model on training data + labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) + trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) + print("Training Error = " + str(trainErr)) + + # Save and load model + model.save(sc, "target/tmp/pythonLogisticRegressionWithLBFGSModel") + sameModel = LogisticRegressionModel.load(sc, + "target/tmp/pythonLogisticRegressionWithLBFGSModel") + # $example off$ diff --git a/examples/src/main/python/mllib/streaming_linear_regression_example.py b/examples/src/main/python/mllib/streaming_linear_regression_example.py new file mode 100644 index 0000000000000000000000000000000000000000..f600496867c112d5ad909e5be179972a98c4330f --- /dev/null +++ b/examples/src/main/python/mllib/streaming_linear_regression_example.py @@ -0,0 +1,62 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Streaming Linear Regression Example. 
+""" +from __future__ import print_function + +# $example on$ +import sys +# $example off$ + +from pyspark import SparkContext +from pyspark.streaming import StreamingContext +# $example on$ +from pyspark.mllib.linalg import Vectors +from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib.regression import StreamingLinearRegressionWithSGD +# $example off$ + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: streaming_linear_regression_example.py <trainingDir> <testDir>", + file=sys.stderr) + exit(-1) + + sc = SparkContext(appName="PythonLogisticRegressionWithLBFGSExample") + ssc = StreamingContext(sc, 1) + + # $example on$ + def parse(lp): + label = float(lp[lp.find('(') + 1: lp.find(',')]) + vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(',')) + return LabeledPoint(label, vec) + + trainingData = ssc.textFileStream(sys.argv[1]).map(parse).cache() + testData = ssc.textFileStream(sys.argv[2]).map(parse) + + numFeatures = 3 + model = StreamingLinearRegressionWithSGD() + model.setInitialWeights([0.0, 0.0, 0.0]) + + model.trainOn(trainingData) + print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features)))) + + ssc.start() + ssc.awaitTermination() + # $example off$ diff --git a/examples/src/main/python/mllib/svm_with_sgd_example.py b/examples/src/main/python/mllib/svm_with_sgd_example.py new file mode 100644 index 0000000000000000000000000000000000000000..309ab09cc375a9e123e39324a7500f34c8224c7c --- /dev/null +++ b/examples/src/main/python/mllib/svm_with_sgd_example.py @@ -0,0 +1,47 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from pyspark import SparkContext +# $example on$ +from pyspark.mllib.classification import SVMWithSGD, SVMModel +from pyspark.mllib.regression import LabeledPoint +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="PythonSVMWithSGDExample") + + # $example on$ + # Load and parse the data + def parsePoint(line): + values = [float(x) for x in line.split(' ')] + return LabeledPoint(values[0], values[1:]) + + data = sc.textFile("data/mllib/sample_svm_data.txt") + parsedData = data.map(parsePoint) + + # Build the model + model = SVMWithSGD.train(parsedData, iterations=100) + + # Evaluating the model on training data + labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) + trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) + print("Training Error = " + str(trainErr)) + + # Save and load model + model.save(sc, "target/tmp/pythonSVMWithSGDModel") + sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel") + # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala new file mode 100644 index 0000000000000000000000000000000000000000..669868787e8f0ca6a6ea73631288797b49a331ca --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.{SparkConf, SparkContext} +// $example on$ +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.regression.LinearRegressionModel +import org.apache.spark.mllib.regression.LinearRegressionWithSGD +// $example off$ + +object LinearRegressionWithSGDExample { + + def main(args: Array[String]): Unit = { + val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") + val sc = new SparkContext(conf) + + // $example on$ + // Load and parse the data + val data = sc.textFile("data/mllib/ridge-data/lpsa.data") + val parsedData = data.map { line => + val parts = line.split(',') + LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) + }.cache() + + // Building the model + val numIterations = 100 + val stepSize = 0.00000001 + val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) + + // Evaluate model on training examples and compute training error + val valuesAndPreds = parsedData.map { point => + val prediction = model.predict(point.features) + (point.label, prediction) + } + val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() + println("training Mean Squared Error = " + MSE) + + // Save and load model + model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") + val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") + // $example off$ + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LogisticRegressionWithLBFGSExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LogisticRegressionWithLBFGSExample.scala new file mode 100644 index 0000000000000000000000000000000000000000..632a2d537e5bc99703d4b19ec656726f80c808fb --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LogisticRegressionWithLBFGSExample.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.{SparkConf, SparkContext} +// $example on$ +import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS} +import org.apache.spark.mllib.evaluation.MulticlassMetrics +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object LogisticRegressionWithLBFGSExample { + + def main(args: Array[String]): Unit = { + val conf = new SparkConf().setAppName("LogisticRegressionWithLBFGSExample") + val sc = new SparkContext(conf) + + // $example on$ + // Load training data in LIBSVM format. 
+ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + + // Split data into training (60%) and test (40%). + val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) + val training = splits(0).cache() + val test = splits(1) + + // Run training algorithm to build the model + val model = new LogisticRegressionWithLBFGS() + .setNumClasses(10) + .run(training) + + // Compute raw scores on the test set. + val predictionAndLabels = test.map { case LabeledPoint(label, features) => + val prediction = model.predict(features) + (prediction, label) + } + + // Get evaluation metrics. + val metrics = new MulticlassMetrics(predictionAndLabels) + val precision = metrics.precision + println("Precision = " + precision) + + // Save and load model + model.save(sc, "target/tmp/scalaLogisticRegressionWithLBFGSModel") + val sameModel = LogisticRegressionModel.load(sc, + "target/tmp/scalaLogisticRegressionWithLBFGSModel") + // $example off$ + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala new file mode 100644 index 0000000000000000000000000000000000000000..b73fe9b2b3faa095c33e2d2a2f5640619bdeefa6 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.{SparkConf, SparkContext} +// $example on$ +import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object SVMWithSGDExample { + + def main(args: Array[String]): Unit = { + val conf = new SparkConf().setAppName("SVMWithSGDExample") + val sc = new SparkContext(conf) + + // $example on$ + // Load training data in LIBSVM format. + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") + + // Split data into training (60%) and test (40%). + val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) + val training = splits(0).cache() + val test = splits(1) + + // Run training algorithm to build the model + val numIterations = 100 + val model = SVMWithSGD.train(training, numIterations) + + // Clear the default threshold. + model.clearThreshold() + + // Compute raw scores on the test set. + val scoreAndLabels = test.map { point => + val score = model.predict(point.features) + (score, point.label) + } + + // Get evaluation metrics. 
+ val metrics = new BinaryClassificationMetrics(scoreAndLabels) + val auROC = metrics.areaUnderROC() + + println("Area under ROC = " + auROC) + + // Save and load model + model.save(sc, "target/tmp/scalaSVMWithSGDModel") + val sameModel = SVMModel.load(sc, "target/tmp/scalaSVMWithSGDModel") + // $example off$ + + sc.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala new file mode 100644 index 0000000000000000000000000000000000000000..0a1cd2d62d5b512dcc1d86307c8b0c0fe2565471 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.SparkConf +// $example on$ +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD +// $example off$ +import org.apache.spark.streaming._ + +object StreamingLinearRegressionExample { + + def main(args: Array[String]): Unit = { + if (args.length != 2) { + System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") + System.exit(1) + } + + val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") + val ssc = new StreamingContext(conf, Seconds(1)) + + // $example on$ + val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() + val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) + + val numFeatures = 3 + val model = new StreamingLinearRegressionWithSGD() + .setInitialWeights(Vectors.zeros(numFeatures)) + + model.trainOn(trainingData) + model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() + + ssc.start() + ssc.awaitTermination() + // $example off$ + + ssc.stop() + } +} +// scalastyle:on println
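+
+// A possible way to run this example from a Spark checkout (the two directories are
+// placeholders for your own training and testing paths):
+//
+//   bin/run-example mllib.StreamingLinearRegressionExample /training/data/dir /testing/data/dir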