diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
index 733bc4181c70b0b1de6aba3e5be00965d1f3c50c..bdb76f004fd6890d6432624540278a912122e27f 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
@@ -32,7 +32,9 @@ import org.apache.spark.sql.SparkSession;
 public class JavaDecisionTreeClassificationExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaDecisionTreeClassificationExample").getOrCreate();
+      .builder()
+      .appName("JavaDecisionTreeClassificationExample")
+      .getOrCreate();
 
     // $example on$
     // Load the data stored in LIBSVM format as a DataFrame.
@@ -52,10 +54,10 @@ public class JavaDecisionTreeClassificationExample {
     VectorIndexerModel featureIndexer = new VectorIndexer()
       .setInputCol("features")
       .setOutputCol("indexedFeatures")
-      .setMaxCategories(4) // features with > 4 distinct values are treated as continuous
+      .setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
       .fit(data);
 
-    // Split the data into training and test sets (30% held out for testing)
+    // Split the data into training and test sets (30% held out for testing).
     Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
     Dataset<Row> trainingData = splits[0];
     Dataset<Row> testData = splits[1];
@@ -71,11 +73,11 @@ public class JavaDecisionTreeClassificationExample {
       .setOutputCol("predictedLabel")
       .setLabels(labelIndexer.labels());
 
-    // Chain indexers and tree in a Pipeline
+    // Chain indexers and tree in a Pipeline.
     Pipeline pipeline = new Pipeline()
       .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter});
 
-    // Train model.  This also runs the indexers.
+    // Train model. This also runs the indexers.
     PipelineModel model = pipeline.fit(trainingData);
 
     // Make predictions.
@@ -84,7 +86,7 @@ public class JavaDecisionTreeClassificationExample {
     // Select example rows to display.
     predictions.select("predictedLabel", "label", "features").show(5);
 
-    // Select (prediction, true label) and compute test error
+    // Select (prediction, true label) and compute test error.
     MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
       .setLabelCol("indexedLabel")
       .setPredictionCol("prediction")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
index bd6dc3edd36313ae0c54c32f355c96a681d252fe..cffb7139edccde44fc3c790f5477ad7a946259ba 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
@@ -33,7 +33,9 @@ import org.apache.spark.sql.SparkSession;
 public class JavaDecisionTreeRegressionExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaDecisionTreeRegressionExample").getOrCreate();
+      .builder()
+      .appName("JavaDecisionTreeRegressionExample")
+      .getOrCreate();
 
     // $example on$
     // Load the data stored in LIBSVM format as a DataFrame.
     Dataset<Row> data = spark.read().format("libsvm")
@@ -47,7 +49,7 @@ public class JavaDecisionTreeRegressionExample {
       .setMaxCategories(4)
       .fit(data);
 
-    // Split the data into training and test sets (30% held out for testing)
+    // Split the data into training and test sets (30% held out for testing).
     Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
     Dataset<Row> trainingData = splits[0];
     Dataset<Row> testData = splits[1];
@@ -56,11 +58,11 @@ public class JavaDecisionTreeRegressionExample {
     DecisionTreeRegressor dt = new DecisionTreeRegressor()
       .setFeaturesCol("indexedFeatures");
 
-    // Chain indexer and tree in a Pipeline
+    // Chain indexer and tree in a Pipeline.
     Pipeline pipeline = new Pipeline()
       .setStages(new PipelineStage[]{featureIndexer, dt});
 
-    // Train model.  This also runs the indexer.
+    // Train model. This also runs the indexer.
     PipelineModel model = pipeline.fit(trainingData);
 
     // Make predictions.
@@ -69,7 +71,7 @@ public class JavaDecisionTreeRegressionExample {
     // Select example rows to display.
     predictions.select("label", "features").show(5);
 
-    // Select (prediction, true label) and compute test error
+    // Select (prediction, true label) and compute test error.
     RegressionEvaluator evaluator = new RegressionEvaluator()
       .setLabelCol("label")
       .setPredictionCol("prediction")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
index 49bad0afc0b7155a7786abd4876fcc1783330d70..3265c4d7ec1fadd86dcc5e16dd974f1c00e562e2 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
@@ -62,7 +62,7 @@ public class JavaDeveloperApiExample {
       new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
     Dataset<Row> training = spark.createDataFrame(localTraining, LabeledPoint.class);
 
-    // Create a LogisticRegression instance.  This instance is an Estimator.
+    // Create a LogisticRegression instance. This instance is an Estimator.
     MyJavaLogisticRegression lr = new MyJavaLogisticRegression();
     // Print out the parameters, documentation, and any default values.
     System.out.println("MyJavaLogisticRegression parameters:\n" + lr.explainParams() + "\n");
@@ -70,7 +70,7 @@ public class JavaDeveloperApiExample {
     // We may set parameters using setter methods.
     lr.setMaxIter(10);
 
-    // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+    // Learn a LogisticRegression model. This uses the parameters stored in lr.
     MyJavaLogisticRegressionModel model = lr.fit(training);
 
     // Prepare test data.
@@ -214,7 +214,7 @@ class MyJavaLogisticRegressionModel
   }
 
   /**
-   * Number of classes the label can take.  2 indicates binary classification.
+   * Number of classes the label can take. 2 indicates binary classification.
    */
   public int numClasses() { return 2; }
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java
index 5ba8e6cf449026d5d34ab73821c84c50e31aab44..889f5785dfd8bc447da193eac6dc4f5c7e9126f9 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java
@@ -38,7 +38,9 @@ import org.apache.spark.sql.SparkSession;
 public class JavaEstimatorTransformerParamExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaEstimatorTransformerParamExample").getOrCreate();
+      .builder()
+      .appName("JavaEstimatorTransformerParamExample")
+      .getOrCreate();
 
     // $example on$
     // Prepare training data.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java
index baacd796a058f5816fb8a7651782a8bd7fe9fb43..5c2e03eda90a939c4b88a480cefce668345a9800 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java
@@ -75,11 +75,11 @@ public class JavaGradientBoostedTreeClassifierExample {
       .setOutputCol("predictedLabel")
       .setLabels(labelIndexer.labels());
 
-    // Chain indexers and GBT in a Pipeline
+    // Chain indexers and GBT in a Pipeline.
     Pipeline pipeline = new Pipeline()
       .setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter});
 
-    // Train model.  This also runs the indexers.
+    // Train model. This also runs the indexers.
     PipelineModel model = pipeline.fit(trainingData);
 
     // Make predictions.
@@ -88,7 +88,7 @@ public class JavaGradientBoostedTreeClassifierExample {
     // Select example rows to display.
     predictions.select("predictedLabel", "label", "features").show(5);
 
-    // Select (prediction, true label) and compute test error
+    // Select (prediction, true label) and compute test error.
     MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
       .setLabelCol("indexedLabel")
       .setPredictionCol("prediction")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java
index 6d3f21fdafa047c8cbe6283e6c480dcf041b2fb6..769b5c3e85258f841f7ba39d775b11f5977a1964 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java
@@ -34,7 +34,9 @@ import org.apache.spark.sql.SparkSession;
 public class JavaGradientBoostedTreeRegressorExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaGradientBoostedTreeRegressorExample").getOrCreate();
+      .builder()
+      .appName("JavaGradientBoostedTreeRegressorExample")
+      .getOrCreate();
 
     // $example on$
     // Load and parse the data file, converting it to a DataFrame.
@@ -48,7 +50,7 @@ public class JavaGradientBoostedTreeRegressorExample {
       .setMaxCategories(4)
       .fit(data);
 
-    // Split the data into training and test sets (30% held out for testing)
+    // Split the data into training and test sets (30% held out for testing).
     Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
     Dataset<Row> trainingData = splits[0];
     Dataset<Row> testData = splits[1];
@@ -59,10 +61,10 @@ public class JavaGradientBoostedTreeRegressorExample {
       .setFeaturesCol("indexedFeatures")
       .setMaxIter(10);
 
-    // Chain indexer and GBT in a Pipeline
+    // Chain indexer and GBT in a Pipeline.
     Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {featureIndexer, gbt});
 
-    // Train model.  This also runs the indexer.
+    // Train model. This also runs the indexer.
     PipelineModel model = pipeline.fit(trainingData);
 
     // Make predictions.
@@ -71,7 +73,7 @@ public class JavaGradientBoostedTreeRegressorExample {
     // Select example rows to display.
     predictions.select("prediction", "label", "features").show(5);
 
-    // Select (prediction, true label) and compute test error
+    // Select (prediction, true label) and compute test error.
     RegressionEvaluator evaluator = new RegressionEvaluator()
       .setLabelCol("label")
       .setPredictionCol("prediction")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java
index b6ea1fed25f870e336817588fbee9b0ec7165d17..dcd209e28e2b8790c93de821e04b4f214c9c1a5a 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java
@@ -30,10 +30,12 @@ import org.apache.spark.sql.SparkSession;
 public class JavaLinearRegressionWithElasticNetExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaLinearRegressionWithElasticNetExample").getOrCreate();
+      .builder()
+      .appName("JavaLinearRegressionWithElasticNetExample")
+      .getOrCreate();
 
     // $example on$
-    // Load training data
+    // Load training data.
     Dataset<Row> training = spark.read().format("libsvm")
       .load("data/mllib/sample_linear_regression_data.txt");
 
@@ -42,14 +44,14 @@ public class JavaLinearRegressionWithElasticNetExample {
       .setRegParam(0.3)
       .setElasticNetParam(0.8);
 
-    // Fit the model
+    // Fit the model.
     LinearRegressionModel lrModel = lr.fit(training);
 
-    // Print the coefficients and intercept for linear regression
+    // Print the coefficients and intercept for linear regression.
     System.out.println("Coefficients: "
       + lrModel.coefficients() + " Intercept: " + lrModel.intercept());
 
-    // Summarize the model over the training set and print out some metrics
+    // Summarize the model over the training set and print out some metrics.
     LinearRegressionTrainingSummary trainingSummary = lrModel.summary();
     System.out.println("numIterations: " + trainingSummary.totalIterations());
     System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java
index fd040aead4101f5708f0eede338945d52b730020..dee56799d8aeeb974f31adbfa3a79d6010bb15fc 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java
@@ -31,7 +31,9 @@ import org.apache.spark.sql.functions;
 public class JavaLogisticRegressionSummaryExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaLogisticRegressionSummaryExample").getOrCreate();
+      .builder()
+      .appName("JavaLogisticRegressionSummaryExample")
+      .getOrCreate();
 
     // Load training data
     Dataset<Row> training = spark.read().format("libsvm")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
index f00c7a05cd12a614238dd1b73bf3a8cd4d48d765..6101c79fb0c98c4064be5714f0fb9fd3dc21993d 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java
@@ -28,7 +28,9 @@ import org.apache.spark.sql.SparkSession;
 public class JavaLogisticRegressionWithElasticNetExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaLogisticRegressionWithElasticNetExample").getOrCreate();
+      .builder()
+      .appName("JavaLogisticRegressionWithElasticNetExample")
+      .getOrCreate();
 
     // $example on$
     // Load training data
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java
index a4ec4f58154f64759d11ca2a5c11e4bb12602cbb..975c65edc0ca6a8e08220a2123da7d2134b92083 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java
@@ -43,7 +43,9 @@ import org.apache.spark.sql.SparkSession;
 public class JavaModelSelectionViaCrossValidationExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaModelSelectionViaCrossValidationExample").getOrCreate();
+      .builder()
+      .appName("JavaModelSelectionViaCrossValidationExample")
+      .getOrCreate();
 
     // $example on$
     // Prepare training documents, which are labeled.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java
index 63a0ad1cb883c9185aef4f1d85e11541a97a93c5..0f96293f0348b56acf0a008b19b66d6ea50e2a10 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java
@@ -43,7 +43,9 @@ import org.apache.spark.sql.SparkSession;
 public class JavaModelSelectionViaTrainValidationSplitExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaModelSelectionViaTrainValidationSplitExample").getOrCreate();
+      .builder()
+      .appName("JavaModelSelectionViaTrainValidationSplitExample")
+      .getOrCreate();
 
     // $example on$
     Dataset<Row> data = spark.read().format("libsvm")
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
index d547a2a64be5b0fe6023355c4927c2be2e08a14f..c7d03d8593a3e62b73ae8006e9148abcdbfeb0de 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
@@ -33,7 +33,9 @@ public class JavaMultilayerPerceptronClassifierExample {
 
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaMultilayerPerceptronClassifierExample").getOrCreate();
+      .builder()
+      .appName("JavaMultilayerPerceptronClassifierExample")
+      .getOrCreate();
 
     // $example on$
     // Load training data
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
index 94e3fafcab189cddb06196b88bcb0d96de9107c1..16f58a852d8a24ba6b421087eaf813b158f68b2f 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
@@ -35,7 +35,9 @@ import org.apache.spark.sql.types.StructType;
 public class JavaQuantileDiscretizerExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaQuantileDiscretizerExample").getOrCreate();
+      .builder()
+      .appName("JavaQuantileDiscretizerExample")
+      .getOrCreate();
 
     // $example on$
     List<Row> data = Arrays.asList(
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java
index 21e783a96897f5f0509274d371d8f0be119c2bf0..14af2fbbbbbe0957a5e0f046e2e61c87bc816b18 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java
@@ -33,7 +33,9 @@ import org.apache.spark.sql.SparkSession;
 public class JavaRandomForestClassifierExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaRandomForestClassifierExample").getOrCreate();
+      .builder()
+      .appName("JavaRandomForestClassifierExample")
+      .getOrCreate();
 
     // $example on$
     // Load and parse the data file, converting it to a DataFrame.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java
index ece184a8784fcfdbe33f2b87454d6e21876445f9..a7078453deb8a9632889c78ae68c6a3c0fef063f 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java
@@ -34,7 +34,9 @@ import org.apache.spark.sql.SparkSession;
 public class JavaRandomForestRegressorExample {
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaRandomForestRegressorExample").getOrCreate();
+      .builder()
+      .appName("JavaRandomForestRegressorExample")
+      .getOrCreate();
 
     // $example on$
     // Load and parse the data file, converting it to a DataFrame.
@@ -62,7 +64,7 @@ public class JavaRandomForestRegressorExample {
     Pipeline pipeline = new Pipeline()
       .setStages(new PipelineStage[] {featureIndexer, rf});
 
-    // Train model.  This also runs the indexer.
+    // Train model. This also runs the indexer.
     PipelineModel model = pipeline.fit(trainingData);
 
     // Make predictions.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
index 0787079ba4e5544a8eee26d64552ba02adc96b5a..ff1eb07dc60588184514e5beaaa0b4e26d73792f 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -46,7 +46,7 @@ public class JavaSimpleParamsExample {
       .getOrCreate();
 
     // Prepare training data.
-    // We use LabeledPoint, which is a JavaBean.  Spark SQL can convert RDDs of JavaBeans
+    // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
     // into DataFrames, where it uses the bean metadata to infer the schema.
     List<LabeledPoint> localTraining = Lists.newArrayList(
       new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
@@ -56,7 +56,7 @@ public class JavaSimpleParamsExample {
     Dataset<Row> training = spark.createDataFrame(localTraining, LabeledPoint.class);
 
-    // Create a LogisticRegression instance.  This instance is an Estimator.
+    // Create a LogisticRegression instance. This instance is an Estimator.
     LogisticRegression lr = new LogisticRegression();
     // Print out the parameters, documentation, and any default values.
     System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
@@ -65,7 +65,7 @@ public class JavaSimpleParamsExample {
     lr.setMaxIter(10)
       .setRegParam(0.01);
 
-    // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+    // Learn a LogisticRegression model. This uses the parameters stored in lr.
     LogisticRegressionModel model1 = lr.fit(training);
     // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
     // we can view the parameters it used during fit().
@@ -82,7 +82,7 @@ public class JavaSimpleParamsExample {
 
     // One can also combine ParamMaps.
     ParamMap paramMap2 = new ParamMap();
-    paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name
+    paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name.
     ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
 
     // Now learn a new model using the paramMapCombined parameters.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
index 9516ce1f4fb1887c14a9613e09f8930de1dc792f..7c24c46d2e287773e54986d1075bcc03cfa97c6e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
@@ -43,7 +43,9 @@ public class JavaSimpleTextClassificationPipeline {
 
   public static void main(String[] args) {
     SparkSession spark = SparkSession
-      .builder().appName("JavaSimpleTextClassificationPipeline").getOrCreate();
+      .builder()
+      .appName("JavaSimpleTextClassificationPipeline")
+      .getOrCreate();
 
     // Prepare training documents, which are labeled.
     List<LabeledDocument> localTraining = Lists.newArrayList(
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
index 7f6c8de9679f943cea4ce00b4929b2877d82c556..b3103ced914587453bc4d0b58f2fe7d56a0c6a95 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
@@ -47,10 +47,10 @@ object DecisionTreeClassificationExample {
     val featureIndexer = new VectorIndexer()
       .setInputCol("features")
       .setOutputCol("indexedFeatures")
-      .setMaxCategories(4) // features with > 4 distinct values are treated as continuous
+      .setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
       .fit(data)
 
-    // Split the data into training and test sets (30% held out for testing)
+    // Split the data into training and test sets (30% held out for testing).
     val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
 
     // Train a DecisionTree model.
@@ -64,11 +64,11 @@ object DecisionTreeClassificationExample {
       .setOutputCol("predictedLabel")
       .setLabels(labelIndexer.labels)
 
-    // Chain indexers and tree in a Pipeline
+    // Chain indexers and tree in a Pipeline.
     val pipeline = new Pipeline()
       .setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))
 
-    // Train model.  This also runs the indexers.
+    // Train model. This also runs the indexers.
     val model = pipeline.fit(trainingData)
 
     // Make predictions.
@@ -77,7 +77,7 @@ object DecisionTreeClassificationExample {
     // Select example rows to display.
     predictions.select("predictedLabel", "label", "features").show(5)
 
-    // Select (prediction, true label) and compute test error
+    // Select (prediction, true label) and compute test error.
     val evaluator = new MulticlassClassificationEvaluator()
       .setLabelCol("indexedLabel")
       .setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
index eadb02ab0d7f721be8fd1e6146d481b4a94dd101..310418008c21953276690c31f997bf892bae4843 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
@@ -23,7 +23,6 @@ import scala.language.reflectiveCalls
 
 import scopt.OptionParser
 
-import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.examples.mllib.AbstractParams
 import org.apache.spark.ml.{Pipeline, PipelineStage, Transformer}
 import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
@@ -40,7 +39,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
  * {{{
  * ./bin/run-example ml.DecisionTreeExample [options]
  * }}}
- * Note that Decision Trees can take a large amount of memory.  If the run-example command above
+ * Note that Decision Trees can take a large amount of memory. If the run-example command above
 * fails, try running via spark-submit and specifying the amount of memory as at least 1g.
 * For local mode, run
 * {{{
@@ -87,7 +86,7 @@ object DecisionTreeExample {
        .text(s"min info gain required to create a split, default: ${defaultParams.minInfoGain}")
        .action((x, c) => c.copy(minInfoGain = x))
      opt[Double]("fracTest")
-        .text(s"fraction of data to hold out for testing.  If given option testInput, " +
+        .text(s"fraction of data to hold out for testing. If given option testInput, " +
          s"this option is ignored. default: ${defaultParams.fracTest}")
        .action((x, c) => c.copy(fracTest = x))
      opt[Boolean]("cacheNodeIds")
@@ -106,7 +105,7 @@ object DecisionTreeExample {
          s"default: ${defaultParams.checkpointInterval}")
        .action((x, c) => c.copy(checkpointInterval = x))
      opt[String]("testInput")
-        .text(s"input path to test dataset.  If given, option fracTest is ignored." +
+        .text(s"input path to test dataset. If given, option fracTest is ignored." +
          s" default: ${defaultParams.testInput}")
        .action((x, c) => c.copy(testInput = x))
      opt[String]("dataFormat")
@@ -157,11 +156,10 @@ object DecisionTreeExample {
    * @param dataFormat "libsvm" or "dense"
    * @param testInput Path to test dataset.
    * @param algo Classification or Regression
-   * @param fracTest Fraction of input data to hold out for testing.  Ignored if testInput given.
+   * @param fracTest Fraction of input data to hold out for testing. Ignored if testInput given.
    * @return (training dataset, test dataset)
    */
   private[ml] def loadDatasets(
-      sc: SparkContext,
       input: String,
       dataFormat: String,
       testInput: String,
@@ -200,18 +198,21 @@ object DecisionTreeExample {
   }
 
   def run(params: Params) {
-    val conf = new SparkConf().setAppName(s"DecisionTreeExample with $params")
-    val sc = new SparkContext(conf)
-    params.checkpointDir.foreach(sc.setCheckpointDir)
+    val spark = SparkSession
+      .builder
+      .appName(s"DecisionTreeExample with $params")
+      .getOrCreate()
+
+    params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir)
     val algo = params.algo.toLowerCase
 
     println(s"DecisionTreeExample with parameters:\n$params")
 
     // Load training and test data and cache it.
     val (training: DataFrame, test: DataFrame) =
-      loadDatasets(sc, params.input, params.dataFormat, params.testInput, algo, params.fracTest)
+      loadDatasets(params.input, params.dataFormat, params.testInput, algo, params.fracTest)
 
-    // Set up Pipeline
+    // Set up Pipeline.
     val stages = new mutable.ArrayBuffer[PipelineStage]()
     // (1) For classification, re-index classes.
     val labelColName = if (algo == "classification") "indexedLabel" else "label"
@@ -228,7 +229,7 @@ object DecisionTreeExample {
       .setOutputCol("indexedFeatures")
       .setMaxCategories(10)
     stages += featuresIndexer
-    // (3) Learn Decision Tree
+    // (3) Learn Decision Tree.
     val dt = algo match {
       case "classification" =>
         new DecisionTreeClassifier()
@@ -255,13 +256,13 @@ object DecisionTreeExample {
     stages += dt
     val pipeline = new Pipeline().setStages(stages.toArray)
 
-    // Fit the Pipeline
+    // Fit the Pipeline.
     val startTime = System.nanoTime()
     val pipelineModel = pipeline.fit(training)
     val elapsedTime = (System.nanoTime() - startTime) / 1e9
     println(s"Training time: $elapsedTime seconds")
 
-    // Get the trained Decision Tree from the fitted PipelineModel
+    // Get the trained Decision Tree from the fitted PipelineModel.
     algo match {
       case "classification" =>
         val treeModel = pipelineModel.stages.last.asInstanceOf[DecisionTreeClassificationModel]
@@ -280,7 +281,7 @@ object DecisionTreeExample {
       case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.")
     }
 
-    // Evaluate model on training, test data
+    // Evaluate model on training, test data.
     algo match {
       case "classification" =>
         println("Training data results:")
@@ -296,11 +297,11 @@ object DecisionTreeExample {
         throw new IllegalArgumentException("Algo ${params.algo} not supported.")
     }
 
-    sc.stop()
+    spark.stop()
   }
 
   /**
-   * Evaluate the given ClassificationModel on data.  Print the results.
+   * Evaluate the given ClassificationModel on data. Print the results.
    * @param model Must fit ClassificationModel abstraction
    * @param data DataFrame with "prediction" and labelColName columns
    * @param labelColName Name of the labelCol parameter for the model
@@ -314,7 +315,7 @@ object DecisionTreeExample {
     val fullPredictions = model.transform(data).cache()
     val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
     val labels = fullPredictions.select(labelColName).rdd.map(_.getDouble(0))
-    // Print number of classes for reference
+    // Print number of classes for reference.
     val numClasses = MetadataUtils.getNumClasses(fullPredictions.schema(labelColName)) match {
       case Some(n) => n
       case None => throw new RuntimeException(
@@ -325,7 +326,7 @@ object DecisionTreeExample {
   }
 
   /**
-   * Evaluate the given RegressionModel on data.  Print the results.
+   * Evaluate the given RegressionModel on data. Print the results.
    * @param model Must fit RegressionModel abstraction
    * @param data DataFrame with "prediction" and labelColName columns
    * @param labelColName Name of the labelCol parameter for the model
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
index 799070ef47da607c87e2a79acdb453d0820bac5f..ee61200ad1d0c8341d9753e92caca7fcc664c0b1 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
@@ -46,7 +46,7 @@ object DecisionTreeRegressionExample {
       .setMaxCategories(4)
       .fit(data)
 
-    // Split the data into training and test sets (30% held out for testing)
+    // Split the data into training and test sets (30% held out for testing).
     val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
 
     // Train a DecisionTree model.
@@ -54,11 +54,11 @@ object DecisionTreeRegressionExample {
     val dt = new DecisionTreeRegressor()
       .setLabelCol("label")
       .setFeaturesCol("indexedFeatures")
 
-    // Chain indexer and tree in a Pipeline
+    // Chain indexer and tree in a Pipeline.
     val pipeline = new Pipeline()
       .setStages(Array(featureIndexer, dt))
 
-    // Train model.  This also runs the indexer.
+    // Train model. This also runs the indexer.
     val model = pipeline.fit(trainingData)
 
     // Make predictions.
@@ -67,7 +67,7 @@ object DecisionTreeRegressionExample {
     // Select example rows to display.
     predictions.select("prediction", "label", "features").show(5)
 
-    // Select (prediction, true label) and compute test error
+    // Select (prediction, true label) and compute test error.
     val evaluator = new RegressionEvaluator()
       .setLabelCol("label")
       .setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
index a522d2127edcab403f7be2659155958aaa0e79b4..b8f47bf12b87284033c6285804af469b39f9ca79 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
@@ -50,7 +50,7 @@ object DeveloperApiExample {
       LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
       LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))
 
-    // Create a LogisticRegression instance.  This instance is an Estimator.
+    // Create a LogisticRegression instance. This instance is an Estimator.
     val lr = new MyLogisticRegression()
     // Print out the parameters, documentation, and any default values.
     println("MyLogisticRegression parameters:\n" + lr.explainParams() + "\n")
@@ -58,7 +58,7 @@ object DeveloperApiExample {
     // We may set parameters using setter methods.
     lr.setMaxIter(10)
 
-    // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+    // Learn a LogisticRegression model. This uses the parameters stored in lr.
     val model = lr.fit(training.toDF())
 
     // Prepare test data.
@@ -84,7 +84,7 @@ object DeveloperApiExample {
 /**
  * Example of defining a parameter trait for a user-defined type of [[Classifier]].
  *
- * NOTE: This is private since it is an example.  In practice, you may not want it to be private.
+ * NOTE: This is private since it is an example. In practice, you may not want it to be private.
 */
 private trait MyLogisticRegressionParams extends ClassifierParams {
@@ -96,7 +96,7 @@ private trait MyLogisticRegressionParams extends ClassifierParams {
  *  - def getMyParamName
  *  - def setMyParamName
  * Here, we have a trait to be mixed in with the Estimator and Model (MyLogisticRegression
- * and MyLogisticRegressionModel).  We place the setter (setMaxIter) method in the Estimator
+ * and MyLogisticRegressionModel). We place the setter (setMaxIter) method in the Estimator
 * class since the maxIter parameter is only used during training (not in the Model).
 */
  val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations")
@@ -106,7 +106,7 @@ private trait MyLogisticRegressionParams extends ClassifierParams {
 /**
  * Example of defining a type of [[Classifier]].
  *
- * NOTE: This is private since it is an example.  In practice, you may not want it to be private.
+ * NOTE: This is private since it is an example. In practice, you may not want it to be private.
  */
 private class MyLogisticRegression(override val uid: String)
   extends Classifier[Vector, MyLogisticRegression, MyLogisticRegressionModel]
@@ -138,7 +138,7 @@ private class MyLogisticRegression(override val uid: String)
 /**
  * Example of defining a type of [[ClassificationModel]].
  *
- * NOTE: This is private since it is an example.  In practice, you may not want it to be private.
+ * NOTE: This is private since it is an example. In practice, you may not want it to be private.
  */
 private class MyLogisticRegressionModel(
     override val uid: String,
@@ -169,7 +169,7 @@ private class MyLogisticRegressionModel(
     Vectors.dense(-margin, margin)
   }
 
-  /** Number of classes the label can take.  2 indicates binary classification. */
+  /** Number of classes the label can take. 2 indicates binary classification. */
   override val numClasses: Int = 2
 
   /** Number of features the model was trained on. */
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala
index 972241e76922e865aa9a87fb24f790edf35a787e..a2918d66ea67e829ed5dba142423f174bf58329c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala
@@ -43,7 +43,7 @@ object EstimatorTransformerParamExample {
       (1.0, Vectors.dense(0.0, 1.2, -0.5))
     )).toDF("label", "features")
 
-    // Create a LogisticRegression instance.  This instance is an Estimator.
+    // Create a LogisticRegression instance. This instance is an Estimator.
     val lr = new LogisticRegression()
     // Print out the parameters, documentation, and any default values.
     println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
@@ -52,7 +52,7 @@ object EstimatorTransformerParamExample {
     lr.setMaxIter(10)
       .setRegParam(0.01)
 
-    // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+    // Learn a LogisticRegression model. This uses the parameters stored in lr.
     val model1 = lr.fit(training)
     // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
     // we can view the parameters it used during fit().
@@ -63,11 +63,11 @@ object EstimatorTransformerParamExample {
     // We may alternatively specify parameters using a ParamMap,
     // which supports several methods for specifying parameters.
     val paramMap = ParamMap(lr.maxIter -> 20)
-      .put(lr.maxIter, 30)  // Specify 1 Param.  This overwrites the original maxIter.
+      .put(lr.maxIter, 30)  // Specify 1 Param. This overwrites the original maxIter.
       .put(lr.regParam -> 0.1, lr.threshold -> 0.55)  // Specify multiple Params.
 
     // One can also combine ParamMaps.
-    val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability")  // Change output column name
+    val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability")  // Change output column name.
     val paramMapCombined = paramMap ++ paramMap2
 
     // Now learn a new model using the paramMapCombined parameters.
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
index 6b0be0f34e196da3ea31b981efe3f87b7a91bff9..a4274ae95405eddf202fa74a49ea1b19067369be 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
@@ -23,13 +23,12 @@ import scala.language.reflectiveCalls
 
 import scopt.OptionParser
 
-import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.examples.mllib.AbstractParams
 import org.apache.spark.ml.{Pipeline, PipelineStage}
 import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}
 import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer}
 import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, SparkSession}
 
 
 /**
@@ -37,7 +36,7 @@ import org.apache.spark.sql.DataFrame
  * {{{
  * ./bin/run-example ml.GBTExample [options]
  * }}}
- * Decision Trees and ensembles can take a large amount of memory.  If the run-example command
+ * Decision Trees and ensembles can take a large amount of memory. If the run-example command
 * above fails, try running via spark-submit and specifying the amount of memory as at least 1g.
 * For local mode, run
 * {{{
@@ -88,7 +87,7 @@ object GBTExample {
        .text(s"number of trees in ensemble, default: ${defaultParams.maxIter}")
        .action((x, c) => c.copy(maxIter = x))
      opt[Double]("fracTest")
-        .text(s"fraction of data to hold out for testing.  If given option testInput, " +
+        .text(s"fraction of data to hold out for testing. If given option testInput, " +
          s"this option is ignored. default: ${defaultParams.fracTest}")
        .action((x, c) => c.copy(fracTest = x))
      opt[Boolean]("cacheNodeIds")
@@ -109,7 +108,7 @@ object GBTExample {
          s"default: ${defaultParams.checkpointInterval}")
        .action((x, c) => c.copy(checkpointInterval = x))
      opt[String]("testInput")
-        .text(s"input path to test dataset.  If given, option fracTest is ignored." +
+        .text(s"input path to test dataset. If given, option fracTest is ignored." +
          s" default: ${defaultParams.testInput}")
        .action((x, c) => c.copy(testInput = x))
      opt[String]("dataFormat")
@@ -136,15 +135,18 @@ object GBTExample {
   }
 
   def run(params: Params) {
-    val conf = new SparkConf().setAppName(s"GBTExample with $params")
-    val sc = new SparkContext(conf)
-    params.checkpointDir.foreach(sc.setCheckpointDir)
+    val spark = SparkSession
+      .builder
+      .appName(s"GBTExample with $params")
+      .getOrCreate()
+
+    params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir)
     val algo = params.algo.toLowerCase
 
     println(s"GBTExample with parameters:\n$params")
 
     // Load training and test data and cache it.
-    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
       params.dataFormat, params.testInput, algo, params.fracTest)
 
     // Set up Pipeline
@@ -164,7 +166,7 @@ object GBTExample {
       .setOutputCol("indexedFeatures")
       .setMaxCategories(10)
     stages += featuresIndexer
-    // (3) Learn GBT
+    // (3) Learn GBT.
     val dt = algo match {
       case "classification" =>
         new GBTClassifier()
@@ -193,13 +195,13 @@ object GBTExample {
     stages += dt
     val pipeline = new Pipeline().setStages(stages.toArray)
 
-    // Fit the Pipeline
+    // Fit the Pipeline.
     val startTime = System.nanoTime()
     val pipelineModel = pipeline.fit(training)
     val elapsedTime = (System.nanoTime() - startTime) / 1e9
     println(s"Training time: $elapsedTime seconds")
 
-    // Get the trained GBT from the fitted PipelineModel
+    // Get the trained GBT from the fitted PipelineModel.
     algo match {
       case "classification" =>
         val rfModel = pipelineModel.stages.last.asInstanceOf[GBTClassificationModel]
@@ -218,7 +220,7 @@ object GBTExample {
       case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.")
     }
 
-    // Evaluate model on training, test data
+    // Evaluate model on training, test data.
     algo match {
       case "classification" =>
         println("Training data results:")
@@ -234,7 +236,7 @@ object GBTExample {
         throw new IllegalArgumentException("Algo ${params.algo} not supported.")
     }
 
-    sc.stop()
+    spark.stop()
   }
 }
 // scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala
index b6a8baba2d95f5d570226555a80d178da82e5789..0d1ffbe2259c4b8c9904987b6694fa8e9011a704 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala
@@ -51,7 +51,7 @@ object GradientBoostedTreeClassifierExample {
       .setMaxCategories(4)
       .fit(data)
 
-    // Split the data into training and test sets (30% held out for testing)
+    // Split the data into training and test sets (30% held out for testing).
     val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
 
     // Train a GBT model.
@@ -66,11 +66,11 @@ object GradientBoostedTreeClassifierExample {
       .setOutputCol("predictedLabel")
       .setLabels(labelIndexer.labels)
 
-    // Chain indexers and GBT in a Pipeline
+    // Chain indexers and GBT in a Pipeline.
     val pipeline = new Pipeline()
       .setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter))
 
-    // Train model.  This also runs the indexers.
+    // Train model. This also runs the indexers.
     val model = pipeline.fit(trainingData)
 
     // Make predictions.
@@ -79,7 +79,7 @@ object GradientBoostedTreeClassifierExample {
     // Select example rows to display.
     predictions.select("predictedLabel", "label", "features").show(5)
 
-    // Select (prediction, true label) and compute test error
+    // Select (prediction, true label) and compute test error.
     val evaluator = new MulticlassClassificationEvaluator()
       .setLabelCol("indexedLabel")
       .setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala
index 62285b83cbb9dfe01c2a023b4abab3b32b5c1d37..e53aab7f326d344a71a00f661e963e95e43f1547 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala
@@ -45,7 +45,7 @@ object GradientBoostedTreeRegressorExample {
       .setMaxCategories(4)
       .fit(data)
 
-    // Split the data into training and test sets (30% held out for testing)
+    // Split the data into training and test sets (30% held out for testing).
     val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
 
     // Train a GBT model.
@@ -54,11 +54,11 @@ object GradientBoostedTreeRegressorExample {
       .setFeaturesCol("indexedFeatures")
       .setMaxIter(10)
 
-    // Chain indexer and GBT in a Pipeline
+    // Chain indexer and GBT in a Pipeline.
     val pipeline = new Pipeline()
       .setStages(Array(featureIndexer, gbt))
 
-    // Train model.  This also runs the indexer.
+    // Train model. This also runs the indexer.
     val model = pipeline.fit(trainingData)
 
     // Make predictions.
@@ -67,7 +67,7 @@ object GradientBoostedTreeRegressorExample {
     // Select example rows to display.
     predictions.select("prediction", "label", "features").show(5)
 
-    // Select (prediction, true label) and compute test error
+    // Select (prediction, true label) and compute test error.
     val evaluator = new RegressionEvaluator()
       .setLabelCol("label")
       .setPredictionCol("prediction")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
index 25be87811da903434b1b4aa578c8625be4c6450a..de96fb2979ad10af30b8117a982089da613214d6 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
@@ -22,10 +22,9 @@ import scala.language.reflectiveCalls
 
 import scopt.OptionParser
 
-import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.examples.mllib.AbstractParams
 import org.apache.spark.ml.regression.LinearRegression
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, SparkSession}
 
 /**
  * An example runner for linear regression with elastic-net (mixing L1/L2) regularization.
@@ -74,11 +73,11 @@ object LinearRegressionExample {
        s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
      .action((x, c) => c.copy(tol = x))
    opt[Double]("fracTest")
-      .text(s"fraction of data to hold out for testing.  If given option testInput, " +
+      .text(s"fraction of data to hold out for testing. If given option testInput, " +
        s"this option is ignored. default: ${defaultParams.fracTest}")
      .action((x, c) => c.copy(fracTest = x))
    opt[String]("testInput")
-      .text(s"input path to test dataset.  If given, option fracTest is ignored." +
+      .text(s"input path to test dataset. If given, option fracTest is ignored." +
        s" default: ${defaultParams.testInput}")
      .action((x, c) => c.copy(testInput = x))
    opt[String]("dataFormat")
@@ -105,13 +104,15 @@ object LinearRegressionExample {
   }
 
   def run(params: Params) {
-    val conf = new SparkConf().setAppName(s"LinearRegressionExample with $params")
-    val sc = new SparkContext(conf)
+    val spark = SparkSession
+      .builder
+      .appName(s"LinearRegressionExample with $params")
+      .getOrCreate()
 
     println(s"LinearRegressionExample with parameters:\n$params")
 
     // Load training and test data and cache it.
-    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
       params.dataFormat, params.testInput, "regression", params.fracTest)
 
     val lir = new LinearRegression()
@@ -136,7 +137,7 @@ object LinearRegressionExample {
     println("Test data results:")
     DecisionTreeExample.evaluateRegressionModel(lirModel, test, "label")
 
-    sc.stop()
+    spark.stop()
   }
 }
 // scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
index a380c90662a50356996715847f42015355d876d0..c2a87e1ddfd5529de451780688ff7adbcb5e8178 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
@@ -23,12 +23,11 @@ import scala.language.reflectiveCalls
 
 import scopt.OptionParser
 
-import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.examples.mllib.AbstractParams
 import org.apache.spark.ml.{Pipeline, PipelineStage}
 import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
 import org.apache.spark.ml.feature.StringIndexer
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, SparkSession}
 
 /**
  * An example runner for logistic regression with elastic-net (mixing L1/L2) regularization.
@@ -81,11 +80,11 @@ object LogisticRegressionExample {
        s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
      .action((x, c) => c.copy(tol = x))
    opt[Double]("fracTest")
-      .text(s"fraction of data to hold out for testing.  If given option testInput, " +
+      .text(s"fraction of data to hold out for testing. If given option testInput, " +
        s"this option is ignored. default: ${defaultParams.fracTest}")
      .action((x, c) => c.copy(fracTest = x))
    opt[String]("testInput")
-      .text(s"input path to test dataset.  If given, option fracTest is ignored." +
+      .text(s"input path to test dataset. If given, option fracTest is ignored." +
        s" default: ${defaultParams.testInput}")
      .action((x, c) => c.copy(testInput = x))
    opt[String]("dataFormat")
@@ -112,16 +111,18 @@ object LogisticRegressionExample {
   }
 
   def run(params: Params) {
-    val conf = new SparkConf().setAppName(s"LogisticRegressionExample with $params")
-    val sc = new SparkContext(conf)
+    val spark = SparkSession
+      .builder
+      .appName(s"LogisticRegressionExample with $params")
+      .getOrCreate()
 
     println(s"LogisticRegressionExample with parameters:\n$params")
 
     // Load training and test data and cache it.
-    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
       params.dataFormat, params.testInput, "classification", params.fracTest)
 
-    // Set up Pipeline
+    // Set up Pipeline.
val stages = new mutable.ArrayBuffer[PipelineStage]() val labelIndexer = new StringIndexer() @@ -141,7 +142,7 @@ object LogisticRegressionExample { stages += lor val pipeline = new Pipeline().setStages(stages.toArray) - // Fit the Pipeline + // Fit the Pipeline. val startTime = System.nanoTime() val pipelineModel = pipeline.fit(training) val elapsedTime = (System.nanoTime() - startTime) / 1e9 @@ -156,7 +157,7 @@ object LogisticRegressionExample { println("Test data results:") DecisionTreeExample.evaluateClassificationModel(pipelineModel, test, "indexedLabel") - sc.stop() + spark.stop() } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala index fcba813d5be4b70ded9329bdeabc35b0a2b2572e..616263b8e9f48c41bd3bb1231a0a2f1273fa279b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala @@ -27,7 +27,9 @@ object LogisticRegressionWithElasticNetExample { def main(args: Array[String]): Unit = { val spark = SparkSession - .builder.appName("LogisticRegressionWithElasticNetExample").getOrCreate() + .builder + .appName("LogisticRegressionWithElasticNetExample") + .getOrCreate() // $example on$ // Load training data diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala index 5fb3536060c911b61985e39edaa0e46911647e40..c29d36210ab1397f2a894956498a742273c621c2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala @@ -42,7 +42,9 @@ object ModelSelectionViaCrossValidationExample { def main(args: Array[String]): Unit = { val spark = SparkSession - .builder.appName("ModelSelectionViaCrossValidationExample").getOrCreate() + .builder + .appName("ModelSelectionViaCrossValidationExample") + .getOrCreate() // $example on$ // Prepare training data from a list of (id, text, label) tuples. diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala index 6bc082982c8940a22be1707d9f1fe04b5cbd121c..75fef2922adbdf89519fee48e10e8cce611565bd 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala @@ -36,7 +36,9 @@ object ModelSelectionViaTrainValidationSplitExample { def main(args: Array[String]): Unit = { val spark = SparkSession - .builder.appName("ModelSelectionViaTrainValidationSplitExample").getOrCreate() + .builder + .appName("ModelSelectionViaTrainValidationSplitExample") + .getOrCreate() // $example on$ // Prepare training and test data. 
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala index ae0bd945d8fed2d61e16e9602c9cbb6bc777ed2c..cccc4a6ea26b540e3ecab2a8902003f124ee2181 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala @@ -51,7 +51,7 @@ object RandomForestClassifierExample { .setMaxCategories(4) .fit(data) - // Split the data into training and test sets (30% held out for testing) + // Split the data into training and test sets (30% held out for testing). val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) // Train a RandomForest model. @@ -66,11 +66,11 @@ object RandomForestClassifierExample { .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels) - // Chain indexers and forest in a Pipeline + // Chain indexers and forest in a Pipeline. val pipeline = new Pipeline() .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter)) - // Train model. This also runs the indexers. + // Train model. This also runs the indexers. val model = pipeline.fit(trainingData) // Make predictions. @@ -79,7 +79,7 @@ object RandomForestClassifierExample { // Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5) - // Select (prediction, true label) and compute test error + // Select (prediction, true label) and compute test error. val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala index 7a00d99dfe53d11cbf3569df676515123900d255..2419dc49cd51e8f3abf356e45daa8e5838193a47 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala @@ -23,13 +23,12 @@ import scala.language.reflectiveCalls import scopt.OptionParser -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer} import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor} -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, SparkSession} /** @@ -37,7 +36,7 @@ import org.apache.spark.sql.DataFrame * {{{ * ./bin/run-example ml.RandomForestExample [options] * }}} - * Decision Trees and ensembles can take a large amount of memory. If the run-example command + * Decision Trees and ensembles can take a large amount of memory. If the run-example command * above fails, try running via spark-submit and specifying the amount of memory as at least 1g. * For local mode, run * {{{ @@ -94,7 +93,7 @@ object RandomForestExample { s" default: ${defaultParams.numTrees}") .action((x, c) => c.copy(featureSubsetStrategy = x)) opt[Double]("fracTest") - .text(s"fraction of data to hold out for testing. If given option testInput, " + + .text(s"fraction of data to hold out for testing. If given option testInput, " + s"this option is ignored. 
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
index 7a00d99dfe53d11cbf3569df676515123900d255..2419dc49cd51e8f3abf356e45daa8e5838193a47 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
@@ -23,13 +23,12 @@
 import scala.language.reflectiveCalls
 
 import scopt.OptionParser
 
-import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.examples.mllib.AbstractParams
 import org.apache.spark.ml.{Pipeline, PipelineStage}
 import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
 import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer}
 import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, SparkSession}
 
 /**
@@ -37,7 +36,7 @@ import org.apache.spark.sql.DataFrame
  * {{{
  * ./bin/run-example ml.RandomForestExample [options]
  * }}}
- * Decision Trees and ensembles can take a large amount of memory.  If the run-example command
+ * Decision Trees and ensembles can take a large amount of memory. If the run-example command
  * above fails, try running via spark-submit and specifying the amount of memory as at least 1g.
  * For local mode, run
  * {{{
@@ -94,7 +93,7 @@ object RandomForestExample {
           s" default: ${defaultParams.featureSubsetStrategy}")
         .action((x, c) => c.copy(featureSubsetStrategy = x))
       opt[Double]("fracTest")
-        .text(s"fraction of data to hold out for testing.  If given option testInput, " +
+        .text(s"fraction of data to hold out for testing. If given option testInput, " +
           s"this option is ignored. default: ${defaultParams.fracTest}")
         .action((x, c) => c.copy(fracTest = x))
       opt[Boolean]("cacheNodeIds")
@@ -115,7 +114,7 @@ object RandomForestExample {
           s"default: ${defaultParams.checkpointInterval}")
         .action((x, c) => c.copy(checkpointInterval = x))
       opt[String]("testInput")
-        .text(s"input path to test dataset.  If given, option fracTest is ignored." +
+        .text(s"input path to test dataset. If given, option fracTest is ignored." +
           s" default: ${defaultParams.testInput}")
         .action((x, c) => c.copy(testInput = x))
       opt[String]("dataFormat")
@@ -142,18 +141,21 @@ object RandomForestExample {
   }
 
   def run(params: Params) {
-    val conf = new SparkConf().setAppName(s"RandomForestExample with $params")
-    val sc = new SparkContext(conf)
-    params.checkpointDir.foreach(sc.setCheckpointDir)
+    val spark = SparkSession
+      .builder
+      .appName(s"RandomForestExample with $params")
+      .getOrCreate()
+
+    params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir)
     val algo = params.algo.toLowerCase
 
     println(s"RandomForestExample with parameters:\n$params")
 
     // Load training and test data and cache it.
-    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
+    val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
       params.dataFormat, params.testInput, algo, params.fracTest)
 
-    // Set up Pipeline
+    // Set up Pipeline.
     val stages = new mutable.ArrayBuffer[PipelineStage]()
     // (1) For classification, re-index classes.
     val labelColName = if (algo == "classification") "indexedLabel" else "label"
@@ -170,7 +172,7 @@ object RandomForestExample {
       .setOutputCol("indexedFeatures")
       .setMaxCategories(10)
     stages += featuresIndexer
-    // (3) Learn Random Forest
+    // (3) Learn Random Forest.
     val dt = algo match {
       case "classification" =>
         new RandomForestClassifier()
@@ -201,13 +203,13 @@ object RandomForestExample {
     stages += dt
     val pipeline = new Pipeline().setStages(stages.toArray)
 
-    // Fit the Pipeline
+    // Fit the Pipeline.
     val startTime = System.nanoTime()
     val pipelineModel = pipeline.fit(training)
     val elapsedTime = (System.nanoTime() - startTime) / 1e9
     println(s"Training time: $elapsedTime seconds")
 
-    // Get the trained Random Forest from the fitted PipelineModel
+    // Get the trained Random Forest from the fitted PipelineModel.
     algo match {
       case "classification" =>
         val rfModel = pipelineModel.stages.last.asInstanceOf[RandomForestClassificationModel]
@@ -226,7 +228,7 @@ object RandomForestExample {
       case _ => throw new IllegalArgumentException(s"Algo ${params.algo} not supported.")
     }
 
-    // Evaluate model on training, test data
+    // Evaluate model on training, test data.
     algo match {
       case "classification" =>
         println("Training data results:")
@@ -242,7 +244,7 @@ object RandomForestExample {
         throw new IllegalArgumentException(s"Algo ${params.algo} not supported.")
     }
 
-    sc.stop()
+    spark.stop()
   }
 }
 // scalastyle:on println
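The run() hunk is the substantial change in this file: the SparkConf/SparkContext pair becomes a single SparkSession, and the old SparkContext is reached through spark.sparkContext where an API (here setCheckpointDir) has no session-level equivalent; presumably DecisionTreeExample.loadDatasets now obtains the active session itself, which is why its sc argument is dropped. A minimal sketch of the checkpoint wiring, with an illustrative directory:

```scala
import org.apache.spark.sql.SparkSession

object CheckpointDirSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("CheckpointDirSketch")
      .getOrCreate()

    // The SparkContext still lives underneath the session; reach through to
    // it for APIs like setCheckpointDir. The path here is illustrative.
    val checkpointDir: Option[String] = Some("/tmp/spark-checkpoints")
    checkpointDir.foreach(spark.sparkContext.setCheckpointDir)

    spark.stop()
  }
}
```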
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala
index 96dc2f05be974c8caf8b4f19be5c89fd1c5b84f5..9a0a001c26ef5ef96816a02ac1d509cd9eef4f7b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala
@@ -45,7 +45,7 @@ object RandomForestRegressorExample {
       .setMaxCategories(4)
       .fit(data)
 
-    // Split the data into training and test sets (30% held out for testing)
+    // Split the data into training and test sets (30% held out for testing).
     val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
 
     // Train a RandomForest model.
@@ -53,11 +53,11 @@ object RandomForestRegressorExample {
       .setLabelCol("label")
       .setFeaturesCol("indexedFeatures")
 
-    // Chain indexer and forest in a Pipeline
+    // Chain indexer and forest in a Pipeline.
     val pipeline = new Pipeline()
       .setStages(Array(featureIndexer, rf))
 
-    // Train model.  This also runs the indexer.
+    // Train model. This also runs the indexer.
     val model = pipeline.fit(trainingData)
 
     // Make predictions.
@@ -66,7 +66,7 @@ object RandomForestRegressorExample {
     // Select example rows to display.
     predictions.select("prediction", "label", "features").show(5)
 
-    // Select (prediction, true label) and compute test error
+    // Select (prediction, true label) and compute test error.
     val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
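Both tree-regression examples close with the same evaluation step. A sketch of it as a standalone helper, assuming `predictions` is the output of model.transform(testData); "rmse" is RegressionEvaluator's default metric, so setting it explicitly is for readability only:

```scala
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.DataFrame

object RegressionEvalSketch {
  // Root mean squared error over the (prediction, label) columns.
  def rmse(predictions: DataFrame): Double = {
    new RegressionEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("rmse") // the default metric, set here for clarity.
      .evaluate(predictions)
  }
}
```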
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
index 3547dd95bdcedd727a6c2e59d94b284c93ae710f..83bab5c55758adcc5fb968a6cd73fa5ad4b774e1 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -41,7 +41,7 @@ object SimpleParamsExample {
     import spark.implicits._
 
     // Prepare training data.
-    // We use LabeledPoint, which is a case class.  Spark SQL can convert RDDs of case classes
+    // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes
     // into DataFrames, where it uses the case class metadata to infer the schema.
     val training = spark.createDataFrame(Seq(
       LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
@@ -49,7 +49,7 @@ object SimpleParamsExample {
       LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
       LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))
 
-    // Create a LogisticRegression instance.  This instance is an Estimator.
+    // Create a LogisticRegression instance. This instance is an Estimator.
     val lr = new LogisticRegression()
     // Print out the parameters, documentation, and any default values.
     println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
@@ -58,7 +58,7 @@ object SimpleParamsExample {
     lr.setMaxIter(10)
       .setRegParam(0.01)
 
-    // Learn a LogisticRegression model.  This uses the parameters stored in lr.
+    // Learn a LogisticRegression model. This uses the parameters stored in lr.
     val model1 = lr.fit(training)
     // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
     // we can view the parameters it used during fit().
@@ -69,7 +69,7 @@ object SimpleParamsExample {
     // We may alternatively specify parameters using a ParamMap,
     // which supports several methods for specifying parameters.
     val paramMap = ParamMap(lr.maxIter -> 20)
-    paramMap.put(lr.maxIter, 30)  // Specify 1 Param.  This overwrites the original maxIter.
+    paramMap.put(lr.maxIter, 30)  // Specify 1 Param. This overwrites the original maxIter.
     paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.45, 0.55))  // Specify multiple Params.
 
     // One can also combine ParamMaps.
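The ParamMap hunk above leans on two behaviors: calling put() for a Param already in the map overwrites the earlier value, and two maps can be merged with ++. A sketch under those semantics; the values are illustrative, and the commented fit() call assumes a `training` DataFrame like the one built earlier in this file:

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap

object ParamMapSketch {
  def main(args: Array[String]): Unit = {
    val lr = new LogisticRegression()

    val paramMap = ParamMap(lr.maxIter -> 20)
    paramMap.put(lr.maxIter, 30) // Overwrites the earlier value of 20.

    val paramMap2 = ParamMap(lr.regParam -> 0.1)
    val combined = paramMap ++ paramMap2 // Combine ParamMaps.

    // Params given at fit() time override those set on lr itself:
    // val model = lr.fit(training, combined)
    println(combined)
  }
}
```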