Commit e3f8a033 authored by Bryan Cutler, committed by Sean Owen

[SPARK-16403][EXAMPLES] Cleanup to remove unused imports, consistent style, minor fixes

## What changes were proposed in this pull request?

Cleanup of the examples, mostly in PySpark-ML, to fix minor issues: unused imports, inconsistent style, a duplicated pipeline example, use of the future print function, and a spelling error.

* The "Pipeline Example" is duplicated by "Simple Text Classification Pipeline" in Scala, Python, and Java.

* "Estimator Transformer Param Example" is duplicated by "Simple Params Example" in Scala, Python and Java

* Synced random_forest_classifier_example.py with Scala by adding an IndexToString label converter (see the sketches after this list).

* Synced train_validation_split.py with the Scala version (ModelSelectionViaTrainValidationExample) by adjusting the data split and adding a grid for the intercept.

* RegexTokenizer was doing nothing in tokenizer_example.py and JavaTokenizerExample.java; synced them with the Scala version (a sketch follows this list).
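
As a rough illustration of the tokenizer sync, here is a minimal PySpark sketch of how the tokenizer example reads after this change: both the plain Tokenizer and the RegexTokenizer are actually applied to the input. The sample sentences, appName, and use of show() are illustrative choices, not part of the original file.

```python
from pyspark.ml.feature import RegexTokenizer, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("TokenizerExampleSketch").getOrCreate()

sentenceDataFrame = spark.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic,regression,models,are,neat")
], ["label", "sentence"])

# Whitespace-based tokenizer.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized = tokenizer.transform(sentenceDataFrame)

# RegexTokenizer splits on non-word characters; previously it was constructed
# but never applied, so its output was never shown.
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# alternatively: pattern="\\w+", gaps=False
regexTokenized = regexTokenizer.transform(sentenceDataFrame)

tokenized.select("words", "label").show(truncate=False)
regexTokenized.select("words", "label").show(truncate=False)

spark.stop()
```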

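Similarly, here is a minimal sketch of the IndexToString label converter now used in random_forest_classifier_example.py, assuming the sample_libsvm_data.txt dataset that ships with Spark. The VectorIndexer stage of the full example is omitted for brevity, and the numTrees value and 70/30 split are illustrative.

```python
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RandomForestIndexToStringSketch").getOrCreate()

data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=10)

# Convert indexed predictions back to the original labels, as in the Scala example.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

pipeline = Pipeline(stages=[labelIndexer, rf, labelConverter])

trainingData, testData = data.randomSplit([0.7, 0.3])
model = pipeline.fit(trainingData)

predictions = model.transform(testData)
predictions.select("predictedLabel", "label", "features").show(5)

spark.stop()
```
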
## How was this patch tested?
Local tests and running the modified examples.

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #14081 from BryanCutler/examples-cleanup-SPARK-16403.
parent 252d4f27
Showing changed files with 56 additions and 245 deletions
......@@ -60,7 +60,7 @@ public class JavaPipelineExample {
.setOutputCol("features");
LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(0.01);
.setRegParam(0.001);
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
......@@ -71,7 +71,7 @@ public class JavaPipelineExample {
Dataset<Row> test = spark.createDataFrame(Arrays.asList(
new JavaDocument(4L, "spark i j k"),
new JavaDocument(5L, "l m n"),
new JavaDocument(6L, "mapreduce spark"),
new JavaDocument(6L, "spark hadoop spark"),
new JavaDocument(7L, "apache hadoop")
), JavaDocument.class);
......
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.ml;
import java.util.List;
import com.google.common.collect.Lists;
import org.apache.spark.ml.classification.LogisticRegressionModel;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
/**
* A simple example demonstrating ways to specify parameters for Estimators and Transformers.
* Run with
* {{{
* bin/run-example ml.JavaSimpleParamsExample
* }}}
*/
public class JavaSimpleParamsExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.appName("JavaSimpleParamsExample")
.getOrCreate();
// Prepare training data.
// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
// into DataFrames, where it uses the bean metadata to infer the schema.
List<LabeledPoint> localTraining = Lists.newArrayList(
new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
Dataset<Row> training =
spark.createDataFrame(localTraining, LabeledPoint.class);
// Create a LogisticRegression instance. This instance is an Estimator.
LogisticRegression lr = new LogisticRegression();
// Print out the parameters, documentation, and any default values.
System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
// We may set parameters using setter methods.
lr.setMaxIter(10)
.setRegParam(0.01);
// Learn a LogisticRegression model. This uses the parameters stored in lr.
LogisticRegressionModel model1 = lr.fit(training);
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
// we can view the parameters it used during fit().
// This prints the parameter (name: value) pairs, where names are unique IDs for this
// LogisticRegression instance.
System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());
// We may alternatively specify parameters using a ParamMap.
ParamMap paramMap = new ParamMap();
paramMap.put(lr.maxIter().w(20)); // Specify 1 Param.
paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter.
double[] thresholds = {0.5, 0.5};
paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds)); // Specify multiple Params.
// One can also combine ParamMaps.
ParamMap paramMap2 = new ParamMap();
paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name.
ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
// Now learn a new model using the paramMapCombined parameters.
// paramMapCombined overrides all parameters set earlier via lr.set* methods.
LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());
// Prepare test documents.
List<LabeledPoint> localTest = Lists.newArrayList(
new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)));
Dataset<Row> test = spark.createDataFrame(localTest, LabeledPoint.class);
// Make predictions on test documents using the Transformer.transform() method.
// LogisticRegressionModel.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'myProbability' column instead of the usual
// 'probability' column since we renamed the lr.probabilityCol parameter previously.
Dataset<Row> results = model2.transform(test);
Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
for (Row r: rows.collectAsList()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
+ ", prediction=" + r.get(3));
}
spark.stop();
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.ml;
import java.util.List;
import com.google.common.collect.Lists;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
/**
* A simple text classification pipeline that recognizes "spark" from input text. It uses the Java
* bean classes {@link LabeledDocument} and {@link Document} defined in the Scala counterpart of
* this example {@link SimpleTextClassificationPipeline}. Run with
* <pre>
* bin/run-example ml.JavaSimpleTextClassificationPipeline
* </pre>
*/
public class JavaSimpleTextClassificationPipeline {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.appName("JavaSimpleTextClassificationPipeline")
.getOrCreate();
// Prepare training documents, which are labeled.
List<LabeledDocument> localTraining = Lists.newArrayList(
new LabeledDocument(0L, "a b c d e spark", 1.0),
new LabeledDocument(1L, "b d", 0.0),
new LabeledDocument(2L, "spark f g h", 1.0),
new LabeledDocument(3L, "hadoop mapreduce", 0.0));
Dataset<Row> training =
spark.createDataFrame(localTraining, LabeledDocument.class);
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
Tokenizer tokenizer = new Tokenizer()
.setInputCol("text")
.setOutputCol("words");
HashingTF hashingTF = new HashingTF()
.setNumFeatures(1000)
.setInputCol(tokenizer.getOutputCol())
.setOutputCol("features");
LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(0.001);
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
// Fit the pipeline to training documents.
PipelineModel model = pipeline.fit(training);
// Prepare test documents, which are unlabeled.
List<Document> localTest = Lists.newArrayList(
new Document(4L, "spark i j k"),
new Document(5L, "l m n"),
new Document(6L, "spark hadoop spark"),
new Document(7L, "apache hadoop"));
Dataset<Row> test = spark.createDataFrame(localTest, Document.class);
// Make predictions on test documents.
Dataset<Row> predictions = model.transform(test);
for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
+ ", prediction=" + r.get(3));
}
spark.stop();
}
}
......@@ -47,7 +47,7 @@ public class JavaStopWordsRemoverExample {
.setOutputCol("filtered");
List<Row> data = Arrays.asList(
RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
);
......
......@@ -57,17 +57,24 @@ public class JavaTokenizerExample {
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
Dataset<Row> wordsDataFrame = tokenizer.transform(sentenceDataFrame);
for (Row r : wordsDataFrame.select("words", "label").takeAsList(3)) {
RegexTokenizer regexTokenizer = new RegexTokenizer()
.setInputCol("sentence")
.setOutputCol("words")
.setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
for (Row r : tokenized.select("words", "label").takeAsList(3)) {
java.util.List<String> words = r.getList(0);
for (String word : words) System.out.print(word + " ");
System.out.println();
}
RegexTokenizer regexTokenizer = new RegexTokenizer()
.setInputCol("sentence")
.setOutputCol("words")
.setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
for (Row r : regexTokenized.select("words", "label").takeAsList(3)) {
java.util.List<String> words = r.getList(0);
for (String word : words) System.out.print(word + " ");
System.out.println();
}
// $example off$
spark.stop();
}
......
......@@ -32,7 +32,7 @@ Run with:
if __name__ == "__main__":
spark = SparkSession \
.builder \
.appName("PythonAFTSurvivalRegressionExample") \
.appName("AFTSurvivalRegressionExample") \
.getOrCreate()
# $example on$
......
......@@ -31,7 +31,7 @@ Run with:
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("PythonBisectingKMeansExample")\
.appName("BisectingKMeansExample")\
.getOrCreate()
# $example on$
......
......@@ -24,7 +24,7 @@ from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# $example off$
from pyspark.sql import Row, SparkSession
from pyspark.sql import SparkSession
"""
A simple example demonstrating model selection using CrossValidator.
......@@ -39,6 +39,7 @@ if __name__ == "__main__":
.builder\
.appName("CrossValidatorExample")\
.getOrCreate()
# $example on$
# Prepare training documents, which are labeled.
training = spark.createDataFrame([
......
......@@ -34,15 +34,16 @@ if __name__ == "__main__":
if len(sys.argv) > 2:
print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr)
exit(-1)
spark = SparkSession\
.builder\
.appName("DataFrameExample")\
.getOrCreate()
if len(sys.argv) == 2:
elif len(sys.argv) == 2:
input = sys.argv[1]
else:
input = "data/mllib/sample_libsvm_data.txt"
spark = SparkSession \
.builder \
.appName("DataFrameExample") \
.getOrCreate()
# Load input data
print("Loading LIBSVM file with UDT from " + input + ".")
df = spark.read.format("libsvm").load(input).cache()
......
......@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("decision_tree_classification_example")\
.appName("DecisionTreeClassificationExample")\
.getOrCreate()
# $example on$
......
......@@ -18,6 +18,7 @@
"""
Estimator Transformer Param Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml.linalg import Vectors
......@@ -42,7 +43,7 @@ if __name__ == "__main__":
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print "LogisticRegression parameters:\n" + lr.explainParams() + "\n"
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)
......@@ -51,8 +52,8 @@ if __name__ == "__main__":
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print "Model 1 was fit using parameters: "
print model1.extractParamMap()
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())
# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
......@@ -67,8 +68,8 @@ if __name__ == "__main__":
# Now learn a new model using the paramMapCombined parameters.
# paramMapCombined overrides all parameters set earlier via lr.set* methods.
model2 = lr.fit(training, paramMapCombined)
print "Model 2 was fit using parameters: "
print model2.extractParamMap()
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())
# Prepare test data
test = spark.createDataFrame([
......@@ -81,9 +82,12 @@ if __name__ == "__main__":
# Note that model2.transform() outputs a "myProbability" column instead of the usual
# 'probability' column since we renamed the lr.probabilityCol parameter previously.
prediction = model2.transform(test)
selected = prediction.select("features", "label", "myProbability", "prediction")
for row in selected.collect():
print row
result = prediction.select("features", "label", "myProbability", "prediction") \
.collect()
for row in result:
print("features=%s, label=%s -> prob=%s, prediction=%s"
% (row.features, row.label, row.myProbability, row.prediction))
# $example off$
spark.stop()
......@@ -31,7 +31,7 @@ Run with:
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("PythonGuassianMixtureExample")\
.appName("GaussianMixtureExample")\
.getOrCreate()
# $example on$
......
......@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("gradient_boosted_tree_classifier_example")\
.appName("GradientBoostedTreeClassifierExample")\
.getOrCreate()
# $example on$
......
......@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("gradient_boosted_tree_regressor_example")\
.appName("GradientBoostedTreeRegressorExample")\
.getOrCreate()
# $example on$
......
......@@ -21,7 +21,7 @@ Isotonic Regression Example.
from __future__ import print_function
# $example on$
from pyspark.ml.regression import IsotonicRegression, IsotonicRegressionModel
from pyspark.ml.regression import IsotonicRegression
# $example off$
from pyspark.sql import SparkSession
......@@ -30,11 +30,11 @@ An example demonstrating isotonic regression.
Run with:
bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py
"""
if __name__ == "__main__":
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("PythonIsotonicRegressionExample")\
.appName("IsotonicRegressionExample")\
.getOrCreate()
# $example on$
......
......@@ -31,12 +31,10 @@ Run with:
This example requires NumPy (http://www.numpy.org/).
"""
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("PythonKMeansExample")\
.appName("KMeansExample")\
.getOrCreate()
# $example on$
......
......@@ -23,16 +23,13 @@ from pyspark.ml.clustering import LDA
# $example off$
from pyspark.sql import SparkSession
"""
An example demonstrating LDA.
Run with:
bin/spark-submit examples/src/main/python/ml/lda_example.py
"""
if __name__ == "__main__":
# Creates a SparkSession
spark = SparkSession \
.builder \
.appName("LDAExample") \
......
......@@ -31,18 +31,23 @@ if __name__ == "__main__":
# Load training data
data = spark.read.format("libsvm")\
.load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
# train the model
model = trainer.fit(train)
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
......
......@@ -34,8 +34,10 @@ if __name__ == "__main__":
(1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
(2, ["Logistic", "regression", "models", "are", "neat"])
], ["label", "words"])
ngram = NGram(inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)
for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
print(ngrams_label)
# $example off$
......
......@@ -26,13 +26,14 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("naive_bayes_example")\
.appName("NaiveBayesExample")\
.getOrCreate()
# $example on$
# Load training data
data = spark.read.format("libsvm") \
.load("data/mllib/sample_libsvm_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
......@@ -43,6 +44,7 @@ if __name__ == "__main__":
# train the model
model = nb.fit(train)
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
......