Commit e3f8a033 authored by Bryan Cutler, committed by Sean Owen

[SPARK-16403][EXAMPLES] Cleanup to remove unused imports, consistent style, minor fixes

## What changes were proposed in this pull request?

Cleanup of the examples, mostly in PySpark-ML, to fix minor issues: unused imports, inconsistent style, a duplicated pipeline example, use of the future print function, and a spelling error.

* The "Pipeline Example" is duplicated by "Simple Text Classification Pipeline" in Scala, Python, and Java.

* "Estimator Transformer Param Example" is duplicated by "Simple Params Example" in Scala, Python and Java

* Synced random_forest_classifier_example.py with Scala by adding an IndexToString label converter (see the sketches after this list).

* Synced train_validation_split.py with the Scala version (ModelSelectionViaTrainValidationExample) by adjusting the data split and adding a grid for the intercept.

* RegexTokenizer was doing nothing in tokenizer_example.py and JavaTokenizerExample.java; synced them with the Scala version (a sketch follows this list).
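
As a rough illustration of the tokenizer sync, here is a minimal PySpark sketch of how the tokenizer example reads after this change: both the plain Tokenizer and the RegexTokenizer are actually applied to the input. The sample sentences, appName, and use of show() are illustrative choices, not part of the original file.

```python
from pyspark.ml.feature import RegexTokenizer, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("TokenizerExampleSketch").getOrCreate()

sentenceDataFrame = spark.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic,regression,models,are,neat")
], ["label", "sentence"])

# Whitespace-based tokenizer.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized = tokenizer.transform(sentenceDataFrame)

# RegexTokenizer splits on non-word characters; previously it was constructed
# but never applied, so its output was never shown.
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# alternatively: pattern="\\w+", gaps=False
regexTokenized = regexTokenizer.transform(sentenceDataFrame)

tokenized.select("words", "label").show(truncate=False)
regexTokenized.select("words", "label").show(truncate=False)

spark.stop()
```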

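Similarly, here is a minimal sketch of the IndexToString label converter now used in random_forest_classifier_example.py, assuming the sample_libsvm_data.txt dataset that ships with Spark. The VectorIndexer stage of the full example is omitted for brevity, and the numTrees value and 70/30 split are illustrative.

```python
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RandomForestIndexToStringSketch").getOrCreate()

data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=10)

# Convert indexed predictions back to the original labels, as in the Scala example.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

pipeline = Pipeline(stages=[labelIndexer, rf, labelConverter])

trainingData, testData = data.randomSplit([0.7, 0.3])
model = pipeline.fit(trainingData)

predictions = model.transform(testData)
predictions.select("predictedLabel", "label", "features").show(5)

spark.stop()
```
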
## How was this patch tested?
Local tests and running the modified examples.

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #14081 from BryanCutler/examples-cleanup-SPARK-16403.
parent 252d4f27
Showing changed files with 56 additions and 245 deletions
......@@ -60,7 +60,7 @@ public class JavaPipelineExample {
.setOutputCol("features");
LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(0.01);
.setRegParam(0.001);
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
......@@ -71,7 +71,7 @@ public class JavaPipelineExample {
Dataset<Row> test = spark.createDataFrame(Arrays.asList(
new JavaDocument(4L, "spark i j k"),
new JavaDocument(5L, "l m n"),
new JavaDocument(6L, "mapreduce spark"),
new JavaDocument(6L, "spark hadoop spark"),
new JavaDocument(7L, "apache hadoop")
), JavaDocument.class);
......
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.ml;
import java.util.List;
import com.google.common.collect.Lists;
import org.apache.spark.ml.classification.LogisticRegressionModel;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
/**
* A simple example demonstrating ways to specify parameters for Estimators and Transformers.
* Run with
* {{{
* bin/run-example ml.JavaSimpleParamsExample
* }}}
*/
public class JavaSimpleParamsExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.appName("JavaSimpleParamsExample")
.getOrCreate();
// Prepare training data.
// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
// into DataFrames, where it uses the bean metadata to infer the schema.
List<LabeledPoint> localTraining = Lists.newArrayList(
new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
Dataset<Row> training =
spark.createDataFrame(localTraining, LabeledPoint.class);
// Create a LogisticRegression instance. This instance is an Estimator.
LogisticRegression lr = new LogisticRegression();
// Print out the parameters, documentation, and any default values.
System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
// We may set parameters using setter methods.
lr.setMaxIter(10)
.setRegParam(0.01);
// Learn a LogisticRegression model. This uses the parameters stored in lr.
LogisticRegressionModel model1 = lr.fit(training);
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
// we can view the parameters it used during fit().
// This prints the parameter (name: value) pairs, where names are unique IDs for this
// LogisticRegression instance.
System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());
// We may alternatively specify parameters using a ParamMap.
ParamMap paramMap = new ParamMap();
paramMap.put(lr.maxIter().w(20)); // Specify 1 Param.
paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter.
double[] thresholds = {0.5, 0.5};
paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds)); // Specify multiple Params.
// One can also combine ParamMaps.
ParamMap paramMap2 = new ParamMap();
paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name.
ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
// Now learn a new model using the paramMapCombined parameters.
// paramMapCombined overrides all parameters set earlier via lr.set* methods.
LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());
// Prepare test documents.
List<LabeledPoint> localTest = Lists.newArrayList(
new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)));
Dataset<Row> test = spark.createDataFrame(localTest, LabeledPoint.class);
// Make predictions on test documents using the Transformer.transform() method.
// LogisticRegressionModel.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'myProbability' column instead of the usual
// 'probability' column since we renamed the lr.probabilityCol parameter previously.
Dataset<Row> results = model2.transform(test);
Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
for (Row r: rows.collectAsList()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
+ ", prediction=" + r.get(3));
}
spark.stop();
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.ml;
import java.util.List;
import com.google.common.collect.Lists;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
/**
* A simple text classification pipeline that recognizes "spark" from input text. It uses the Java
* bean classes {@link LabeledDocument} and {@link Document} defined in the Scala counterpart of
* this example {@link SimpleTextClassificationPipeline}. Run with
* <pre>
* bin/run-example ml.JavaSimpleTextClassificationPipeline
* </pre>
*/
public class JavaSimpleTextClassificationPipeline {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.appName("JavaSimpleTextClassificationPipeline")
.getOrCreate();
// Prepare training documents, which are labeled.
List<LabeledDocument> localTraining = Lists.newArrayList(
new LabeledDocument(0L, "a b c d e spark", 1.0),
new LabeledDocument(1L, "b d", 0.0),
new LabeledDocument(2L, "spark f g h", 1.0),
new LabeledDocument(3L, "hadoop mapreduce", 0.0));
Dataset<Row> training =
spark.createDataFrame(localTraining, LabeledDocument.class);
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
Tokenizer tokenizer = new Tokenizer()
.setInputCol("text")
.setOutputCol("words");
HashingTF hashingTF = new HashingTF()
.setNumFeatures(1000)
.setInputCol(tokenizer.getOutputCol())
.setOutputCol("features");
LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(0.001);
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
// Fit the pipeline to training documents.
PipelineModel model = pipeline.fit(training);
// Prepare test documents, which are unlabeled.
List<Document> localTest = Lists.newArrayList(
new Document(4L, "spark i j k"),
new Document(5L, "l m n"),
new Document(6L, "spark hadoop spark"),
new Document(7L, "apache hadoop"));
Dataset<Row> test = spark.createDataFrame(localTest, Document.class);
// Make predictions on test documents.
Dataset<Row> predictions = model.transform(test);
for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
+ ", prediction=" + r.get(3));
}
spark.stop();
}
}
......@@ -47,7 +47,7 @@ public class JavaStopWordsRemoverExample {
.setOutputCol("filtered");
List<Row> data = Arrays.asList(
RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
);
......
......@@ -57,17 +57,24 @@ public class JavaTokenizerExample {
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
Dataset<Row> wordsDataFrame = tokenizer.transform(sentenceDataFrame);
for (Row r : wordsDataFrame.select("words", "label").takeAsList(3)) {
RegexTokenizer regexTokenizer = new RegexTokenizer()
.setInputCol("sentence")
.setOutputCol("words")
.setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
for (Row r : tokenized.select("words", "label").takeAsList(3)) {
java.util.List<String> words = r.getList(0);
for (String word : words) System.out.print(word + " ");
System.out.println();
}
RegexTokenizer regexTokenizer = new RegexTokenizer()
.setInputCol("sentence")
.setOutputCol("words")
.setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
for (Row r : regexTokenized.select("words", "label").takeAsList(3)) {
java.util.List<String> words = r.getList(0);
for (String word : words) System.out.print(word + " ");
System.out.println();
}
// $example off$
spark.stop();
}
......
......@@ -32,7 +32,7 @@ Run with:
if __name__ == "__main__":
spark = SparkSession \
.builder \
.appName("PythonAFTSurvivalRegressionExample") \
.appName("AFTSurvivalRegressionExample") \
.getOrCreate()
# $example on$
......
......@@ -31,7 +31,7 @@ Run with:
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("PythonBisectingKMeansExample")\
.appName("BisectingKMeansExample")\
.getOrCreate()
# $example on$
......
......@@ -24,7 +24,7 @@ from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# $example off$
from pyspark.sql import Row, SparkSession
from pyspark.sql import SparkSession
"""
A simple example demonstrating model selection using CrossValidator.
......@@ -39,6 +39,7 @@ if __name__ == "__main__":
.builder\
.appName("CrossValidatorExample")\
.getOrCreate()
# $example on$
# Prepare training documents, which are labeled.
training = spark.createDataFrame([
......
......@@ -34,15 +34,16 @@ if __name__ == "__main__":
if len(sys.argv) > 2:
print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr)
exit(-1)
spark = SparkSession\
.builder\
.appName("DataFrameExample")\
.getOrCreate()
if len(sys.argv) == 2:
elif len(sys.argv) == 2:
input = sys.argv[1]
else:
input = "data/mllib/sample_libsvm_data.txt"
spark = SparkSession \
.builder \
.appName("DataFrameExample") \
.getOrCreate()
# Load input data
print("Loading LIBSVM file with UDT from " + input + ".")
df = spark.read.format("libsvm").load(input).cache()
......
......@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("decision_tree_classification_example")\
.appName("DecisionTreeClassificationExample")\
.getOrCreate()
# $example on$
......
......@@ -18,6 +18,7 @@
"""
Estimator Transformer Param Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml.linalg import Vectors
......@@ -42,7 +43,7 @@ if __name__ == "__main__":
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print "LogisticRegression parameters:\n" + lr.explainParams() + "\n"
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)
......@@ -51,8 +52,8 @@ if __name__ == "__main__":
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print "Model 1 was fit using parameters: "
print model1.extractParamMap()
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())
# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
......@@ -67,8 +68,8 @@ if __name__ == "__main__":
# Now learn a new model using the paramMapCombined parameters.
# paramMapCombined overrides all parameters set earlier via lr.set* methods.
model2 = lr.fit(training, paramMapCombined)
print "Model 2 was fit using parameters: "
print model2.extractParamMap()
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())
# Prepare test data
test = spark.createDataFrame([
......@@ -81,9 +82,12 @@ if __name__ == "__main__":
# Note that model2.transform() outputs a "myProbability" column instead of the usual
# 'probability' column since we renamed the lr.probabilityCol parameter previously.
prediction = model2.transform(test)
selected = prediction.select("features", "label", "myProbability", "prediction")
for row in selected.collect():
print row
result = prediction.select("features", "label", "myProbability", "prediction") \
.collect()
for row in result:
print("features=%s, label=%s -> prob=%s, prediction=%s"
% (row.features, row.label, row.myProbability, row.prediction))
# $example off$
spark.stop()
......@@ -31,7 +31,7 @@ Run with:
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("PythonGuassianMixtureExample")\
.appName("GaussianMixtureExample")\
.getOrCreate()
# $example on$
......
......@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("gradient_boosted_tree_classifier_example")\
.appName("GradientBoostedTreeClassifierExample")\
.getOrCreate()
# $example on$
......
......@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("gradient_boosted_tree_regressor_example")\
.appName("GradientBoostedTreeRegressorExample")\
.getOrCreate()
# $example on$
......
......@@ -21,7 +21,7 @@ Isotonic Regression Example.
from __future__ import print_function
# $example on$
from pyspark.ml.regression import IsotonicRegression, IsotonicRegressionModel
from pyspark.ml.regression import IsotonicRegression
# $example off$
from pyspark.sql import SparkSession
......@@ -30,11 +30,11 @@ An example demonstrating isotonic regression.
Run with:
bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py
"""
if __name__ == "__main__":
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("PythonIsotonicRegressionExample")\
.appName("IsotonicRegressionExample")\
.getOrCreate()
# $example on$
......
......@@ -31,12 +31,10 @@ Run with:
This example requires NumPy (http://www.numpy.org/).
"""
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("PythonKMeansExample")\
.appName("KMeansExample")\
.getOrCreate()
# $example on$
......
......@@ -23,16 +23,13 @@ from pyspark.ml.clustering import LDA
# $example off$
from pyspark.sql import SparkSession
"""
An example demonstrating LDA.
Run with:
bin/spark-submit examples/src/main/python/ml/lda_example.py
"""
if __name__ == "__main__":
# Creates a SparkSession
spark = SparkSession \
.builder \
.appName("LDAExample") \
......
......@@ -31,18 +31,23 @@ if __name__ == "__main__":
# Load training data
data = spark.read.format("libsvm")\
.load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
# train the model
model = trainer.fit(train)
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
......
......@@ -34,8 +34,10 @@ if __name__ == "__main__":
(1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
(2, ["Logistic", "regression", "models", "are", "neat"])
], ["label", "words"])
ngram = NGram(inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)
for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
print(ngrams_label)
# $example off$
......
......@@ -26,13 +26,14 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("naive_bayes_example")\
.appName("NaiveBayesExample")\
.getOrCreate()
# $example on$
# Load training data
data = spark.read.format("libsvm") \
.load("data/mllib/sample_libsvm_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
......@@ -43,6 +44,7 @@ if __name__ == "__main__":
# train the model
model = nb.fit(train)
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
......