Commit e2ec32da authored by hyukjinkwon, committed by Nick Pentreath

[SPARK-15031][EXAMPLES][FOLLOW-UP] Make Python param example work with SparkSession

## What changes were proposed in this pull request?

It seems most of the Python examples were changed to use SparkSession by https://github.com/apache/spark/pull/12809. That PR noted that both of the examples below:

- `simple_params_example.py`
- `aft_survival_regression.py`

were left unchanged because they did not work. `aft_survival_regression.py` was later fixed by https://github.com/apache/spark/pull/13050, but `simple_params_example.py` was not.

This PR corrects `simple_params_example.py` and makes it use SparkSession.

In more detail, it seems `threshold` was replaced with `thresholds` here and there by https://github.com/apache/spark/commit/5a23213c148bfe362514f9c71f5273ebda0a848a. However, when `lr.fit(training, paramMap)` is called, the param map overwrites the values set earlier. So `threshold` stayed at 0.5 while the threshold implied by `thresholds` became 0.55 (from `1 / (1 + thresholds(0) / thresholds(1))`), and the two values were inconsistent.

According to the comment at https://github.com/apache/spark/blob/354f8f11bd4b20fa99bd67a98da3525fd3d75c81/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala#L58-L61, setting inconsistent values for `threshold` and `thresholds` is not allowed.

So, in this PR, `thresholds` is set to a value equivalent to the existing `threshold` so that no exception is thrown.
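
As a quick illustration of the arithmetic above (a sketch only, not Spark's actual validation code), the binary threshold implied by a two-element `thresholds` array can be computed directly:

```python
# Sketch only: the threshold implied by a two-element thresholds array,
# per the formula quoted above. Not Spark's actual validation code.
def implied_threshold(thresholds):
    return 1.0 / (1.0 + thresholds[0] / thresholds[1])

print(implied_threshold([0.45, 0.55]))  # 0.55 -- conflicts with the existing threshold of 0.5
print(implied_threshold([0.5, 0.5]))    # 0.5  -- consistent, so no exception is thrown
```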

## How was this patch tested?

Manually (`mvn package -DskipTests && spark-submit simple_params_example.py`)

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #13135 from HyukjinKwon/SPARK-15031.
parent 661c2104
JavaSimpleParamsExample.java
@@ -77,7 +77,7 @@ public class JavaSimpleParamsExample {
     ParamMap paramMap = new ParamMap();
     paramMap.put(lr.maxIter().w(20)); // Specify 1 Param.
     paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter.
-    double[] thresholds = {0.45, 0.55};
+    double[] thresholds = {0.5, 0.5};
     paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds)); // Specify multiple Params.
     // One can also combine ParamMaps.
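
For reference, a rough Python counterpart of the Param manipulation shown in the Java snippet above; this is a hypothetical sketch (the estimator and values are illustrative), not part of the patch:

```python
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.01)
lr.setMaxIter(30)  # overwrites the maxIter given in the constructor
print(lr.explainParams())                # list every Param with its current value
print(lr.extractParamMap()[lr.maxIter])  # 30
```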
simple_params_example.py
@@ -20,11 +20,10 @@ from __future__ import print_function
 import pprint
 import sys
 
-from pyspark import SparkContext
 from pyspark.ml.classification import LogisticRegression
 from pyspark.mllib.linalg import DenseVector
 from pyspark.mllib.regression import LabeledPoint
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
 
 """
 A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -33,21 +32,20 @@ Run with:
 """
 
 if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        print("Usage: simple_params_example", file=sys.stderr)
-        exit(1)
-    sc = SparkContext(appName="PythonSimpleParamsExample")
-    sqlContext = SQLContext(sc)
+    spark = SparkSession \
+        .builder \
+        .appName("SimpleTextClassificationPipeline") \
+        .getOrCreate()
 
     # prepare training data.
     # We create an RDD of LabeledPoints and convert them into a DataFrame.
     # A LabeledPoint is an Object with two fields named label and features
     # and Spark SQL identifies these fields and creates the schema appropriately.
-    training = sc.parallelize([
+    training = spark.createDataFrame([
         LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
         LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
         LabeledPoint(0.0, DenseVector([2.0, 1.3, 1.0])),
-        LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))]).toDF()
+        LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))])
 
     # Create a LogisticRegression instance with maxIter = 10.
     # This instance is an Estimator.
@@ -70,7 +68,7 @@ if __name__ == "__main__":
 
     # We may alternatively specify parameters using a parameter map.
     # paramMap overrides all lr parameters set earlier.
-    paramMap = {lr.maxIter: 20, lr.thresholds: [0.45, 0.55], lr.probabilityCol: "myProbability"}
+    paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"}
 
     # Now learn a new model using the new parameters.
     model2 = lr.fit(training, paramMap)
@@ -78,10 +76,10 @@ if __name__ == "__main__":
     pprint.pprint(model2.extractParamMap())
 
     # prepare test data.
-    test = sc.parallelize([
+    test = spark.createDataFrame([
         LabeledPoint(1.0, DenseVector([-1.0, 1.5, 1.3])),
         LabeledPoint(0.0, DenseVector([3.0, 2.0, -0.1])),
-        LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))]).toDF()
+        LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))])
 
     # Make predictions on test data using the Transformer.transform() method.
     # LogisticRegressionModel.transform will only use the 'features' column.
@@ -95,4 +93,4 @@ if __name__ == "__main__":
         print("features=%s,label=%s -> prob=%s, prediction=%s"
               % (row.features, row.label, row.myProbability, row.prediction))
 
-    sc.stop()
+    spark.stop()
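
The param-map override that the example relies on (and that the commit message describes) can be summarized as follows; a minimal sketch assuming the `spark` session and `training` DataFrame created in the example above:

```python
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10)
# Params in the dict passed to fit() override anything set on lr earlier.
paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"}
model2 = lr.fit(training, paramMap)

# Because probabilityCol was renamed, the output column is 'myProbability'.
model2.transform(training).select("label", "myProbability", "prediction").show()
```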
SimpleParamsExample.scala
@@ -70,7 +70,7 @@ object SimpleParamsExample {
     // which supports several methods for specifying parameters.
     val paramMap = ParamMap(lr.maxIter -> 20)
     paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
-    paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.45, 0.55)) // Specify multiple Params.
+    paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.5, 0.5)) // Specify multiple Params.
     // One can also combine ParamMaps.
     val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name
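
The "combine ParamMaps" step mentioned in the Scala and Java context lines has no dedicated ParamMap class in PySpark, where a param map is a plain dict. A minimal sketch, assuming the `lr` estimator and `training` DataFrame from the Python example above:

```python
paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5]}
paramMap2 = {lr.probabilityCol: "myProbability"}  # change the output column name

combined = dict(paramMap)
combined.update(paramMap2)  # later entries win on duplicate keys
model2 = lr.fit(training, combined)
```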