diff --git a/data/mllib/sample_isotonic_regression_data.txt b/data/mllib/sample_isotonic_regression_data.txt deleted file mode 100644 index d257b509d4d37fa88409f228c70f1e31f6c666b1..0000000000000000000000000000000000000000 --- a/data/mllib/sample_isotonic_regression_data.txt +++ /dev/null @@ -1,100 +0,0 @@ -0.24579296,0.01 -0.28505864,0.02 -0.31208567,0.03 -0.35900051,0.04 -0.35747068,0.05 -0.16675166,0.06 -0.17491076,0.07 -0.04181540,0.08 -0.04793473,0.09 -0.03926568,0.10 -0.12952575,0.11 -0.00000000,0.12 -0.01376849,0.13 -0.13105558,0.14 -0.08873024,0.15 -0.12595614,0.16 -0.15247323,0.17 -0.25956145,0.18 -0.20040796,0.19 -0.19581846,0.20 -0.15757267,0.21 -0.13717491,0.22 -0.19020908,0.23 -0.19581846,0.24 -0.20091790,0.25 -0.16879143,0.26 -0.18510964,0.27 -0.20040796,0.28 -0.29576747,0.29 -0.43396226,0.30 -0.53391127,0.31 -0.52116267,0.32 -0.48546660,0.33 -0.49209587,0.34 -0.54156043,0.35 -0.59765426,0.36 -0.56144824,0.37 -0.58592555,0.38 -0.52983172,0.39 -0.50178480,0.40 -0.52626211,0.41 -0.58286588,0.42 -0.64660887,0.43 -0.68077511,0.44 -0.74298827,0.45 -0.64864865,0.46 -0.67261601,0.47 -0.65782764,0.48 -0.69811321,0.49 -0.63029067,0.50 -0.61601224,0.51 -0.63233044,0.52 -0.65323814,0.53 -0.65323814,0.54 -0.67363590,0.55 -0.67006629,0.56 -0.51555329,0.57 -0.50892402,0.58 -0.33299337,0.59 -0.36206017,0.60 -0.43090260,0.61 -0.45996940,0.62 -0.56348802,0.63 -0.54920959,0.64 -0.48393677,0.65 -0.48495665,0.66 -0.46965834,0.67 -0.45181030,0.68 -0.45843957,0.69 -0.47118817,0.70 -0.51555329,0.71 -0.58031617,0.72 -0.55481897,0.73 -0.56297807,0.74 -0.56603774,0.75 -0.57929628,0.76 -0.64762876,0.77 -0.66241713,0.78 -0.69301377,0.79 -0.65119837,0.80 -0.68332483,0.81 -0.66598674,0.82 -0.73890872,0.83 -0.73992861,0.84 -0.84242733,0.85 -0.91330954,0.86 -0.88016318,0.87 -0.90719021,0.88 -0.93115757,0.89 -0.93115757,0.90 -0.91942886,0.91 -0.92911780,0.92 -0.95665477,0.93 -0.95002550,0.94 -0.96940337,0.95 -1.00000000,0.96 -0.89801122,0.97 -0.90311066,0.98 -0.90362060,0.99 -0.83477817,1.0 \ No newline at end of file diff --git a/data/mllib/sample_isotonic_regression_libsvm_data.txt b/data/mllib/sample_isotonic_regression_libsvm_data.txt new file mode 100644 index 0000000000000000000000000000000000000000..f39fe0269c2fe9a5765b5cd1a65f367cf42854b1 --- /dev/null +++ b/data/mllib/sample_isotonic_regression_libsvm_data.txt @@ -0,0 +1,100 @@ +0.24579296 1:0.01 +0.28505864 1:0.02 +0.31208567 1:0.03 +0.35900051 1:0.04 +0.35747068 1:0.05 +0.16675166 1:0.06 +0.17491076 1:0.07 +0.04181540 1:0.08 +0.04793473 1:0.09 +0.03926568 1:0.10 +0.12952575 1:0.11 +0.00000000 1:0.12 +0.01376849 1:0.13 +0.13105558 1:0.14 +0.08873024 1:0.15 +0.12595614 1:0.16 +0.15247323 1:0.17 +0.25956145 1:0.18 +0.20040796 1:0.19 +0.19581846 1:0.20 +0.15757267 1:0.21 +0.13717491 1:0.22 +0.19020908 1:0.23 +0.19581846 1:0.24 +0.20091790 1:0.25 +0.16879143 1:0.26 +0.18510964 1:0.27 +0.20040796 1:0.28 +0.29576747 1:0.29 +0.43396226 1:0.30 +0.53391127 1:0.31 +0.52116267 1:0.32 +0.48546660 1:0.33 +0.49209587 1:0.34 +0.54156043 1:0.35 +0.59765426 1:0.36 +0.56144824 1:0.37 +0.58592555 1:0.38 +0.52983172 1:0.39 +0.50178480 1:0.40 +0.52626211 1:0.41 +0.58286588 1:0.42 +0.64660887 1:0.43 +0.68077511 1:0.44 +0.74298827 1:0.45 +0.64864865 1:0.46 +0.67261601 1:0.47 +0.65782764 1:0.48 +0.69811321 1:0.49 +0.63029067 1:0.50 +0.61601224 1:0.51 +0.63233044 1:0.52 +0.65323814 1:0.53 +0.65323814 1:0.54 +0.67363590 1:0.55 +0.67006629 1:0.56 +0.51555329 1:0.57 +0.50892402 1:0.58 +0.33299337 1:0.59 +0.36206017 1:0.60 +0.43090260 1:0.61 +0.45996940 1:0.62 +0.56348802 1:0.63 +0.54920959 1:0.64 +0.48393677 1:0.65 +0.48495665 1:0.66 +0.46965834 1:0.67 +0.45181030 1:0.68 +0.45843957 1:0.69 +0.47118817 1:0.70 +0.51555329 1:0.71 +0.58031617 1:0.72 +0.55481897 1:0.73 +0.56297807 1:0.74 +0.56603774 1:0.75 +0.57929628 1:0.76 +0.64762876 1:0.77 +0.66241713 1:0.78 +0.69301377 1:0.79 +0.65119837 1:0.80 +0.68332483 1:0.81 +0.66598674 1:0.82 +0.73890872 1:0.83 +0.73992861 1:0.84 +0.84242733 1:0.85 +0.91330954 1:0.86 +0.88016318 1:0.87 +0.90719021 1:0.88 +0.93115757 1:0.89 +0.93115757 1:0.90 +0.91942886 1:0.91 +0.92911780 1:0.92 +0.95665477 1:0.93 +0.95002550 1:0.94 +0.96940337 1:0.95 +1.00000000 1:0.96 +0.89801122 1:0.97 +0.90311066 1:0.98 +0.90362060 1:0.99 +0.83477817 1:1.0 \ No newline at end of file diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index d7e5521cbcb2986f7a3dd7df01bb0bd25af9974b..3d6106b532ff9e4c12b5649f55d728f3a8d0ceee 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -691,6 +691,76 @@ The implementation matches the result from R's survival function </div> +## Isotonic regression +[Isotonic regression](http://en.wikipedia.org/wiki/Isotonic_regression) +belongs to the family of regression algorithms. Formally isotonic regression is a problem where +given a finite set of real numbers `$Y = {y_1, y_2, ..., y_n}$` representing observed responses +and `$X = {x_1, x_2, ..., x_n}$` the unknown response values to be fitted +finding a function that minimises + +`\begin{equation} + f(x) = \sum_{i=1}^n w_i (y_i - x_i)^2 +\end{equation}` + +with respect to complete order subject to +`$x_1\le x_2\le ...\le x_n$` where `$w_i$` are positive weights. +The resulting function is called isotonic regression and it is unique. +It can be viewed as least squares problem under order restriction. +Essentially isotonic regression is a +[monotonic function](http://en.wikipedia.org/wiki/Monotonic_function) +best fitting the original data points. + +We implement a +[pool adjacent violators algorithm](http://doi.org/10.1198/TECH.2010.10111) +which uses an approach to +[parallelizing isotonic regression](http://doi.org/10.1007/978-3-642-99789-1_10). +The training input is a DataFrame which contains three columns +label, features and weight. Additionally IsotonicRegression algorithm has one +optional parameter called $isotonic$ defaulting to true. +This argument specifies if the isotonic regression is +isotonic (monotonically increasing) or antitonic (monotonically decreasing). + +Training returns an IsotonicRegressionModel that can be used to predict +labels for both known and unknown features. The result of isotonic regression +is treated as piecewise linear function. The rules for prediction therefore are: + +* If the prediction input exactly matches a training feature + then associated prediction is returned. In case there are multiple predictions with the same + feature then one of them is returned. Which one is undefined + (same as java.util.Arrays.binarySearch). +* If the prediction input is lower or higher than all training features + then prediction with lowest or highest feature is returned respectively. + In case there are multiple predictions with the same feature + then the lowest or highest is returned respectively. +* If the prediction input falls between two training features then prediction is treated + as piecewise linear function and interpolated value is calculated from the + predictions of the two closest features. In case there are multiple values + with the same feature then the same rules as in previous point are used. + +### Examples + +<div class="codetabs"> +<div data-lang="scala" markdown="1"> + +Refer to the [`IsotonicRegression` Scala docs](api/scala/index.html#org.apache.spark.ml.regression.IsotonicRegression) for details on the API. + +{% include_example scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala %} +</div> +<div data-lang="java" markdown="1"> + +Refer to the [`IsotonicRegression` Java docs](api/java/org/apache/spark/ml/regression/IsotonicRegression.html) for details on the API. + +{% include_example java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java %} +</div> +<div data-lang="python" markdown="1"> + +Refer to the [`IsotonicRegression` Python docs](api/python/pyspark.ml.html#pyspark.ml.regression.IsotonicRegression) for more details on the API. + +{% include_example python/ml/isotonic_regression_example.py %} +</div> +</div> + + # Decision trees diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java new file mode 100644 index 0000000000000000000000000000000000000000..0ec17b047155306a1343525242a9d3e0a22140a6 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.examples.ml; + +// $example on$ + +import org.apache.spark.ml.regression.IsotonicRegression; +import org.apache.spark.ml.regression.IsotonicRegressionModel; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +// $example off$ +import org.apache.spark.sql.SparkSession; + +/** + * An example demonstrating IsotonicRegression. + * Run with + * <pre> + * bin/run-example ml.JavaIsotonicRegressionExample + * </pre> + */ +public class JavaIsotonicRegressionExample { + + public static void main(String[] args) { + // Create a SparkSession. + SparkSession spark = SparkSession + .builder() + .appName("JavaIsotonicRegressionExample") + .getOrCreate(); + + // $example on$ + // Loads data. + Dataset<Row> dataset = spark.read().format("libsvm") + .load("data/mllib/sample_isotonic_regression_libsvm_data.txt"); + + // Trains an isotonic regression model. + IsotonicRegression ir = new IsotonicRegression(); + IsotonicRegressionModel model = ir.fit(dataset); + + System.out.println("Boundaries in increasing order: " + model.boundaries()); + System.out.println("Predictions associated with the boundaries: " + model.predictions()); + + // Makes predictions. + model.transform(dataset).show(); + // $example off$ + + spark.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java index c6361a37299887fcd641222121ed34642de66db2..a30b5f1f73eaf4f3232c9d34bbeb7913ee16f78d 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java @@ -17,6 +17,7 @@ package org.apache.spark.examples.mllib; // $example on$ + import scala.Tuple2; import scala.Tuple3; import org.apache.spark.api.java.function.Function; @@ -27,6 +28,8 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.mllib.regression.IsotonicRegression; import org.apache.spark.mllib.regression.IsotonicRegressionModel; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; // $example off$ import org.apache.spark.SparkConf; @@ -35,27 +38,29 @@ public class JavaIsotonicRegressionExample { SparkConf sparkConf = new SparkConf().setAppName("JavaIsotonicRegressionExample"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); // $example on$ - JavaRDD<String> data = jsc.textFile("data/mllib/sample_isotonic_regression_data.txt"); + JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile( + jsc.sc(), "data/mllib/sample_isotonic_regression_libsvm_data.txt").toJavaRDD(); // Create label, feature, weight tuples from input data with weight set to default value 1.0. JavaRDD<Tuple3<Double, Double, Double>> parsedData = data.map( - new Function<String, Tuple3<Double, Double, Double>>() { - public Tuple3<Double, Double, Double> call(String line) { - String[] parts = line.split(","); - return new Tuple3<>(new Double(parts[0]), new Double(parts[1]), 1.0); + new Function<LabeledPoint, Tuple3<Double, Double, Double>>() { + public Tuple3<Double, Double, Double> call(LabeledPoint point) { + return new Tuple3<>(new Double(point.label()), + new Double(point.features().apply(0)), 1.0); } } ); // Split data into training (60%) and test (40%) sets. JavaRDD<Tuple3<Double, Double, Double>>[] splits = - parsedData.randomSplit(new double[]{0.6, 0.4}, 11L); + parsedData.randomSplit(new double[]{0.6, 0.4}, 11L); JavaRDD<Tuple3<Double, Double, Double>> training = splits[0]; JavaRDD<Tuple3<Double, Double, Double>> test = splits[1]; // Create isotonic regression model from training data. // Isotonic parameter defaults to true so it is only shown for demonstration - final IsotonicRegressionModel model = new IsotonicRegression().setIsotonic(true).run(training); + final IsotonicRegressionModel model = + new IsotonicRegression().setIsotonic(true).run(training); // Create tuples of predicted and real labels. JavaPairRDD<Double, Double> predictionAndLabel = test.mapToPair( diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py new file mode 100644 index 0000000000000000000000000000000000000000..1e61bd8eff1433a714c93a487b37ad79bd04697d --- /dev/null +++ b/examples/src/main/python/ml/isotonic_regression_example.py @@ -0,0 +1,54 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Isotonic Regression Example. +""" +from __future__ import print_function + +# $example on$ +from pyspark.ml.regression import IsotonicRegression, IsotonicRegressionModel +# $example off$ +from pyspark.sql import SparkSession + +""" +An example demonstrating isotonic regression. +Run with: + bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py +""" +if __name__ == "__main__": + + spark = SparkSession\ + .builder\ + .appName("PythonIsotonicRegressionExample")\ + .getOrCreate() + + # $example on$ + # Loads data. + dataset = spark.read.format("libsvm")\ + .load("data/mllib/sample_isotonic_regression_libsvm_data.txt") + + # Trains an isotonic regression model. + model = IsotonicRegression().fit(dataset) + print("Boundaries in increasing order: " + str(model.boundaries)) + print("Predictions associated with the boundaries: " + str(model.predictions)) + + # Makes predictions. + model.transform(dataset).show() + # $example off$ + + spark.stop() diff --git a/examples/src/main/python/mllib/isotonic_regression_example.py b/examples/src/main/python/mllib/isotonic_regression_example.py index 89dc9f4b6611a523c7616b9cfbe8395f3dbc9e7b..33d618ab48ea9b21dc4c518c00c28177927eb8be 100644 --- a/examples/src/main/python/mllib/isotonic_regression_example.py +++ b/examples/src/main/python/mllib/isotonic_regression_example.py @@ -23,7 +23,8 @@ from __future__ import print_function from pyspark import SparkContext # $example on$ import math -from pyspark.mllib.regression import IsotonicRegression, IsotonicRegressionModel +from pyspark.mllib.regression import LabeledPoint, IsotonicRegression, IsotonicRegressionModel +from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": @@ -31,10 +32,14 @@ if __name__ == "__main__": sc = SparkContext(appName="PythonIsotonicRegressionExample") # $example on$ - data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt") + # Load and parse the data + def parsePoint(labeledData): + return (labeledData.label, labeledData.features[0], 1.0) + + data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_isotonic_regression_libsvm_data.txt") # Create label, feature, weight tuples from input data with weight set to default value 1.0. - parsedData = data.map(lambda line: tuple([float(x) for x in line.split(',')]) + (1.0,)) + parsedData = data.map(parsePoint) # Split data into training (60%) and test (40%) sets. training, test = parsedData.randomSplit([0.6, 0.4], 11) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala new file mode 100644 index 0000000000000000000000000000000000000000..7c5d3f23411f01916a150378b97901ad417a7ce5 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.regression.IsotonicRegression +// $example off$ +import org.apache.spark.sql.SparkSession + +/** + * An example demonstrating Isotonic Regression. + * Run with + * {{{ + * bin/run-example ml.IsotonicRegressionExample + * }}} + */ +object IsotonicRegressionExample { + + def main(args: Array[String]): Unit = { + + // Creates a SparkSession. + val spark = SparkSession + .builder + .appName(s"${this.getClass.getSimpleName}") + .getOrCreate() + + // $example on$ + // Loads data. + val dataset = spark.read.format("libsvm") + .load("data/mllib/sample_isotonic_regression_libsvm_data.txt") + + // Trains an isotonic regression model. + val ir = new IsotonicRegression() + val model = ir.fit(dataset) + + println(s"Boundaries in increasing order: ${model.boundaries}") + println(s"Predictions associated with the boundaries: ${model.predictions}") + + // Makes predictions. + model.transform(dataset).show() + // $example off$ + + spark.stop() + } +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala index c4336639d7c0b84009e59eb592740ed61749a467..e5dea129c113dc9c4cef3a5cdca283c307b5b75b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala @@ -21,6 +21,7 @@ package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.regression.{IsotonicRegression, IsotonicRegressionModel} +import org.apache.spark.mllib.util.MLUtils // $example off$ object IsotonicRegressionExample { @@ -30,12 +31,12 @@ object IsotonicRegressionExample { val conf = new SparkConf().setAppName("IsotonicRegressionExample") val sc = new SparkContext(conf) // $example on$ - val data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt") + val data = MLUtils.loadLibSVMFile(sc, + "data/mllib/sample_isotonic_regression_libsvm_data.txt").cache() // Create label, feature, weight tuples from input data with weight set to default value 1.0. - val parsedData = data.map { line => - val parts = line.split(',').map(_.toDouble) - (parts(0), parts(1), 1.0) + val parsedData = data.map { labeledPoint => + (labeledPoint.label, labeledPoint.features(0), 1.0) } // Split data into training (60%) and test (40%) sets.