Skip to content
Snippets Groups Projects
Commit 4d28e843 authored by Yuhao Yang's avatar Yuhao Yang Committed by Nick Pentreath
Browse files

[SPARK-19969][ML] Imputer doc and example

## What changes were proposed in this pull request?

Add docs and examples for spark.ml.feature.Imputer. Currently scala and Java examples are included. Python example will be added after https://github.com/apache/spark/pull/17316

## How was this patch tested?

local doc generation and example execution

Author: Yuhao Yang <yuhao.yang@intel.com>

Closes #17324 from hhbyyh/imputerdoc.
parent fb5869f2
No related branches found
No related tags found
No related merge requests found
......@@ -1284,6 +1284,72 @@ for more details on the API.
</div>
## Imputer
The `Imputer` transformer completes missing values in a dataset, either using the mean or the
median of the columns in which the missing values are located. The input columns should be of
`DoubleType` or `FloatType`. Currently `Imputer` does not support categorical features and possibly
creates incorrect values for columns containing categorical features.
**Note** all `null` values in the input columns are treated as missing, and so are also imputed.
**Examples**
Suppose that we have a DataFrame with the columns `a` and `b`:
~~~
a | b
------------|-----------
1.0 | Double.NaN
2.0 | Double.NaN
Double.NaN | 3.0
4.0 | 4.0
5.0 | 5.0
~~~
In this example, Imputer will replace all occurrences of `Double.NaN` (the default for the missing value)
with the mean (the default imputation strategy) computed from the other values in the corresponding columns.
In this example, the surrogate values for columns `a` and `b` are 3.0 and 4.0 respectively. After
transformation, the missing values in the output columns will be replaced by the surrogate value for
the relevant column.
~~~
a | b | out_a | out_b
------------|------------|-------|-------
1.0 | Double.NaN | 1.0 | 4.0
2.0 | Double.NaN | 2.0 | 4.0
Double.NaN | 3.0 | 3.0 | 3.0
4.0 | 4.0 | 4.0 | 4.0
5.0 | 5.0 | 5.0 | 5.0
~~~
<div class="codetabs">
<div data-lang="scala" markdown="1">
Refer to the [Imputer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Imputer)
for more details on the API.
{% include_example scala/org/apache/spark/examples/ml/ImputerExample.scala %}
</div>
<div data-lang="java" markdown="1">
Refer to the [Imputer Java docs](api/java/org/apache/spark/ml/feature/Imputer.html)
for more details on the API.
{% include_example java/org/apache/spark/examples/ml/JavaImputerExample.java %}
</div>
<div data-lang="python" markdown="1">
Refer to the [Imputer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Imputer)
for more details on the API.
{% include_example python/ml/imputer_example.py %}
</div>
</div>
# Feature Selectors
## VectorSlicer
......
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.ml;
// $example on$
import java.util.Arrays;
import java.util.List;
import org.apache.spark.ml.feature.Imputer;
import org.apache.spark.ml.feature.ImputerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;
// $example off$
import static org.apache.spark.sql.types.DataTypes.*;
/**
* An example demonstrating Imputer.
* Run with:
* bin/run-example ml.JavaImputerExample
*/
public class JavaImputerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.appName("JavaImputerExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(
RowFactory.create(1.0, Double.NaN),
RowFactory.create(2.0, Double.NaN),
RowFactory.create(Double.NaN, 3.0),
RowFactory.create(4.0, 4.0),
RowFactory.create(5.0, 5.0)
);
StructType schema = new StructType(new StructField[]{
createStructField("a", DoubleType, false),
createStructField("b", DoubleType, false)
});
Dataset<Row> df = spark.createDataFrame(data, schema);
Imputer imputer = new Imputer()
.setInputCols(new String[]{"a", "b"})
.setOutputCols(new String[]{"out_a", "out_b"});
ImputerModel model = imputer.fit(df);
model.transform(df).show();
// $example off$
spark.stop();
}
}
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# $example on$
from pyspark.ml.feature import Imputer
# $example off$
from pyspark.sql import SparkSession
"""
An example demonstrating Imputer.
Run with:
bin/spark-submit examples/src/main/python/ml/imputer_example.py
"""
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("ImputerExample")\
.getOrCreate()
# $example on$
df = spark.createDataFrame([
(1.0, float("nan")),
(2.0, float("nan")),
(float("nan"), 3.0),
(4.0, 4.0),
(5.0, 5.0)
], ["a", "b"])
imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
model = imputer.fit(df)
model.transform(df).show()
# $example off$
spark.stop()
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.Imputer
// $example off$
import org.apache.spark.sql.SparkSession
/**
* An example demonstrating Imputer.
* Run with:
* bin/run-example ml.ImputerExample
*/
object ImputerExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder
.appName("ImputerExample")
.getOrCreate()
// $example on$
val df = spark.createDataFrame(Seq(
(1.0, Double.NaN),
(2.0, Double.NaN),
(Double.NaN, 3.0),
(4.0, 4.0),
(5.0, 5.0)
)).toDF("a", "b")
val imputer = new Imputer()
.setInputCols(Array("a", "b"))
.setOutputCols(Array("out_a", "out_b"))
val model = imputer.fit(df)
model.transform(df).show()
// $example off$
spark.stop()
}
}
......@@ -35,7 +35,7 @@ import org.apache.spark.sql.types._
private[feature] trait ImputerParams extends Params with HasInputCols {
/**
* The imputation strategy.
* The imputation strategy. Currently only "mean" and "median" are supported.
* If "mean", then replace missing values using the mean value of the feature.
* If "median", then replace missing values using the approximate median value of the feature.
* Default: mean
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment