Skip to content
Snippets Groups Projects
Commit e76431f8 authored by Xusen Yin's avatar Xusen Yin Committed by Joseph K. Bradley
Browse files

[SPARK-11961][DOC] Add docs of ChiSqSelector

https://issues.apache.org/jira/browse/SPARK-11961

Author: Xusen Yin <yinxusen@gmail.com>

Closes #9965 from yinxusen/SPARK-11961.
parent 328b757d
No related branches found
No related tags found
Loading
......@@ -1949,3 +1949,53 @@ output.select("features", "label").show()
{% endhighlight %}
</div>
</div>
## ChiSqSelector
`ChiSqSelector` stands for Chi-Squared feature selection. It operates on labeled data with
categorical features. ChiSqSelector orders features based on a
[Chi-Squared test of independence](https://en.wikipedia.org/wiki/Chi-squared_test)
from the class, and then filters (selects) the top features which the class label depends on the
most. This is akin to yielding the features with the most predictive power.
**Examples**
Assume that we have a DataFrame with the columns `id`, `features`, and `clicked`, which is used as
our target to be predicted:
~~~
id | features | clicked
---|-----------------------|---------
7 | [0.0, 0.0, 18.0, 1.0] | 1.0
8 | [0.0, 1.0, 12.0, 0.0] | 0.0
9 | [1.0, 0.0, 15.0, 0.1] | 0.0
~~~
If we use `ChiSqSelector` with a `numTopFeatures = 1`, then according to our label `clicked` the
last column in our `features` chosen as the most useful feature:
~~~
id | features | clicked | selectedFeatures
---|-----------------------|---------|------------------
7 | [0.0, 0.0, 18.0, 1.0] | 1.0 | [1.0]
8 | [0.0, 1.0, 12.0, 0.0] | 0.0 | [0.0]
9 | [1.0, 0.0, 15.0, 0.1] | 0.0 | [0.1]
~~~
<div class="codetabs">
<div data-lang="scala" markdown="1">
Refer to the [ChiSqSelector Scala docs](api/scala/index.html#org.apache.spark.ml.feature.ChiSqSelector)
for more details on the API.
{% include_example scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala %}
</div>
<div data-lang="java" markdown="1">
Refer to the [ChiSqSelector Java docs](api/java/org/apache/spark/ml/feature/ChiSqSelector.html)
for more details on the API.
{% include_example java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java %}
</div>
</div>
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
// $example on$
import java.util.Arrays;
import org.apache.spark.ml.feature.ChiSqSelector;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off$
public class JavaChiSqSelectorExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JavaChiSqSelectorExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
// $example on$
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
));
StructType schema = new StructType(new StructField[]{
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("features", new VectorUDT(), false, Metadata.empty()),
new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty())
});
DataFrame df = sqlContext.createDataFrame(jrdd, schema);
ChiSqSelector selector = new ChiSqSelector()
.setNumTopFeatures(1)
.setFeaturesCol("features")
.setLabelCol("clicked")
.setOutputCol("selectedFeatures");
DataFrame result = selector.fit(df).transform(df);
result.show();
// $example off$
jsc.stop();
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.ChiSqSelector
import org.apache.spark.mllib.linalg.Vectors
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object ChiSqSelectorExample {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("ChiSqSelectorExample")
val sc = new SparkContext(conf)
val sqlContext = SQLContext.getOrCreate(sc)
import sqlContext.implicits._
// $example on$
val data = Seq(
(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
)
val df = sc.parallelize(data).toDF("id", "features", "clicked")
val selector = new ChiSqSelector()
.setNumTopFeatures(1)
.setFeaturesCol("features")
.setLabelCol("clicked")
.setOutputCol("selectedFeatures")
val result = selector.fit(df).transform(df)
result.show()
// $example off$
sc.stop()
}
}
// scalastyle:on println
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment