Skip to content
Snippets Groups Projects
Commit a9b8b655 authored by wm624@hotmail.com's avatar wm624@hotmail.com Committed by Nick Pentreath
Browse files

[SPARK-14392][ML] CountVectorizer Estimator should include binary toggle Param

## What changes were proposed in this pull request?

CountVectorizerModel has a binary toggle param. This PR adds the same binary toggle param to the estimator CountVectorizer. As discussed in the JIRA, instead of adding a separate param to CountVectorizer, I moved the binary param into CountVectorizerParams. Therefore, the estimator inherits the binary param.

## How was this patch tested?

Added a new test case that fits the model with the binary flag set to true and then checks that all non-zero counts produced by the trained model are set to 1.0.

All tests in CountVectorizerSuite.scala pass.

Author: wm624@hotmail.com <wm624@hotmail.com>

Closes #12200 from wangmiao1981/binary_param.
parent 90c0a045
No related branches found
No related tags found
No related merge requests found
......@@ -100,6 +100,21 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
/** @group getParam */
def getMinTF: Double = $(minTF)
/**
* Binary toggle to control the output vector values.
* If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
* discrete probabilistic models that model binary events rather than integer counts.
* Default: false
* @group param
*/
val binary: BooleanParam =
new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.")
/**
* Whether binary output is enabled, as read from the [[binary]] param.
* @group getParam
*/
def getBinary: Boolean = $(binary)
// Binary output is opt-in: nonzero counts are only clamped to 1 when this param is set to true.
setDefault(binary -> false)
}
/**
......@@ -127,6 +142,9 @@ class CountVectorizer(override val uid: String)
/** @group setParam */
def setMinTF(value: Double): this.type = set(minTF, value)
/**
* Sets the [[binary]] toggle on this estimator and returns this instance so calls can be chained.
* @group setParam
*/
def setBinary(value: Boolean): this.type = set(binary, value)
setDefault(vocabSize -> (1 << 18), minDF -> 1)
override def fit(dataset: DataFrame): CountVectorizerModel = {
......@@ -206,26 +224,9 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
/** @group setParam */
def setMinTF(value: Double): this.type = set(minTF, value)
/**
* Binary toggle to control the output vector values.
* If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
* discrete probabilistic models that model binary events rather than integer counts.
* Default: false
* @group param
*/
val binary: BooleanParam =
new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
"This is useful for discrete probabilistic models that model binary events rather " +
"than integer counts")
/** @group getParam */
def getBinary: Boolean = $(binary)
/** @group setParam */
def setBinary(value: Boolean): this.type = set(binary, value)
setDefault(binary -> false)
/** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */
private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None
......
......@@ -168,21 +168,34 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
}
}
test("CountVectorizerModel with binary") {
test("CountVectorizerModel and CountVectorizer with binary") {
// Each row pairs a tokenized document with the vector expected in binary mode:
// 1.0 at the index of every term that occurs, no matter how often it is repeated.
// NOTE(review): the expected indices assume the vocabulary is ordered a, b, c, d — confirm
// this matches the ordering the fitted estimator produces.
val df = sqlContext.createDataFrame(Seq(
(0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
(0, split("a a a a b b b b c d"),
Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))),
(1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))),
(2, split("a"), Vectors.sparse(4, Seq((0, 1.0))))
)).toDF("id", "words", "expected")
val cv = new CountVectorizerModel(Array("a", "b", "c", "d"))
// CountVectorizer test
// Fit the estimator with binary enabled, then check the fitted model's output
// against the expected 0/1 vectors.
val cv = new CountVectorizer()
.setInputCol("words")
.setOutputCol("features")
.setBinary(true)
.fit(df)
cv.transform(df).select("features", "expected").collect().foreach {
case Row(features: Vector, expected: Vector) =>
assert(features ~== expected absTol 1e-14)
}
// CountVectorizerModel test
// Build a model directly from the vocabulary learned above and enable binary on it;
// it is checked against the same expected vectors as the fitted model.
val cv2 = new CountVectorizerModel(cv.vocabulary)
.setInputCol("words")
.setOutputCol("features")
.setBinary(true)
cv2.transform(df).select("features", "expected").collect().foreach {
case Row(features: Vector, expected: Vector) =>
assert(features ~== expected absTol 1e-14)
}
}
test("CountVectorizer read/write") {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment