Commit 61f9c871 authored by Yuhao Yang, committed by Joseph K. Bradley

[SPARK-11069][ML] Add RegexTokenizer option to convert to lowercase

JIRA: https://issues.apache.org/jira/browse/SPARK-11069
Quoting from the JIRA:
Tokenizer converts strings to lowercase automatically, but RegexTokenizer does not. It would be nice to add an option to RegexTokenizer to convert to lowercase. Proposal:
- call the Boolean Param "toLowercase"
- set default to false (so behavior does not change)

Actually, sklearn converts to lowercase before tokenizing too.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #9092 from hhbyyh/tokenLower.
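Note: as merged, the default for toLowercase is true (matching Tokenizer's behavior) rather than false as the quoted proposal suggested, so RegexTokenizer pipelines that rely on case-sensitive tokens must now opt out explicitly. A minimal usage sketch of the new option; the column names and example data are illustrative, not part of this commit:

    import org.apache.spark.ml.feature.RegexTokenizer

    // Preserve the pre-change, case-sensitive behavior by opting out of the
    // new default lowercasing.
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setToLowercase(false)

    // With setToLowercase(false): "Test for tokenization." => Seq("Test", "for", "tokenization", ".")
    // With the new default (true): the same input yields      Seq("test", "for", "tokenization", ".")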
parent 7dc9d8db
mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala

@@ -100,10 +100,25 @@ class RegexTokenizer(override val uid: String)
   /** @group getParam */
   def getPattern: String = $(pattern)
 
-  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")
+  /**
+   * Indicates whether to convert all characters to lowercase before tokenizing.
+   * Default: true
+   * @group param
+   */
+  final val toLowercase: BooleanParam = new BooleanParam(this, "toLowercase",
+    "whether to convert all characters to lowercase before tokenizing.")
+
+  /** @group setParam */
+  def setToLowercase(value: Boolean): this.type = set(toLowercase, value)
+
+  /** @group getParam */
+  def getToLowercase: Boolean = $(toLowercase)
+
+  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+", toLowercase -> true)
 
-  override protected def createTransformFunc: String => Seq[String] = { str =>
+  override protected def createTransformFunc: String => Seq[String] = { originStr =>
     val re = $(pattern).r
+    val str = if ($(toLowercase)) originStr.toLowerCase() else originStr
     val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq
     val minLength = $(minTokenLength)
     tokens.filter(_.length >= minLength)
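The new transform logic, distilled into a standalone function for illustration. This helper is hypothetical (the Spark Params are inlined as plain arguments with the same defaults), a sketch of the same control flow rather than the actual class:

    // Hypothetical standalone sketch of createTransformFunc above.
    def tokenize(
        originStr: String,
        pattern: String = "\\s+",
        gaps: Boolean = true,
        minTokenLength: Int = 1,
        toLowercase: Boolean = true): Seq[String] = {
      val re = pattern.r
      // Lowercasing happens before matching, so regexes that target
      // uppercase characters behave differently when toLowercase is true.
      val str = if (toLowercase) originStr.toLowerCase() else originStr
      val tokens = if (gaps) re.split(str).toSeq else re.findAllIn(str).toSeq
      tokens.filter(_.length >= minTokenLength)
    }

    tokenize("Test for tokenization.")
    // => Seq("test", "for", "tokenization.")
    tokenize("Test for tokenization.", toLowercase = false)
    // => Seq("Test", "for", "tokenization.")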
mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java

@@ -53,6 +53,7 @@ public class JavaTokenizerSuite {
       .setOutputCol("tokens")
       .setPattern("\\s")
       .setGaps(true)
+      .setToLowercase(false)
       .setMinTokenLength(3);
mllib/src/test/scala/org/apache/spark/ml/feature/RegexTokenizerSuite.scala

@@ -48,13 +48,13 @@ class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setInputCol("rawText")
       .setOutputCol("tokens")
     val dataset0 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
-      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
+      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
+      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
     ))
     testRegexTokenizer(tokenizer0, dataset0)
 
     val dataset1 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
+      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
       TokenizerTestData("Te,st. punct", Array("punct"))
     ))
     tokenizer0.setMinTokenLength(3)

@@ -64,11 +64,23 @@ class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setInputCol("rawText")
       .setOutputCol("tokens")
     val dataset2 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
-      TokenizerTestData("Te,st. punct", Array("Te,st.", "punct"))
+      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
+      TokenizerTestData("Te,st. punct", Array("te,st.", "punct"))
     ))
     testRegexTokenizer(tokenizer2, dataset2)
   }
 
+  test("RegexTokenizer with toLowercase false") {
+    val tokenizer = new RegexTokenizer()
+      .setInputCol("rawText")
+      .setOutputCol("tokens")
+      .setToLowercase(false)
+    val dataset = sqlContext.createDataFrame(Seq(
+      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
+      TokenizerTestData("java scala", Array("java", "scala"))
+    ))
+    testRegexTokenizer(tokenizer, dataset)
+  }
 }
 
 object RegexTokenizerSuite extends SparkFunSuite {
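The new test case pins down the opt-out path. An equivalent ad-hoc check, as a hypothetical session; sqlContext and the toDF implicit are assumed here (as in the suite's MLlibTestSparkContext), and the default pattern "\s+" with gaps = true applies:

    import sqlContext.implicits._

    val df = Seq("JAVA SCALA", "java scala").toDF("rawText")
    val caseSensitive = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)

    caseSensitive.transform(df).select("tokens").collect().foreach(println)
    // Expected: [WrappedArray(JAVA, SCALA)] and [WrappedArray(java, scala)];
    // with the default toLowercase = true, both rows would print
    // [WrappedArray(java, scala)].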