Commit 66d87c1d authored by Yuhao Yang, committed by Xiangrui Meng

[SPARK-7583] [MLLIB] User guide update for RegexTokenizer

jira: https://issues.apache.org/jira/browse/SPARK-7583

User guide update for RegexTokenizer

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #7828 from hhbyyh/regexTokenizerDoc.
parent be5d1912
@@ -217,21 +217,32 @@ for feature in result.select("result").take(3):
 [Tokenization](http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). A simple [Tokenizer](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) class provides this functionality. The example below shows how to split sentences into sequences of words.
 
+Note: A more advanced tokenizer is provided via [RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer), which allows more
+advanced tokenization based on regular expression (regex) matching.
+By default, the parameter "pattern" (regex, default: \\s+) is used as the delimiter to split the input text.
+Alternatively, users can set the parameter "gaps" to false, indicating that the regex "pattern" denotes
+"tokens" rather than splitting gaps; all matching occurrences are then returned as the tokenization result.
+
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
 {% highlight scala %}
-import org.apache.spark.ml.feature.Tokenizer
+import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
 
 val sentenceDataFrame = sqlContext.createDataFrame(Seq(
   (0, "Hi I heard about Spark"),
-  (0, "I wish Java could use case classes"),
-  (1, "Logistic regression models are neat")
+  (1, "I wish Java could use case classes"),
+  (2, "Logistic,regression,models,are,neat")
 )).toDF("label", "sentence")
 val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val wordsDataFrame = tokenizer.transform(sentenceDataFrame)
-wordsDataFrame.select("words", "label").take(3).foreach(println)
+val regexTokenizer = new RegexTokenizer()
+  .setInputCol("sentence")
+  .setOutputCol("words")
+  .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+val tokenized = tokenizer.transform(sentenceDataFrame)
+tokenized.select("words", "label").take(3).foreach(println)
+val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+regexTokenized.select("words", "label").take(3).foreach(println)
 {% endhighlight %}
 </div>
@@ -240,6 +251,7 @@ wordsDataFrame.select("words", "label").take(3).foreach(println)
 import com.google.common.collect.Lists;
 
 import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RegexTokenizer;
 import org.apache.spark.ml.feature.Tokenizer;
 import org.apache.spark.mllib.linalg.Vector;
 import org.apache.spark.sql.DataFrame;
@@ -252,8 +264,8 @@ import org.apache.spark.sql.types.StructType;
 JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
   RowFactory.create(0, "Hi I heard about Spark"),
-  RowFactory.create(0, "I wish Java could use case classes"),
-  RowFactory.create(1, "Logistic regression models are neat")
+  RowFactory.create(1, "I wish Java could use case classes"),
+  RowFactory.create(2, "Logistic,regression,models,are,neat")
 ));
 StructType schema = new StructType(new StructField[]{
   new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
@@ -267,22 +279,29 @@ for (Row r : wordsDataFrame.select("words", "label").take(3)) {
   for (String word : words) System.out.print(word + " ");
   System.out.println();
 }
+
+RegexTokenizer regexTokenizer = new RegexTokenizer()
+  .setInputCol("sentence")
+  .setOutputCol("words")
+  .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false)
 {% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
 {% highlight python %}
-from pyspark.ml.feature import Tokenizer
+from pyspark.ml.feature import Tokenizer, RegexTokenizer
 
 sentenceDataFrame = sqlContext.createDataFrame([
   (0, "Hi I heard about Spark"),
-  (0, "I wish Java could use case classes"),
-  (1, "Logistic regression models are neat")
+  (1, "I wish Java could use case classes"),
+  (2, "Logistic,regression,models,are,neat")
 ], ["label", "sentence"])
 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
 wordsDataFrame = tokenizer.transform(sentenceDataFrame)
 for words_label in wordsDataFrame.select("words", "label").take(3):
   print(words_label)
+regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
+# alternatively, pattern="\\w+", gaps=False
 {% endhighlight %}
 </div>
 </div>
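
Editor's note: the added guide text describes two RegexTokenizer modes but the diff does not show them side by side on one input. Below is a minimal, hypothetical Scala sketch (not part of the commit) that illustrates the distinction: splitting on a delimiter pattern (gaps = true, the default) versus matching token patterns directly (gaps = false). It assumes a Spark shell where `sqlContext` is available, as in the rest of the ML guide, and uses only RegexTokenizer setters shown in the diff.

{% highlight scala %}
import org.apache.spark.ml.feature.RegexTokenizer

// Sample input reused from the examples in the diff above.
val df = sqlContext.createDataFrame(Seq(
  (0, "Logistic,regression,models,are,neat")
)).toDF("label", "sentence")

// gaps = true (default): "pattern" describes the gaps between tokens,
// so "\\W" splits the sentence on any non-word character.
val splitOnGaps = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\W")

// gaps = false: "pattern" describes the tokens themselves,
// so "\\w+" collects every run of word characters as a token.
val matchTokens = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\w+")
  .setGaps(false)

// Both configurations should yield the same five tokens for this input.
splitOnGaps.transform(df).select("words").take(1).foreach(println)
matchTokens.transform(df).select("words").take(1).foreach(println)
{% endhighlight %}

The two forms are interchangeable here because the word characters and the non-word characters partition the sentence cleanly; for messier input, matching tokens directly with gaps = false is often the easier pattern to write.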