From ed9d80385486cd39a84a689ef467795262af919a Mon Sep 17 00:00:00 2001 From: Yuhao Yang <hhbyyh@gmail.com> Date: Wed, 20 Apr 2016 11:45:08 +0100 Subject: [PATCH] [SPARK-14635][ML] Documentation and Examples for TF-IDF only refer to HashingTF ## What changes were proposed in this pull request? Currently, the docs for TF-IDF only refer to using HashingTF with IDF. However, CountVectorizer can also be used. We should probably amend the user guide and examples to show this. ## How was this patch tested? unit tests and doc generation Author: Yuhao Yang <hhbyyh@gmail.com> Closes #12454 from hhbyyh/tfdoc. --- docs/ml-features.md | 15 ++++++++++++--- .../spark/examples/ml/JavaTfIdfExample.java | 2 ++ examples/src/main/python/ml/tf_idf_example.py | 2 ++ .../apache/spark/examples/ml/TfIdfExample.scala | 2 ++ 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 876d21f495..11d5acbb10 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -22,10 +22,19 @@ This section covers algorithms for working with features, roughly divided into t [Term Frequency-Inverse Document Frequency (TF-IDF)](http://en.wikipedia.org/wiki/Tf%E2%80%93idf) is a common text pre-processing step. In Spark ML, TF-IDF is separate into two parts: TF (+hashing) and IDF. -**TF**: `HashingTF` is a `Transformer` which takes sets of terms and converts those sets into fixed-length feature vectors. In text processing, a "set of terms" might be a bag of words. -The algorithm combines Term Frequency (TF) counts with the [hashing trick](http://en.wikipedia.org/wiki/Feature_hashing) for dimensionality reduction. +**TF**: Both `HashingTF` and `CountVectorizer` can be used to generate the term frequency vectors. -**IDF**: `IDF` is an `Estimator` which fits on a dataset and produces an `IDFModel`. The `IDFModel` takes feature vectors (generally created from `HashingTF`) and scales each column. 
Intuitively, it down-weights columns which appear frequently in a corpus. +`HashingTF` is a `Transformer` which takes sets of terms and converts those sets into +fixed-length feature vectors. In text processing, a "set of terms" might be a bag of words. +The algorithm combines Term Frequency (TF) counts with the +[hashing trick](http://en.wikipedia.org/wiki/Feature_hashing) for dimensionality reduction. + +`CountVectorizer` converts text documents to vectors of term counts. Refer to +[CountVectorizer](ml-features.html#countvectorizer) for more details. + +**IDF**: `IDF` is an `Estimator` which is fit on a dataset and produces an `IDFModel`. The +`IDFModel` takes feature vectors (generally created from `HashingTF` or `CountVectorizer`) and scales each column. +Intuitively, it down-weights columns which appear frequently in a corpus. Please refer to the [MLlib user guide on TF-IDF](mllib-feature-extraction.html#tf-idf) for more details on Term Frequency and Inverse Document Frequency. diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java index 37a3d0d84d..107c835f2e 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java @@ -63,6 +63,8 @@ public class JavaTfIdfExample { .setOutputCol("rawFeatures") .setNumFeatures(numFeatures); Dataset<Row> featurizedData = hashingTF.transform(wordsData); + // alternatively, CountVectorizer can also be used to get term frequency vectors + IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features"); IDFModel idfModel = idf.fit(featurizedData); Dataset<Row> rescaledData = idfModel.transform(featurizedData); diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py index c92313378e..141324d458 100644 --- a/examples/src/main/python/ml/tf_idf_example.py +++ 
b/examples/src/main/python/ml/tf_idf_example.py @@ -37,6 +37,8 @@ if __name__ == "__main__": wordsData = tokenizer.transform(sentenceData) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20) featurizedData = hashingTF.transform(wordsData) + # alternatively, CountVectorizer can also be used to get term frequency vectors + idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala index 28115f9390..396f073e6b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala @@ -43,6 +43,8 @@ object TfIdfExample { val hashingTF = new HashingTF() .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20) val featurizedData = hashingTF.transform(wordsData) + // alternatively, CountVectorizer can also be used to get term frequency vectors + val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features") val idfModel = idf.fit(featurizedData) val rescaledData = idfModel.transform(featurizedData) -- GitLab