diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
index d22bf164c313c15dbae6278b006fe18a2c55d073..4b91fa933ed9fc64daeec287d727399612609a8a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
@@ -64,17 +64,17 @@ import org.apache.spark.util.collection.OpenHashMap
  *   ).toDF("real", "bool", "stringNum", "string")
  *
  *   val hasher = new FeatureHasher()
- *    .setInputCols("real", "bool", "stringNum", "num")
+ *    .setInputCols("real", "bool", "stringNum", "string")
  *    .setOutputCol("features")
  *
- *   hasher.transform(df).show()
+ *   hasher.transform(df).show(false)
  *
- *   +----+-----+---------+------+--------------------+
- *   |real| bool|stringNum|string|            features|
- *   +----+-----+---------+------+--------------------+
- *   | 2.0| true|        1|   foo|(262144,[51871,63...|
- *   | 3.0|false|        2|   bar|(262144,[6031,806...|
- *   +----+-----+---------+------+--------------------+
+ *   +----+-----+---------+------+------------------------------------------------------+
+ *   |real|bool |stringNum|string|features                                              |
+ *   +----+-----+---------+------+------------------------------------------------------+
+ *   |2.0 |true |1        |foo   |(262144,[51871,63643,174475,253195],[1.0,1.0,2.0,1.0])|
+ *   |3.0 |false|2        |bar   |(262144,[6031,80619,140467,174475],[1.0,1.0,1.0,3.0]) |
+ *   +----+-----+---------+------+------------------------------------------------------+
  * }}}
  */
 @Experimental
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 54b4026f78bec36bce316f74620ab988208ec11f..050537b811f610f45a51d8c3d9eb20ca213e8519 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -34,6 +34,7 @@ __all__ = ['Binarizer',
            'CountVectorizer', 'CountVectorizerModel',
            'DCT',
            'ElementwiseProduct',
+           'FeatureHasher',
            'HashingTF',
            'IDF', 'IDFModel',
            'Imputer', 'ImputerModel',
@@ -696,6 +697,82 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada
         return self.getOrDefault(self.scalingVec)
 
 
+@inherit_doc
+class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable,
+                    JavaMLWritable):
+    """
+    .. note:: Experimental
+
+    Feature hashing projects a set of categorical or numerical features into a feature vector of
+    specified dimension (typically substantially smaller than that of the original feature
+    space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)
+    to map features to indices in the feature vector.
+
+    The FeatureHasher transformer operates on multiple columns. Each column may contain either
+    numeric or categorical features. Behavior and handling of column data types is as follows:
+
+    * Numeric columns:
+        For numeric features, the hash value of the column name is used to map the
+        feature value to its index in the feature vector. Numeric features are never
+        treated as categorical, even when they are integers. You must explicitly
+        convert numeric columns containing categorical features to strings first.
+
+    * String columns:
+        For categorical features, the hash value of the string "column_name=value"
+        is used to map to the vector index, with an indicator value of `1.0`.
+        Thus, categorical features are "one-hot" encoded
+        (similarly to using :py:class:`OneHotEncoder` with `dropLast=false`).
+
+    * Boolean columns:
+        Boolean values are treated in the same way as string columns. That is,
+        boolean features are represented as "column_name=true" or "column_name=false",
+        with an indicator value of `1.0`.
+
+    Null (missing) values are ignored (implicitly zero in the resulting feature vector).
+
+    Since a simple modulo is used to transform the hash function to a vector index,
+    it is advisable to use a power of two as the `numFeatures` parameter;
+    otherwise the features will not be mapped evenly to the vector indices.
+
+    >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
+    >>> cols = ["real", "bool", "stringNum", "string"]
+    >>> df = spark.createDataFrame(data, cols)
+    >>> hasher = FeatureHasher(inputCols=cols, outputCol="features")
+    >>> hasher.transform(df).head().features
+    SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0})
+    >>> hasherPath = temp_path + "/hasher"
+    >>> hasher.save(hasherPath)
+    >>> loadedHasher = FeatureHasher.load(hasherPath)
+    >>> loadedHasher.getNumFeatures() == hasher.getNumFeatures()
+    True
+    >>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features
+    True
+
+    .. versionadded:: 2.3.0
+    """
+
+    @keyword_only
+    def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+        """
+        __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        """
+        super(FeatureHasher, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid)
+        self._setDefault(numFeatures=1 << 18)
+        kwargs = self._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("2.3.0")
+    def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+        """
+        setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        Sets params for this FeatureHasher.
+        """
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+
 @inherit_doc
 class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable,
                 JavaMLWritable):
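
For anyone trying the new Python API end to end, here is a minimal usage sketch. It is not part of the patch: it assumes a build with this change applied (Spark >= 2.3.0) and an active SparkSession bound to `spark`, and it mirrors the doctest data above.

```python
from pyspark.ml.feature import FeatureHasher

# Same toy data as the doctest: one numeric, one boolean and two string columns.
df = spark.createDataFrame(
    [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")],
    ["real", "bool", "stringNum", "string"])

# numFeatures defaults to 1 << 18; keeping it a power of two lets the final
# modulo step spread hash values evenly over the output vector indices.
hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                       outputCol="features")

hasher.transform(df).show(truncate=False)
```

As the docstring describes, `stringNum`, `string`, and `bool` are hashed as categorical terms ("stringNum=1", "string=foo", "bool=true") with indicator value 1.0, while `real` contributes its numeric value at the index hashed from the column name alone.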