From 4672e9838b130d006965efeba2665676aa995ebc Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 27 Apr 2016 14:08:26 -0700
Subject: [PATCH] [SPARK-14899][ML][PYSPARK] Remove spark.ml HashingTF hashingAlg option

## What changes were proposed in this pull request?
Since [SPARK-10574](https://issues.apache.org/jira/browse/SPARK-10574) breaks the behavior of ```HashingTF```, we should try to enforce good practice by removing the "native" hashAlgorithm option in spark.ml and pyspark.ml. We can leave spark.mllib and pyspark.mllib alone.

## How was this patch tested?
Unit tests.

cc jkbradley

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #12702 from yanboliang/spark-14899.
---
 .../apache/spark/ml/feature/HashingTF.scala | 36 ++++------------
 .../spark/mllib/feature/HashingTF.scala     |  4 +-
 .../spark/ml/feature/HashingTFSuite.scala   | 37 ++++++-----------
 python/pyspark/ml/feature.py                | 41 +++++--------------
 python/pyspark/ml/tests.py                  |  9 ++--
 5 files changed, 36 insertions(+), 91 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 6fc08aee13..66ae91cfc0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -31,12 +31,11 @@ import org.apache.spark.sql.types.{ArrayType, StructType}
 /**
  * :: Experimental ::
  * Maps a sequence of terms to their term frequencies using the hashing trick.
- * Currently we support two hash algorithms: "murmur3" (default) and "native".
- * "murmur3" calculates a hash code value for the term object using
- * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32);
- * "native" calculates the hash code value using the native Scala implementation.
- * In Spark 1.6 and earlier, "native" is the default hash algorithm;
- * after Spark 2.0, we use "murmur3" as the default one.
+ * Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32)
+ * to calculate the hash code value for the term object.
+ * Since a simple modulo is used to transform the hash code into a column index,
+ * it is advisable to use a power of two as the numFeatures parameter;
+ * otherwise the features will not be mapped evenly to the columns.
  */
 @Experimental
 class HashingTF(override val uid: String)
@@ -69,20 +68,7 @@ class HashingTF(override val uid: String)
     "This is useful for discrete probabilistic models that model binary events rather " +
     "than integer counts")

-  /**
-   * The hash algorithm used when mapping term to integer.
-   * Supported options: "murmur3" and "native". We use "native" as default hash algorithm
-   * in Spark 1.6 and earlier. After Spark 2.0, we use "murmur3" as default one.
-   * (Default = "murmur3")
-   * @group expertParam
-   */
-  val hashAlgorithm = new Param[String](this, "hashAlgorithm", "The hash algorithm used when " +
-    "mapping term to integer. Supported options: " +
-    s"${feature.HashingTF.supportedHashAlgorithms.mkString(",")}.",
-    ParamValidators.inArray[String](feature.HashingTF.supportedHashAlgorithms))
-
-  setDefault(numFeatures -> (1 << 18), binary -> false,
-    hashAlgorithm -> feature.HashingTF.Murmur3)
+  setDefault(numFeatures -> (1 << 18), binary -> false)

   /** @group getParam */
   def getNumFeatures: Int = $(numFeatures)
@@ -96,18 +82,10 @@ class HashingTF(override val uid: String)
   /** @group setParam */
   def setBinary(value: Boolean): this.type = set(binary, value)

-  /** @group expertGetParam */
-  def getHashAlgorithm: String = $(hashAlgorithm)
-
-  /** @group expertSetParam */
-  def setHashAlgorithm(value: String): this.type = set(hashAlgorithm, value)
-
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
     val outputSchema = transformSchema(dataset.schema)
-    val hashingTF = new feature.HashingTF($(numFeatures))
-      .setBinary($(binary))
-      .setHashAlgorithm($(hashAlgorithm))
+    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
     val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
     val metadata = outputSchema($(outputCol)).metadata
     dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index 321f11d9f9..bc26655104 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -135,18 +135,18 @@ object HashingTF {

   private[spark] val Murmur3: String = "murmur3"

-  private[spark] val supportedHashAlgorithms: Array[String] = Array(Native, Murmur3)
-
   private val seed = 42

   /**
    * Calculate a hash code value for the term object using the native Scala implementation.
+   * This is the default hash algorithm used in Spark 1.6 and earlier.
    */
   private[spark] def nativeHash(term: Any): Int = term.##

   /**
    * Calculate a hash code value for the term object using
    * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32).
+   * This is the default hash algorithm used from Spark 2.0 onwards.
    */
   private[spark] def murmur3Hash(term: Any): Int = {
     term match {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
index e32b862af7..44bad4aba4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
@@ -38,26 +38,19 @@ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
       (0, "a a b b c d".split(" ").toSeq)
     )).toDF("id", "words")
     val n = 100
-    Seq("murmur3", "native").foreach { hashAlgorithm =>
-      val hashingTF = new HashingTF()
-        .setInputCol("words")
-        .setOutputCol("features")
-        .setNumFeatures(n)
-        .setHashAlgorithm(hashAlgorithm)
-      val output = hashingTF.transform(df)
-      val attrGroup = AttributeGroup.fromStructField(output.schema("features"))
-      require(attrGroup.numAttributes === Some(n))
-      val features = output.select("features").first().getAs[Vector](0)
-      // Assume perfect hash on "a", "b", "c", and "d".
-      def idx: Any => Int = if (hashAlgorithm == "murmur3") {
-        murmur3FeatureIdx(n)
-      } else {
-        nativeFeatureIdx(n)
-      }
-      val expected = Vectors.sparse(n,
-        Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))
-      assert(features ~== expected absTol 1e-14)
-    }
+    val hashingTF = new HashingTF()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setNumFeatures(n)
+    val output = hashingTF.transform(df)
+    val attrGroup = AttributeGroup.fromStructField(output.schema("features"))
+    require(attrGroup.numAttributes === Some(n))
+    val features = output.select("features").first().getAs[Vector](0)
+    // Assume perfect hash on "a", "b", "c", and "d".
+    def idx: Any => Int = murmur3FeatureIdx(n)
+    val expected = Vectors.sparse(n,
+      Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))
+    assert(features ~== expected absTol 1e-14)
   }

   test("applying binary term freqs") {
@@ -86,10 +79,6 @@ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
     testDefaultReadWrite(t)
   }

-  private def nativeFeatureIdx(numFeatures: Int)(term: Any): Int = {
-    Utils.nonNegativeMod(MLlibHashingTF.nativeHash(term), numFeatures)
-  }
-
   private def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = {
     Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures)
   }
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 0e578d48ca..610d167f3a 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -517,8 +517,12 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
     """
     .. note:: Experimental

-    Maps a sequence of terms to their term frequencies using the
-    hashing trick.
+    Maps a sequence of terms to their term frequencies using the hashing trick.
+    Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32)
+    to calculate the hash code value for the term object.
+    Since a simple modulo is used to transform the hash code into a column index,
+    it is advisable to use a power of two as the numFeatures parameter;
+    otherwise the features will not be mapped evenly to the columns.

     >>> df = sqlContext.createDataFrame([(["a", "b", "c"],)], ["words"])
     >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
@@ -543,30 +547,22 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
         "rather than integer counts. Default False.",
         typeConverter=TypeConverters.toBoolean)

-    hashAlgorithm = Param(Params._dummy(), "hashAlgorithm", "The hash algorithm used when " +
-                          "mapping term to integer. Supported options: murmur3(default) " +
-                          "and native.", typeConverter=TypeConverters.toString)
-
     @keyword_only
-    def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None,
-                 hashAlgorithm="murmur3"):
+    def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None):
         """
-        __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None, \
-                 hashAlgorithm="murmur3")
+        __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None)
         """
         super(HashingTF, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.HashingTF", self.uid)
-        self._setDefault(numFeatures=1 << 18, binary=False, hashAlgorithm="murmur3")
+        self._setDefault(numFeatures=1 << 18, binary=False)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)

     @keyword_only
     @since("1.3.0")
-    def setParams(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None,
-                  hashAlgorithm="murmur3"):
+    def setParams(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None):
         """
-        setParams(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None, \
-                  hashAlgorithm="murmur3")
+        setParams(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None)
         Sets params for this HashingTF.
         """
         kwargs = self.setParams._input_kwargs
@@ -587,21 +583,6 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
         """
         return self.getOrDefault(self.binary)

-    @since("2.0.0")
-    def setHashAlgorithm(self, value):
-        """
-        Sets the value of :py:attr:`hashAlgorithm`.
-        """
-        self._set(hashAlgorithm=value)
-        return self
-
-    @since("2.0.0")
-    def getHashAlgorithm(self):
-        """
-        Gets the value of hashAlgorithm or its default value.
-        """
-        return self.getOrDefault(self.hashAlgorithm)
-

 @inherit_doc
 class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index d014da8d0e..ebef656632 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -911,15 +911,12 @@ class HashingTFTest(PySparkTestCase):

         sqlContext = SQLContext(self.sc)
         df = sqlContext.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
-        n = 100
+        n = 10
         hashingTF = HashingTF()
-        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n)\
-            .setBinary(True).setHashAlgorithm("native")
+        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
         output = hashingTF.transform(df)
         features = output.select("features").first().features.toArray()
-        expected = Vectors.sparse(n, {(ord("a") % n): 1.0,
-                                      (ord("b") % n): 1.0,
-                                      (ord("c") % n): 1.0}).toArray()
+        expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
         for i in range(0, n):
             self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                                    ": expected " + str(expected[i]) + ", got " + str(features[i]))
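
For context, here is a minimal, self-contained sketch of the hashing trick the updated docs describe: hash each term, then wrap the (possibly negative) JVM remainder into [0, numFeatures), which is the role Utils.nonNegativeMod plays in the test helper murmur3FeatureIdx above. This is an illustrative sketch, not Spark source: the object name HashingTrickSketch is made up, and scala.util.hashing.MurmurHash3.bytesHash over the term's UTF-8 bytes (with the same seed, 42) is assumed to stand in for Spark's internal Murmur3_x86_32, so exact hash values are not guaranteed to match Spark's.

```scala
import scala.util.hashing.MurmurHash3

// Illustrative sketch only -- not Spark source.
object HashingTrickSketch {
  private val Seed = 42 // the seed spark.mllib's HashingTF uses

  // Map a term to a column index: hash it, then take a non-negative modulo
  // (mirrors Utils.nonNegativeMod in the test suite's murmur3FeatureIdx).
  def murmur3FeatureIdx(numFeatures: Int)(term: String): Int = {
    // Assumption: bytesHash approximates Spark's Murmur3_x86_32 for strings.
    val hash = MurmurHash3.bytesHash(term.getBytes("UTF-8"), Seed)
    val rem = hash % numFeatures // JVM % can return a negative remainder
    if (rem < 0) rem + numFeatures else rem
  }

  def main(args: Array[String]): Unit = {
    val terms = Seq("a", "a", "b", "b", "c", "d")
    val n = 1 << 8 // a power of two, as the new docs advise for even mapping
    // Term-frequency map keyed by hashed column index, like HashingTF's output.
    val tf = terms.groupBy(murmur3FeatureIdx(n)).map { case (i, ts) => i -> ts.size.toDouble }
    println(tf) // e.g. Map(idxOfA -> 2.0, idxOfB -> 2.0, idxOfC -> 1.0, idxOfD -> 1.0)
  }
}
```

With the hashAlgorithm param gone, the spark.ml transformer always hashes with murmur3. Per the PR description, spark.mllib and pyspark.mllib are left alone, so code that still needs the Spark 1.6 "native" (Scala ##) hashing can keep using the mllib HashingTF directly.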