Skip to content
Snippets Groups Projects
Commit f5ff4a84 authored by Burak Yavuz's avatar Burak Yavuz Committed by Xiangrui Meng
Browse files

[SPARK-7383] [ML] Feature Parity in PySpark for ml.features

Implemented Python wrappers for the Scala `ml.feature` transformers that do not yet exist in PySpark

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #5991 from brkyvz/ml-feat-PR and squashes the following commits:

adcca55 [Burak Yavuz] add regex tokenizer to __all__
b91cb44 [Burak Yavuz] addressed comments
bd39fd2 [Burak Yavuz] remove addition
b82bd7c [Burak Yavuz] Parity in PySpark for ml.features
parent c796be70
No related branches found
No related tags found
No related merge requests found
...@@ -31,7 +31,7 @@ import org.apache.spark.sql.types.DataType ...@@ -31,7 +31,7 @@ import org.apache.spark.sql.types.DataType
* which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an * which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an
* expansion of a product of sums expresses it as a sum of products by using the fact that * expansion of a product of sums expresses it as a sum of products by using the fact that
* multiplication distributes over addition". Take a 2-variable feature vector as an example: * multiplication distributes over addition". Take a 2-variable feature vector as an example:
* `(x, y)`, if we want to expand it with degree 2, then we get `(x, y, x * x, x * y, y * y)`. * `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.
*/ */
@AlphaComponent @AlphaComponent
class PolynomialExpansion extends UnaryTransformer[Vector, Vector, PolynomialExpansion] { class PolynomialExpansion extends UnaryTransformer[Vector, Vector, PolynomialExpansion] {
......
...@@ -42,7 +42,7 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { ...@@ -42,7 +42,7 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] {
/** /**
* :: AlphaComponent :: * :: AlphaComponent ::
 * A regex based tokenizer that extracts tokens either by repeatedly matching the regex (default)  * A regex based tokenizer that extracts tokens either by repeatedly matching the regex (default)
* or using it to split the text (set matching to false). Optional parameters also allow filtering * or using it to split the text (set matching to false). Optional parameters also allow filtering
* tokens using a minimal length. * tokens using a minimal length.
* It returns an array of strings that can be empty. * It returns an array of strings that can be empty.
......
This diff is collapsed.
...@@ -97,7 +97,9 @@ if __name__ == "__main__": ...@@ -97,7 +97,9 @@ if __name__ == "__main__":
("inputCol", "input column name", None), ("inputCol", "input column name", None),
("inputCols", "input column names", None), ("inputCols", "input column names", None),
("outputCol", "output column name", None), ("outputCol", "output column name", None),
("numFeatures", "number of features", None)] ("seed", "random seed", None),
("tol", "the convergence tolerance for iterative algorithms", None),
("stepSize", "Step size to be used for each iteration of optimization.", None)]
code = [] code = []
for name, doc, defaultValueStr in shared: for name, doc, defaultValueStr in shared:
code.append(_gen_param_code(name, doc, defaultValueStr)) code.append(_gen_param_code(name, doc, defaultValueStr))
......
...@@ -308,3 +308,92 @@ class HasNumFeatures(Params): ...@@ -308,3 +308,92 @@ class HasNumFeatures(Params):
Gets the value of numFeatures or its default value. Gets the value of numFeatures or its default value.
""" """
return self.getOrDefault(self.numFeatures) return self.getOrDefault(self.numFeatures)
class HasSeed(Params):
    """
    Mixin for param seed: random seed.
    """

    # a placeholder to make it appear in the generated doc
    seed = Param(Params._dummy(), "seed", "random seed")

    def __init__(self):
        super(HasSeed, self).__init__()
        #: param for random seed
        self.seed = Param(self, "seed", "random seed")
        # NOTE(review): the generated `if None is not None: self._setDefault(seed=None)`
        # branch was constant-false dead code (no default is defined for this param),
        # so it has been removed; behavior is unchanged.

    def setSeed(self, value):
        """
        Sets the value of :py:attr:`seed`.
        """
        self.paramMap[self.seed] = value
        return self

    def getSeed(self):
        """
        Gets the value of seed or its default value.
        """
        return self.getOrDefault(self.seed)
class HasTol(Params):
    """
    Mixin for param tol: the convergence tolerance for iterative algorithms.
    """

    # a placeholder to make it appear in the generated doc
    tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms")

    def __init__(self):
        super(HasTol, self).__init__()
        #: param for the convergence tolerance for iterative algorithms
        self.tol = Param(self, "tol", "the convergence tolerance for iterative algorithms")
        # NOTE(review): the generated `if None is not None: self._setDefault(tol=None)`
        # branch was constant-false dead code (no default is defined for this param),
        # so it has been removed; behavior is unchanged.

    def setTol(self, value):
        """
        Sets the value of :py:attr:`tol`.
        """
        self.paramMap[self.tol] = value
        return self

    def getTol(self):
        """
        Gets the value of tol or its default value.
        """
        return self.getOrDefault(self.tol)
class HasStepSize(Params):
    """
    Mixin for param stepSize: Step size to be used for each iteration of optimization.
    """

    # a placeholder to make it appear in the generated doc
    stepSize = Param(Params._dummy(), "stepSize",
                     "Step size to be used for each iteration of optimization.")

    def __init__(self):
        super(HasStepSize, self).__init__()
        #: param for Step size to be used for each iteration of optimization.
        self.stepSize = Param(self, "stepSize",
                              "Step size to be used for each iteration of optimization.")
        # NOTE(review): the generated `if None is not None: self._setDefault(stepSize=None)`
        # branch was constant-false dead code (no default is defined for this param),
        # so it has been removed; behavior is unchanged.

    def setStepSize(self, value):
        """
        Sets the value of :py:attr:`stepSize`.
        """
        self.paramMap[self.stepSize] = value
        return self

    def getStepSize(self):
        """
        Gets the value of stepSize or its default value.
        """
        return self.getOrDefault(self.stepSize)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment