Skip to content
Snippets Groups Projects
Commit f5ff4a84 authored by Burak Yavuz's avatar Burak Yavuz Committed by Xiangrui Meng
Browse files

[SPARK-7383] [ML] Feature Parity in PySpark for ml.features

Implemented Python wrappers for the Scala `ml.feature` transformers that do not yet exist in PySpark

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #5991 from brkyvz/ml-feat-PR and squashes the following commits:

adcca55 [Burak Yavuz] add regex tokenizer to __all__
b91cb44 [Burak Yavuz] addressed comments
bd39fd2 [Burak Yavuz] remove addition
b82bd7c [Burak Yavuz] Parity in PySpark for ml.features
parent c796be70
No related branches found
No related tags found
No related merge requests found
...@@ -31,7 +31,7 @@ import org.apache.spark.sql.types.DataType ...@@ -31,7 +31,7 @@ import org.apache.spark.sql.types.DataType
* which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an * which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an
* expansion of a product of sums expresses it as a sum of products by using the fact that * expansion of a product of sums expresses it as a sum of products by using the fact that
* multiplication distributes over addition". Take a 2-variable feature vector as an example: * multiplication distributes over addition". Take a 2-variable feature vector as an example:
* `(x, y)`, if we want to expand it with degree 2, then we get `(x, y, x * x, x * y, y * y)`. * `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.
*/ */
@AlphaComponent @AlphaComponent
class PolynomialExpansion extends UnaryTransformer[Vector, Vector, PolynomialExpansion] { class PolynomialExpansion extends UnaryTransformer[Vector, Vector, PolynomialExpansion] {
......
...@@ -42,7 +42,7 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { ...@@ -42,7 +42,7 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] {
/** /**
* :: AlphaComponent :: * :: AlphaComponent ::
 * A regex based tokenizer that extracts tokens either by repeatedly matching the regex (default)  * A regex based tokenizer that extracts tokens either by repeatedly matching the regex (default)
* or using it to split the text (set matching to false). Optional parameters also allow filtering * or using it to split the text (set matching to false). Optional parameters also allow filtering
* tokens using a minimal length. * tokens using a minimal length.
* It returns an array of strings that can be empty. * It returns an array of strings that can be empty.
......
This diff is collapsed.
...@@ -97,7 +97,9 @@ if __name__ == "__main__": ...@@ -97,7 +97,9 @@ if __name__ == "__main__":
("inputCol", "input column name", None), ("inputCol", "input column name", None),
("inputCols", "input column names", None), ("inputCols", "input column names", None),
("outputCol", "output column name", None), ("outputCol", "output column name", None),
("numFeatures", "number of features", None)] ("seed", "random seed", None),
("tol", "the convergence tolerance for iterative algorithms", None),
("stepSize", "Step size to be used for each iteration of optimization.", None)]
code = [] code = []
for name, doc, defaultValueStr in shared: for name, doc, defaultValueStr in shared:
code.append(_gen_param_code(name, doc, defaultValueStr)) code.append(_gen_param_code(name, doc, defaultValueStr))
......
...@@ -308,3 +308,92 @@ class HasNumFeatures(Params): ...@@ -308,3 +308,92 @@ class HasNumFeatures(Params):
Gets the value of numFeatures or its default value. Gets the value of numFeatures or its default value.
""" """
return self.getOrDefault(self.numFeatures) return self.getOrDefault(self.numFeatures)
class HasSeed(Params):
    """
    Mixin for param seed: random seed.
    """

    # a placeholder to make it appear in the generated doc
    seed = Param(Params._dummy(), "seed", "random seed")

    def __init__(self):
        super(HasSeed, self).__init__()
        #: param for random seed
        self.seed = Param(self, "seed", "random seed")
        # NOTE(review): the generated `if None is not None: self._setDefault(seed=None)`
        # branch was constant-false dead code (no default is defined for this param),
        # so it has been removed; behavior is unchanged.

    def setSeed(self, value):
        """
        Sets the value of :py:attr:`seed`.
        """
        self.paramMap[self.seed] = value
        return self

    def getSeed(self):
        """
        Gets the value of seed or its default value.
        """
        return self.getOrDefault(self.seed)
class HasTol(Params):
    """
    Mixin for param tol: the convergence tolerance for iterative algorithms.
    """

    # a placeholder to make it appear in the generated doc
    tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms")

    def __init__(self):
        super(HasTol, self).__init__()
        #: param for the convergence tolerance for iterative algorithms
        self.tol = Param(self, "tol", "the convergence tolerance for iterative algorithms")
        # NOTE(review): the generated `if None is not None: self._setDefault(tol=None)`
        # branch was constant-false dead code (no default is defined for this param),
        # so it has been removed; behavior is unchanged.

    def setTol(self, value):
        """
        Sets the value of :py:attr:`tol`.
        """
        self.paramMap[self.tol] = value
        return self

    def getTol(self):
        """
        Gets the value of tol or its default value.
        """
        return self.getOrDefault(self.tol)
class HasStepSize(Params):
    """
    Mixin for param stepSize: Step size to be used for each iteration of optimization.
    """

    # a placeholder to make it appear in the generated doc
    stepSize = Param(Params._dummy(), "stepSize",
                     "Step size to be used for each iteration of optimization.")

    def __init__(self):
        super(HasStepSize, self).__init__()
        #: param for Step size to be used for each iteration of optimization.
        self.stepSize = Param(self, "stepSize",
                              "Step size to be used for each iteration of optimization.")
        # NOTE(review): the generated `if None is not None: self._setDefault(stepSize=None)`
        # branch was constant-false dead code (no default is defined for this param),
        # so it has been removed; behavior is unchanged.

    def setStepSize(self, value):
        """
        Sets the value of :py:attr:`stepSize`.
        """
        self.paramMap[self.stepSize] = value
        return self

    def getStepSize(self):
        """
        Gets the value of stepSize or its default value.
        """
        return self.getOrDefault(self.stepSize)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment