Commit 52ea399e authored by Yanbo Liang, committed by Xiangrui Meng

[SPARK-10355] [ML] [PySpark] Add Python API for SQLTransformer

Add Python API for SQLTransformer

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #8527 from yanboliang/spark-10355.
parent fe16fd0b
@@ -28,9 +28,9 @@ from pyspark.mllib.linalg import _convert_to_vector

 __all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
            'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer',
-           'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel',
-           'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
-           'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
+           'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer',
+           'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec',
+           'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']

 @inherit_doc

@@ -743,6 +743,57 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
         return self.getOrDefault(self.pattern)
@inherit_doc
class SQLTransformer(JavaTransformer):
    """
    Implements the transforms which are defined by a SQL statement.
    Currently we only support SQL syntax like 'SELECT ... FROM __THIS__',
    where '__THIS__' represents the underlying table of the input dataset.

    >>> df = sqlContext.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
    >>> sqlTrans = SQLTransformer(
    ...     statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    >>> sqlTrans.transform(df).head()
    Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0)
    """

    # a placeholder to make it appear in the generated doc
    statement = Param(Params._dummy(), "statement", "SQL statement")

    @keyword_only
    def __init__(self, statement=None):
        """
        __init__(self, statement=None)
        """
        super(SQLTransformer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid)
        self.statement = Param(self, "statement", "SQL statement")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, statement=None):
        """
        setParams(self, statement=None)
        Sets params for this SQLTransformer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setStatement(self, value):
        """
        Sets the value of :py:attr:`statement`.
        """
        self._paramMap[self.statement] = value
        return self

    def getStatement(self):
        """
        Gets the value of statement or its default value.
        """
        return self.getOrDefault(self.statement)
@inherit_doc
class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
    """
...
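Because the new SQLTransformer is an ordinary Transformer, it can be chained with other feature stages in a pyspark.ml Pipeline. Below is a minimal sketch, not part of the commit: it assumes a live sqlContext as in the doctest above, and the Binarizer stage with its threshold of 5.0 is purely an illustrative downstream consumer of the SQL-derived column.

    # Sketch only: compose the new SQLTransformer with another stage.
    # Assumes `sqlContext` is already available, as in the doctest.
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import SQLTransformer, Binarizer

    df = sqlContext.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])

    # Derive new columns with SQL; '__THIS__' stands for the input DataFrame.
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    # Illustrative downstream stage consuming the SQL-derived column v3.
    binarizer = Binarizer(threshold=5.0, inputCol="v3", outputCol="v3_big")

    pipeline = Pipeline(stages=[sqlTrans, binarizer])
    model = pipeline.fit(df)
    model.transform(df).show()

Since SQLTransformer has no fit step, fit() here simply wires the stages together; the SQL statement is applied at transform time on each input DataFrame.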