From 1e5fcdf96c0176a11e5f425ba539b6ed629281db Mon Sep 17 00:00:00 2001
From: zlpmichelle <zlpmichelle@gmail.com>
Date: Fri, 26 Feb 2016 14:37:44 -0800
Subject: [PATCH] [SPARK-13505][ML] add python api for MaxAbsScaler

## What changes were proposed in this pull request?

After SPARK-13028, we should add a Python API for MaxAbsScaler.

## How was this patch tested?

Unit test.

Author: zlpmichelle <zlpmichelle@gmail.com>

Closes #11393 from zlpmichelle/master.
---
 python/pyspark/ml/feature.py | 75 ++++++++++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 67bccfae7a..369f3508fd 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -28,13 +28,14 @@ from pyspark.mllib.common import inherit_doc
 from pyspark.mllib.linalg import _convert_to_vector
 
 __all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT',
-           'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString', 'MinMaxScaler',
-           'MinMaxScalerModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel',
-           'PolynomialExpansion', 'QuantileDiscretizer', 'RegexTokenizer', 'RFormula',
-           'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel',
-           'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer',
-           'VectorAssembler', 'VectorIndexer', 'VectorSlicer', 'Word2Vec', 'Word2VecModel',
-           'ChiSqSelector', 'ChiSqSelectorModel']
+           'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString',
+           'MaxAbsScaler', 'MaxAbsScalerModel', 'MinMaxScaler', 'MinMaxScalerModel',
+           'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel', 'PolynomialExpansion',
+           'QuantileDiscretizer', 'RegexTokenizer', 'RFormula', 'RFormulaModel',
+           'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StopWordsRemover',
+           'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler',
+           'VectorIndexer', 'VectorSlicer', 'Word2Vec', 'Word2VecModel', 'ChiSqSelector',
+           'ChiSqSelectorModel']
 
 
 @inherit_doc
@@ -544,6 +545,66 @@ class IDFModel(JavaModel):
     """
 
 
+@inherit_doc
+class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    .. note:: Experimental
+
+    Rescale each feature individually to range [-1, 1] by dividing through the maximum
+    absolute value in each feature. It does not shift/center the data, and thus does not
+    destroy any sparsity.
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sqlContext.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
+    >>> maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
+    >>> model = maScaler.fit(df)
+    >>> model.transform(df).show()
+    +-----+------+
+    |    a|scaled|
+    +-----+------+
+    |[1.0]| [0.5]|
+    |[2.0]| [1.0]|
+    +-----+------+
+    ...
+
+    .. versionadded:: 2.0.0
+    """
+
+    @keyword_only
+    def __init__(self, inputCol=None, outputCol=None):
+        """
+        __init__(self, inputCol=None, outputCol=None)
+        """
+        super(MaxAbsScaler, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MaxAbsScaler", self.uid)
+        self._setDefault()
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("2.0.0")
+    def setParams(self, inputCol=None, outputCol=None):
+        """
+        setParams(self, inputCol=None, outputCol=None)
+        Sets params for this MaxAbsScaler.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def _create_model(self, java_model):
+        return MaxAbsScalerModel(java_model)
+
+
+class MaxAbsScalerModel(JavaModel):
+    """
+    .. note:: Experimental
+
+    Model fitted by :py:class:`MaxAbsScaler`.
+
+    .. versionadded:: 2.0.0
+    """
+
+
 @inherit_doc
 class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
     """
--
GitLab