Skip to content
Snippets Groups Projects
  • Xiangrui Meng's avatar
    9c7e802a
    [SPARK-7380] [MLLIB] pipeline stages should be copyable in Python · 9c7e802a
    Xiangrui Meng authored
    This PR makes pipeline stages in Python copyable and hence simplifies some implementations. It also includes the following changes:
    
    1. Rename `paramMap` and `defaultParamMap` to `_paramMap` and `_defaultParamMap`, respectively.
    2. Accept a list of param maps in `fit`.
    3. Use parent uid and name to identify param.
    
    jkbradley
    
    Author: Xiangrui Meng <meng@databricks.com>
    Author: Joseph K. Bradley <joseph@databricks.com>
    
    Closes #6088 from mengxr/SPARK-7380 and squashes the following commits:
    
    413c463 [Xiangrui Meng] remove unnecessary doc
    4159f35 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7380
    611c719 [Xiangrui Meng] fix python style
    68862b8 [Xiangrui Meng] update _java_obj initialization
    927ad19 [Xiangrui Meng] fix ml/tests.py
    0138fc3 [Xiangrui Meng] update feature transformers and fix a bug in RegexTokenizer
    9ca44fb [Xiangrui Meng] simplify Java wrappers and add tests
    c7d84ef [Xiangrui Meng] update ml/tests.py to test copy params
    7e0d27f [Xiangrui Meng] merge master
    46840fb [Xiangrui Meng] update wrappers
    b6db1ed [Xiangrui Meng] update all self.paramMap to self._paramMap
    46cb6ed [Xiangrui Meng] merge master
    a163413 [Xiangrui Meng] fix style
    1042e80 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7380
    9630eae [Xiangrui Meng] fix Identifiable._randomUID
    13bd70a [Xiangrui Meng] update ml/tests.py
    64a536c [Xiangrui Meng] use _fit/_transform/_evaluate to simplify the impl
    02abf13 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into copyable-python
    66ce18c [Joseph K. Bradley] some cleanups before sending to Xiangrui
    7431272 [Joseph K. Bradley] Rebased with master
    9c7e802a
    History
    [SPARK-7380] [MLLIB] pipeline stages should be copyable in Python
    Xiangrui Meng authored
    This PR makes pipeline stages in Python copyable and hence simplifies some implementations. It also includes the following changes:
    
    1. Rename `paramMap` and `defaultParamMap` to `_paramMap` and `_defaultParamMap`, respectively.
    2. Accept a list of param maps in `fit`.
    3. Use parent uid and name to identify param.
    
    jkbradley
    
    Author: Xiangrui Meng <meng@databricks.com>
    Author: Joseph K. Bradley <joseph@databricks.com>
    
    Closes #6088 from mengxr/SPARK-7380 and squashes the following commits:
    
    413c463 [Xiangrui Meng] remove unnecessary doc
    4159f35 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7380
    611c719 [Xiangrui Meng] fix python style
    68862b8 [Xiangrui Meng] update _java_obj initialization
    927ad19 [Xiangrui Meng] fix ml/tests.py
    0138fc3 [Xiangrui Meng] update feature transformers and fix a bug in RegexTokenizer
    9ca44fb [Xiangrui Meng] simplify Java wrappers and add tests
    c7d84ef [Xiangrui Meng] update ml/tests.py to test copy params
    7e0d27f [Xiangrui Meng] merge master
    46840fb [Xiangrui Meng] update wrappers
    b6db1ed [Xiangrui Meng] update all self.paramMap to self._paramMap
    46cb6ed [Xiangrui Meng] merge master
    a163413 [Xiangrui Meng] fix style
    1042e80 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7380
    9630eae [Xiangrui Meng] fix Identifiable._randomUID
    13bd70a [Xiangrui Meng] update ml/tests.py
    64a536c [Xiangrui Meng] use _fit/_transform/_evaluate to simplify the impl
    02abf13 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into copyable-python
    66ce18c [Joseph K. Bradley] some cleanups before sending to Xiangrui
    7431272 [Joseph K. Bradley] Rebased with master
evaluation.py 4.15 KiB
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.ml.wrapper import JavaEvaluator
from pyspark.ml.param import Param, Params
from pyspark.ml.param.shared import HasLabelCol, HasRawPredictionCol
from pyspark.ml.util import keyword_only
from pyspark.mllib.common import inherit_doc

__all__ = ['BinaryClassificationEvaluator']


@inherit_doc
class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol):
    """
    Binary classification evaluator. It reads two input columns,
    rawPrediction and label; the metric to compute is chosen through
    :py:attr:`metricName`.

    >>> from pyspark.mllib.linalg import Vectors
    >>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]),
    ...    [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)])
    >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw")
    >>> evaluator.evaluate(dataset)
    0.70...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
    0.83...
    """

    # class-level placeholder so that the param shows up in the generated doc;
    # __init__ replaces it with a Param owned by the instance
    metricName = Param(Params._dummy(), "metricName",
                       "metric name in evaluation (areaUnderROC|areaUnderPR)")

    @keyword_only
    def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
                 metricName="areaUnderROC"):
        """
        __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \
                 metricName="areaUnderROC")
        """
        super(BinaryClassificationEvaluator, self).__init__()
        java_class = "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator"
        self._java_obj = self._new_java_obj(java_class, self.uid)
        #: param for metric name in evaluation (areaUnderROC|areaUnderPR)
        self.metricName = Param(
            self, "metricName", "metric name in evaluation (areaUnderROC|areaUnderPR)")
        defaults = dict(rawPredictionCol="rawPrediction", labelCol="label",
                        metricName="areaUnderROC")
        self._setDefault(**defaults)
        # only the keyword arguments the caller actually passed are applied
        self._set(**self.__init__._input_kwargs)

    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName` and returns this instance.
        """
        metric_param = self.metricName
        self._paramMap[metric_param] = value
        return self

    def getMetricName(self):
        """
        Gets the value of metricName, or its default value if it is unset.
        """
        metric_param = self.metricName
        return self.getOrDefault(metric_param)

    @keyword_only
    def setParams(self, rawPredictionCol="rawPrediction", labelCol="label",
                  metricName="areaUnderROC"):
        """
        setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \
                  metricName="areaUnderROC")
        Sets params for binary classification evaluator.
        """
        # forward exactly the keyword arguments captured by @keyword_only
        return self._set(**self.setParams._input_kwargs)


if __name__ == "__main__":
    # Run this module's doctests against a local SparkContext and exit with a
    # non-zero status if any of them fail.
    import doctest
    import sys
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The doctests refer to `sqlContext` (and may use `sc`), so both are
    # injected into the globals handed to doctest.
    sc = SparkContext("local[2]", "ml.evaluation tests")
    try:
        sqlContext = SQLContext(sc)
        globs['sc'] = sc
        globs['sqlContext'] = sqlContext
        (failure_count, test_count) = doctest.testmod(
            globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Always shut the context down, even when the doctest run itself raises.
        sc.stop()
    if failure_count:
        # sys.exit rather than the bare exit(), which is only injected by the
        # `site` module and is not guaranteed to exist in every interpreter.
        sys.exit(-1)