From 236e3c8fbc887e4da4f143cbf533f016f21c10d4 Mon Sep 17 00:00:00 2001
From: vijaykiran <mail@vijaykiran.com>
Date: Mon, 29 Feb 2016 15:52:41 +0200
Subject: [PATCH] [SPARK-12633][PYSPARK] [DOC] PySpark regression parameter
 desc to consistent format

Part of the task [SPARK-11219](https://issues.apache.org/jira/browse/SPARK-11219) to make PySpark MLlib parameter description formatting consistent. This change covers the regression module. It also updates two params in classification to read `Supported values:` for consistency.

closes #10600

Author: vijaykiran <mail@vijaykiran.com>
Author: Bryan Cutler <cutlerb@gmail.com>

Closes #11404 from BryanCutler/param-desc-consistent-regression-SPARK-12633.
---
 python/pyspark/mllib/classification.py |   4 +-
 python/pyspark/mllib/regression.py     | 326 +++++++++++++------------
 2 files changed, 166 insertions(+), 164 deletions(-)

diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 53a0df27ca..57106f8690 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -294,7 +294,7 @@ class LogisticRegressionWithSGD(object):
           (default: 0.01)
         :param regType:
           The type of regularizer used for training our model.
-          Allowed values:
+          Supported values:
 
             - "l1" for using L1 regularization
             - "l2" for using L2 regularization (default)
@@ -344,7 +344,7 @@ class LogisticRegressionWithLBFGS(object):
           (default: 0.0)
         :param regType:
           The type of regularizer used for training our model.
-          Allowed values:
+          Supported values:
 
             - "l1" for using L1 regularization
             - "l2" for using L2 regularization (default)
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 4dd7083d79..3b77a62000 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -37,10 +37,11 @@ class LabeledPoint(object):
     """
     Class that represents the features and labels of a data point.
 
-    :param label: Label for this data point.
-    :param features: Vector of features for this point (NumPy array,
-            list, pyspark.mllib.linalg.SparseVector, or scipy.sparse
-            column matrix)
+    :param label:
+      Label for this data point.
+    :param features:
+      Vector of features for this point (NumPy array, list,
+      pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix).
 
     Note: 'label' and 'features' are accessible as class attributes.
 
@@ -66,8 +67,10 @@ class LinearModel(object):
     """
     A linear model that has a vector of coefficients and an intercept.
 
-    :param weights: Weights computed for every feature.
-    :param intercept: Intercept computed for this model.
+    :param weights:
+      Weights computed for every feature.
+    :param intercept:
+      Intercept computed for this model.
 
     .. versionadded:: 0.9.0
     """
@@ -217,19 +220,8 @@ def _regression_train_wrapper(train_func, modelClass, data, initial_weights):
 
 class LinearRegressionWithSGD(object):
     """
-    Train a linear regression model with no regularization using Stochastic Gradient Descent.
-    This solves the least squares regression formulation
-
-        f(weights) = 1/n ||A weights-y||^2
-
-    which is the mean squared error.
-    Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
-    its corresponding right hand side label y.
-    See also the documentation for the precise formulation.
-
     .. versionadded:: 0.9.0
     """
-
     @classmethod
     @since("0.9.0")
     def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
@@ -237,47 +229,52 @@ class LinearRegressionWithSGD(object):
               validateData=True, convergenceTol=0.001):
         """
         Train a linear regression model using Stochastic Gradient
-        Descent (SGD).
-        This solves the least squares regression formulation
-
-            f(weights) = 1/(2n) ||A weights - y||^2,
-
-        which is the mean squared error.
-        Here the data matrix has n rows, and the input RDD holds the
-        set of rows of A, each with its corresponding right hand side
-        label y. See also the documentation for the precise formulation.
-
-        :param data:              The training data, an RDD of
-                                  LabeledPoint.
-        :param iterations:        The number of iterations
-                                  (default: 100).
-        :param step:              The step parameter used in SGD
-                                  (default: 1.0).
-        :param miniBatchFraction: Fraction of data to be used for each
-                                  SGD iteration (default: 1.0).
-        :param initialWeights:    The initial weights (default: None).
-        :param regParam:          The regularizer parameter
-                                  (default: 0.0).
-        :param regType:           The type of regularizer used for
-                                  training our model.
-
-                                  :Allowed values:
-                                     - "l1" for using L1 regularization (lasso),
-                                     - "l2" for using L2 regularization (ridge),
-                                     - None for no regularization
-
-                                     (default: None)
-
-        :param intercept:         Boolean parameter which indicates the
-                                  use or not of the augmented representation
-                                  for training data (i.e. whether bias
-                                  features are activated or not,
-                                  default: False).
-        :param validateData:      Boolean parameter which indicates if
-                                  the algorithm should validate data
-                                  before training. (default: True)
-        :param convergenceTol:    A condition which decides iteration termination.
-                                  (default: 0.001)
+        Descent (SGD). This solves the least squares regression
+        formulation
+
+            f(weights) = 1/(2n) ||A weights - y||^2
+
+        which is the mean squared error. Here the data matrix has n rows,
+        and the input RDD holds the set of rows of A, each with its
+        corresponding right hand side label y.
+        See also the documentation for the precise formulation.
+
+        :param data:
+          The training data, an RDD of LabeledPoint.
+        :param iterations:
+          The number of iterations.
+          (default: 100)
+        :param step:
+          The step parameter used in SGD.
+          (default: 1.0)
+        :param miniBatchFraction:
+          Fraction of data to be used for each SGD iteration.
+          (default: 1.0)
+        :param initialWeights:
+          The initial weights.
+          (default: None)
+        :param regParam:
+          The regularizer parameter.
+          (default: 0.0)
+        :param regType:
+          The type of regularizer used for training our model.
+          Supported values:
+
+            - "l1" for using L1 regularization
+            - "l2" for using L2 regularization
+            - None for no regularization (default)
+        :param intercept:
+          Boolean parameter which indicates the use or not of the
+          augmented representation for training data (i.e., whether bias
+          features are activated or not).
+          (default: False)
+        :param validateData:
+          Boolean parameter which indicates if the algorithm should
+          validate data before training.
+          (default: True)
+        :param convergenceTol:
+          A condition which decides iteration termination.
+          (default: 0.001)
         """
         def train(rdd, i):
             return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
@@ -368,56 +365,53 @@ class LassoModel(LinearRegressionModelBase):
 
 class LassoWithSGD(object):
     """
-    Train a regression model with L1-regularization using Stochastic Gradient Descent.
-    This solves the L1-regularized least squares regression formulation
-
-        f(weights) = 1/2n ||A weights-y||^2  + regParam ||weights||_1
-
-    Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
-    its corresponding right hand side label y.
-    See also the documentation for the precise formulation.
-
     .. versionadded:: 0.9.0
     """
-
     @classmethod
     @since("0.9.0")
     def train(cls, data, iterations=100, step=1.0, regParam=0.01,
               miniBatchFraction=1.0, initialWeights=None, intercept=False,
               validateData=True, convergenceTol=0.001):
         """
-        Train a regression model with L1-regularization using
-        Stochastic Gradient Descent.
-        This solves the l1-regularized least squares regression
-        formulation
-
-            f(weights) = 1/(2n) ||A weights - y||^2  + regParam ||weights||_1.
-
-        Here the data matrix has n rows, and the input RDD holds the
-        set of rows of A, each with its corresponding right hand side
-        label y. See also the documentation for the precise formulation.
-
-        :param data:              The training data, an RDD of
-                                  LabeledPoint.
-        :param iterations:        The number of iterations
-                                  (default: 100).
-        :param step:              The step parameter used in SGD
-                                  (default: 1.0).
-        :param regParam:          The regularizer parameter
-                                  (default: 0.01).
-        :param miniBatchFraction: Fraction of data to be used for each
-                                  SGD iteration (default: 1.0).
-        :param initialWeights:    The initial weights (default: None).
-        :param intercept:         Boolean parameter which indicates the
-                                  use or not of the augmented representation
-                                  for training data (i.e. whether bias
-                                  features are activated or not,
-                                  default: False).
-        :param validateData:      Boolean parameter which indicates if
-                                  the algorithm should validate data
-                                  before training. (default: True)
-        :param convergenceTol:    A condition which decides iteration termination.
-                                  (default: 0.001)
+        Train a regression model with L1-regularization using Stochastic
+        Gradient Descent. This solves the L1-regularized least squares
+        regression formulation
+
+            f(weights) = 1/(2n) ||A weights - y||^2 + regParam ||weights||_1
+
+        Here the data matrix has n rows, and the input RDD holds the set
+        of rows of A, each with its corresponding right hand side label y.
+        See also the documentation for the precise formulation.
+
+        :param data:
+          The training data, an RDD of LabeledPoint.
+        :param iterations:
+          The number of iterations.
+          (default: 100)
+        :param step:
+          The step parameter used in SGD.
+          (default: 1.0)
+        :param regParam:
+          The regularizer parameter.
+          (default: 0.01)
+        :param miniBatchFraction:
+          Fraction of data to be used for each SGD iteration.
+          (default: 1.0)
+        :param initialWeights:
+          The initial weights.
+          (default: None)
+        :param intercept:
+          Boolean parameter which indicates the use or not of the
+          augmented representation for training data (i.e., whether bias
+          features are activated or not).
+          (default: False)
+        :param validateData:
+          Boolean parameter which indicates if the algorithm should
+          validate data before training.
+          (default: True)
+        :param convergenceTol:
+          A condition which decides iteration termination.
+          (default: 0.001)
         """
         def train(rdd, i):
             return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step),
@@ -508,56 +502,53 @@ class RidgeRegressionModel(LinearRegressionModelBase):
 
 class RidgeRegressionWithSGD(object):
     """
-    Train a regression model with L2-regularization using Stochastic Gradient Descent.
-    This solves the L2-regularized least squares regression formulation
-
-          f(weights) = 1/2n ||A weights-y||^2  + regParam/2 ||weights||^2
-
-    Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
-    its corresponding right hand side label y.
-    See also the documentation for the precise formulation.
-
     .. versionadded:: 0.9.0
     """
-
     @classmethod
     @since("0.9.0")
     def train(cls, data, iterations=100, step=1.0, regParam=0.01,
               miniBatchFraction=1.0, initialWeights=None, intercept=False,
               validateData=True, convergenceTol=0.001):
         """
-        Train a regression model with L2-regularization using
-        Stochastic Gradient Descent.
-        This solves the l2-regularized least squares regression
-        formulation
-
-            f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2.
-
-        Here the data matrix has n rows, and the input RDD holds the
-        set of rows of A, each with its corresponding right hand side
-        label y. See also the documentation for the precise formulation.
-
-        :param data:              The training data, an RDD of
-                                  LabeledPoint.
-        :param iterations:        The number of iterations
-                                  (default: 100).
-        :param step:              The step parameter used in SGD
-                                  (default: 1.0).
-        :param regParam:          The regularizer parameter
-                                  (default: 0.01).
-        :param miniBatchFraction: Fraction of data to be used for each
-                                  SGD iteration (default: 1.0).
-        :param initialWeights:    The initial weights (default: None).
-        :param intercept:         Boolean parameter which indicates the
-                                  use or not of the augmented representation
-                                  for training data (i.e. whether bias
-                                  features are activated or not,
-                                  default: False).
-        :param validateData:      Boolean parameter which indicates if
-                                  the algorithm should validate data
-                                  before training. (default: True)
-        :param convergenceTol:    A condition which decides iteration termination.
-                                  (default: 0.001)
+        Train a regression model with L2-regularization using Stochastic
+        Gradient Descent. This solves the L2-regularized least squares
+        regression formulation
+
+            f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2
+
+        Here the data matrix has n rows, and the input RDD holds the set
+        of rows of A, each with its corresponding right hand side label y.
+        See also the documentation for the precise formulation.
+
+        :param data:
+          The training data, an RDD of LabeledPoint.
+        :param iterations:
+          The number of iterations.
+          (default: 100)
+        :param step:
+          The step parameter used in SGD.
+          (default: 1.0)
+        :param regParam:
+          The regularizer parameter.
+          (default: 0.01)
+        :param miniBatchFraction:
+          Fraction of data to be used for each SGD iteration.
+          (default: 1.0)
+        :param initialWeights:
+          The initial weights.
+          (default: None)
+        :param intercept:
+          Boolean parameter which indicates the use or not of the
+          augmented representation for training data (i.e., whether bias
+          features are activated or not).
+          (default: False)
+        :param validateData:
+          Boolean parameter which indicates if the algorithm should
+          validate data before training.
+          (default: True)
+        :param convergenceTol:
+          A condition which decides iteration termination.
+          (default: 0.001)
         """
         def train(rdd, i):
             return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step),
@@ -572,12 +563,14 @@ class IsotonicRegressionModel(Saveable, Loader):
     """
     Regression model for isotonic regression.
 
-    :param boundaries: Array of boundaries for which predictions are
-            known. Boundaries must be sorted in increasing order.
-    :param predictions: Array of predictions associated to the
-            boundaries at the same index. Results of isotonic
-            regression and therefore monotone.
-    :param isotonic: indicates whether this is isotonic or antitonic.
+    :param boundaries:
+      Array of boundaries for which predictions are known. Boundaries
+      must be sorted in increasing order.
+    :param predictions:
+      Array of predictions associated to the boundaries at the same
+      index. Results of isotonic regression and therefore monotone.
+    :param isotonic:
+      Indicates whether this is isotonic or antitonic.
 
     >>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)]
     >>> irm = IsotonicRegression.train(sc.parallelize(data))
@@ -628,7 +621,8 @@ class IsotonicRegressionModel(Saveable, Loader):
         values with the same boundary then the same rules as in 2)
         are used.
 
-        :param x: Feature or RDD of Features to be labeled.
+        :param x:
+          Feature or RDD of Features to be labeled.
         """
         if isinstance(x, RDD):
             return x.map(lambda v: self.predict(v))
@@ -657,8 +651,8 @@ class IsotonicRegressionModel(Saveable, Loader):
 class IsotonicRegression(object):
     """
     Isotonic regression.
-    Currently implemented using parallelized pool adjacent violators algorithm.
-    Only univariate (single feature) algorithm supported.
+    Currently implemented using parallelized pool adjacent violators
+    algorithm. Only univariate (single feature) algorithm supported.
 
     Sequential PAV implementation based on:
 
@@ -684,8 +678,11 @@ class IsotonicRegression(object):
         """
         Train a isotonic regression model on the given data.
 
-        :param data: RDD of (label, feature, weight) tuples.
-        :param isotonic: Whether this is isotonic or antitonic.
+        :param data:
+          RDD of (label, feature, weight) tuples.
+        :param isotonic:
+          Whether this is isotonic (True) or antitonic (False).
+          (default: True)
         """
         boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel",
                                                 data.map(_convert_to_vector), bool(isotonic))
@@ -721,9 +718,11 @@ class StreamingLinearAlgorithm(object):
     @since("1.5.0")
     def predictOn(self, dstream):
         """
-        Make predictions on a dstream.
+        Use the model to make predictions on batches of data from a
+        DStream.
 
-        :return: Transformed dstream object.
+        :return:
+          DStream containing predictions.
         """
         self._validate(dstream)
         return dstream.map(lambda x: self._model.predict(x))
@@ -731,9 +730,11 @@ class StreamingLinearAlgorithm(object):
     @since("1.5.0")
     def predictOnValues(self, dstream):
         """
-        Make predictions on a keyed dstream.
+        Use the model to make predictions on the values of a DStream and
+        carry over its keys.
 
-        :return: Transformed dstream object.
+        :return:
+          DStream containing the input keys and the predictions as values.
         """
         self._validate(dstream)
         return dstream.mapValues(lambda x: self._model.predict(x))
@@ -742,14 +743,15 @@ class StreamingLinearAlgorithm(object):
 @inherit_doc
 class StreamingLinearRegressionWithSGD(StreamingLinearAlgorithm):
     """
-    Train or predict a linear regression model on streaming data. Training uses
-    Stochastic Gradient Descent to update the model based on each new batch of
-    incoming data from a DStream (see `LinearRegressionWithSGD` for model equation).
+    Train or predict a linear regression model on streaming data.
+    Training uses Stochastic Gradient Descent to update the model
+    based on each new batch of incoming data from a DStream
+    (see `LinearRegressionWithSGD` for model equation).
 
     Each batch of data is assumed to be an RDD of LabeledPoints.
     The number of data points per batch can vary, but the number
-    of features must be constant. An initial weight
-    vector must be provided.
+    of features must be constant. An initial weight vector must
+    be provided.
 
     :param stepSize:
       Step size for each iteration of gradient descent.
-- 
GitLab