From 105745645b12afbbc2a350518cb5853a88944183 Mon Sep 17 00:00:00 2001
From: Bryan Cutler <bjcutler@us.ibm.com>
Date: Mon, 23 Nov 2015 17:11:51 -0800
Subject: [PATCH] [SPARK-10560][PYSPARK][MLLIB][DOCS] Make
 StreamingLogisticRegressionWithSGD Python API equal to Scala one

This is to bring the API documentation of StreamingLogisticRegressionWithSGD and StreamingLinearRegressionWithSGD in line with the Scala versions.

-Fixed the algorithm descriptions
-Added default values to parameter descriptions
-Changed StreamingLogisticRegressionWithSGD regParam to default to 0, as in the Scala version

Author: Bryan Cutler <bjcutler@us.ibm.com>

Closes #9141 from BryanCutler/StreamingLogisticRegressionWithSGD-python-api-sync.
---
 python/pyspark/mllib/classification.py | 37 +++++++++++++++++---------
 python/pyspark/mllib/regression.py     | 32 ++++++++++++++--------
 2 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index aab4015ba8..9e6f17ef6e 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -652,21 +652,34 @@ class NaiveBayes(object):
 @inherit_doc
 class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
     """
-    Run LogisticRegression with SGD on a batch of data.
-
-    The weights obtained at the end of training a stream are used as initial
-    weights for the next batch.
-
-    :param stepSize: Step size for each iteration of gradient descent.
-    :param numIterations: Number of iterations run for each batch of data.
-    :param miniBatchFraction: Fraction of data on which SGD is run for each
-                              iteration.
-    :param regParam: L2 Regularization parameter.
-    :param convergenceTol: A condition which decides iteration termination.
+    Train or predict a logistic regression model on streaming data. Training uses
+    Stochastic Gradient Descent to update the model based on each new batch of
+    incoming data from a DStream.
+
+    Each batch of data is assumed to be an RDD of LabeledPoints.
+    The number of data points per batch can vary, but the number
+    of features must be constant. An initial weight
+    vector must be provided.
+
+    :param stepSize:
+      Step size for each iteration of gradient descent.
+      (default: 0.1)
+    :param numIterations:
+      Number of iterations run for each batch of data.
+      (default: 50)
+    :param miniBatchFraction:
+      Fraction of each batch of data to use for updates.
+      (default: 1.0)
+    :param regParam:
+      L2 Regularization parameter.
+      (default: 0.0)
+    :param convergenceTol:
+      Value used to determine when to terminate iterations.
+      (default: 0.001)
 
     .. versionadded:: 1.5.0
     """
-    def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.01,
+    def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.0,
                  convergenceTol=0.001):
         self.stepSize = stepSize
         self.numIterations = numIterations
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 6f00d1df20..13b3397501 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -734,17 +734,27 @@ class StreamingLinearAlgorithm(object):
 @inherit_doc
 class StreamingLinearRegressionWithSGD(StreamingLinearAlgorithm):
     """
-    Run LinearRegression with SGD on a batch of data.
-
-    The problem minimized is (1 / n_samples) * (y - weights'X)**2.
-    After training on a batch of data, the weights obtained at the end of
-    training are used as initial weights for the next batch.
-
-    :param stepSize: Step size for each iteration of gradient descent.
-    :param numIterations: Total number of iterations run.
-    :param miniBatchFraction: Fraction of data on which SGD is run for each
-                              iteration.
-    :param convergenceTol: A condition which decides iteration termination.
+    Train or predict a linear regression model on streaming data. Training uses
+    Stochastic Gradient Descent to update the model based on each new batch of
+    incoming data from a DStream (see `LinearRegressionWithSGD` for model equation).
+
+    Each batch of data is assumed to be an RDD of LabeledPoints.
+    The number of data points per batch can vary, but the number
+    of features must be constant. An initial weight
+    vector must be provided.
+
+    :param stepSize:
+      Step size for each iteration of gradient descent.
+      (default: 0.1)
+    :param numIterations:
+      Number of iterations run for each batch of data.
+      (default: 50)
+    :param miniBatchFraction:
+      Fraction of each batch of data to use for updates.
+      (default: 1.0)
+    :param convergenceTol:
+      Value used to determine when to terminate iterations.
+      (default: 0.001)
 
     .. versionadded:: 1.5.0
     """
-- 
GitLab