From 36cd10d19d95418cec4b789545afc798088be315 Mon Sep 17 00:00:00 2001 From: hyukjinkwon <gurwls223@gmail.com> Date: Tue, 22 Nov 2016 11:40:18 +0000 Subject: [PATCH] [SPARK-18447][DOCS] Fix the markdown for `Note:`/`NOTE:`/`Note that` across Python API documentation ## What changes were proposed in this pull request? It seems in Python, there are - `Note:` - `NOTE:` - `Note that` - `.. note::` This PR proposes to fix those to `.. note::` to be consistent. **Before** <img width="567" alt="2016-11-21 1 18 49" src="https://cloud.githubusercontent.com/assets/6477701/20464305/85144c86-af88-11e6-8ee9-90f584dd856c.png"> <img width="617" alt="2016-11-21 12 42 43" src="https://cloud.githubusercontent.com/assets/6477701/20464263/27be5022-af88-11e6-8577-4bbca7cdf36c.png"> **After** <img width="554" alt="2016-11-21 1 18 42" src="https://cloud.githubusercontent.com/assets/6477701/20464306/8fe48932-af88-11e6-83e1-fc3cbf74407d.png"> <img width="628" alt="2016-11-21 12 42 51" src="https://cloud.githubusercontent.com/assets/6477701/20464264/2d3e156e-af88-11e6-93f3-cab8d8d02983.png"> ## How was this patch tested? The notes were found via ```bash grep -r "Note: " . grep -r "NOTE: " . grep -r "Note that " . ``` And then fixed one by one comparing with API documentation. After that, manually tested via `make html` under `./python/docs`. Author: hyukjinkwon <gurwls223@gmail.com> Closes #15947 from HyukjinKwon/SPARK-18447. (cherry picked from commit 933a6548d423cf17448207a99299cf36fc1a95f6) Signed-off-by: Sean Owen <sowen@cloudera.com> --- python/pyspark/conf.py | 4 +- python/pyspark/context.py | 8 ++-- python/pyspark/ml/classification.py | 45 +++++++++--------- python/pyspark/ml/clustering.py | 8 ++-- python/pyspark/ml/feature.py | 13 +++--- python/pyspark/ml/linalg/__init__.py | 11 +++-- python/pyspark/ml/regression.py | 32 ++++++------- python/pyspark/mllib/clustering.py | 6 +-- python/pyspark/mllib/feature.py | 24 +++++----- python/pyspark/mllib/linalg/__init__.py | 11 +++-- python/pyspark/mllib/linalg/distributed.py | 15 +++--- python/pyspark/mllib/regression.py | 2 +- python/pyspark/mllib/stat/_statistics.py | 3 +- python/pyspark/mllib/tree.py | 12 ++--- python/pyspark/rdd.py | 54 +++++++++++----------- python/pyspark/sql/dataframe.py | 28 ++++++----- python/pyspark/sql/functions.py | 11 +++-- python/pyspark/sql/streaming.py | 10 ++-- python/pyspark/streaming/context.py | 2 +- python/pyspark/streaming/kinesis.py | 4 +- 20 files changed, 157 insertions(+), 146 deletions(-) diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py index 64b6f238e9..491b3a8197 100644 --- a/python/pyspark/conf.py +++ b/python/pyspark/conf.py @@ -90,8 +90,8 @@ class SparkConf(object): All setter methods in this class support chaining. For example, you can write C{conf.setMaster("local").setAppName("My app")}. - Note that once a SparkConf object is passed to Spark, it is cloned - and can no longer be modified by the user. + .. note:: Once a SparkConf object is passed to Spark, it is cloned + and can no longer be modified by the user. """ def __init__(self, loadDefaults=True, _jvm=None, _jconf=None): diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 2c2cf6a373..2fd3aee01d 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -520,8 +520,8 @@ class SparkContext(object): ... (a-hdfs-path/part-nnnnn, its content) - NOTE: Small files are preferred, as each file will be loaded - fully in memory. + .. note:: Small files are preferred, as each file will be loaded + fully in memory. 
>>> dirPath = os.path.join(tempdir, "files") >>> os.mkdir(dirPath) @@ -547,8 +547,8 @@ class SparkContext(object): in a key-value pair, where the key is the path of each file, the value is the content of each file. - Note: Small files are preferred, large file is also allowable, but - may cause bad performance. + .. note:: Small files are preferred, large file is also allowable, but + may cause bad performance. """ minPartitions = minPartitions or self.defaultMinPartitions return RDD(self._jsc.binaryFiles(path, minPartitions), self, diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 83e1e89347..8054a34db3 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -440,9 +440,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary): .. seealso:: `Wikipedia reference \ <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("roc") @@ -453,9 +453,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary): Computes the area under the receiver operating characteristic (ROC) curve. - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("areaUnderROC") @@ -467,9 +467,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary): containing two fields recall, precision with (0.0, 1.0) prepended to it. - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("pr") @@ -480,9 +480,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary): Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("fMeasureByThreshold") @@ -494,9 +494,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary): Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the precision. - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("precisionByThreshold") @@ -508,9 +508,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary): Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the recall. 
- Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("recallByThreshold") @@ -695,9 +695,9 @@ class DecisionTreeClassificationModel(DecisionTreeModel, JavaClassificationModel where gain is scaled by the number of instances passing through node - Normalize importances for tree to sum to 1. - Note: Feature importance for single decision trees can have high variance due to - correlated predictor variables. Consider using a :py:class:`RandomForestClassifier` - to determine feature importance instead. + .. note:: Feature importance for single decision trees can have high variance due to + correlated predictor variables. Consider using a :py:class:`RandomForestClassifier` + to determine feature importance instead. """ return self._call_java("featureImportances") @@ -839,7 +839,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol `Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_ learning algorithm for classification. It supports binary labels, as well as both continuous and categorical features. - Note: Multiclass labels are not currently supported. The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999. @@ -851,6 +850,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol - We expect to implement TreeBoost in the future: `SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_ + .. note:: Multiclass labels are not currently supported. + >>> from numpy import allclose >>> from pyspark.ml.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index e58ec1e7ac..b29b5ac70e 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -155,7 +155,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte While this process is generally guaranteed to converge, it is not guaranteed to find a global optimum. - Note: For high-dimensional data (with many features), this algorithm may perform poorly. + .. note:: For high-dimensional data (with many features), this algorithm may perform poorly. This is due to high-dimensional data (a) making it difficult to cluster at all (based on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions. @@ -749,9 +749,9 @@ class DistributedLDAModel(LDAModel, JavaMLReadable, JavaMLWritable): If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may be saved checkpoint files. This method is provided so that users can manage those files. - Note that removing the checkpoints can cause failures if a partition is lost and is needed - by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up the - checkpoints when this model and derivative data go out of scope. + .. note:: Removing the checkpoints can cause failures if a partition is lost and is needed + by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up + the checkpoints when this model and derivative data go out of scope. 
:return List of checkpoint files from training """ diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 635cf13045..40b63d4d31 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -742,8 +742,8 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min) - Note that since zero values will probably be transformed to non-zero values, output of the - transformer will be DenseVector even for sparse input. + .. note:: Since zero values will probably be transformed to non-zero values, output of the + transformer will be DenseVector even for sparse input. >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) @@ -1014,9 +1014,9 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, :py:attr:`dropLast`) because it makes the vector entries sum up to one, and hence linearly dependent. So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. - Note that this is different from scikit-learn's OneHotEncoder, - which keeps all categories. - The output vectors are sparse. + + .. note:: This is different from scikit-learn's OneHotEncoder, + which keeps all categories. The output vectors are sparse. .. seealso:: @@ -1698,7 +1698,8 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A feature transformer that filters out stop words from input. - Note: null values from input array are preserved unless adding null to stopWords explicitly. + + .. note:: null values from input array are preserved unless adding null to stopWords explicitly. >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["text"]) >>> remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"]) diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index a5df727fdb..1705c156ce 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -746,11 +746,12 @@ class SparseVector(Vector): class Vectors(object): """ - Factory methods for working with vectors. Note that dense vectors - are simply represented as NumPy array objects, so there is no need - to covert them for use in MLlib. For sparse vectors, the factory - methods in this class create an MLlib-compatible type, or users - can pass in SciPy's C{scipy.sparse} column vectors. + Factory methods for working with vectors. + + .. note:: Dense vectors are simply represented as NumPy array objects, + so there is no need to covert them for use in MLlib. For sparse vectors, + the factory methods in this class create an MLlib-compatible type, or users + can pass in SciPy's C{scipy.sparse} column vectors. """ @staticmethod diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 385391ba53..b42e807069 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -245,9 +245,9 @@ class LinearRegressionSummary(JavaWrapper): .. seealso:: `Wikipedia explain variation \ <http://en.wikipedia.org/wiki/Explained_variation>`_ - Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. 
This will change in later Spark + versions. """ return self._call_java("explainedVariance") @@ -259,9 +259,9 @@ class LinearRegressionSummary(JavaWrapper): corresponding to the expected value of the absolute error loss or l1-norm loss. - Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("meanAbsoluteError") @@ -273,9 +273,9 @@ class LinearRegressionSummary(JavaWrapper): corresponding to the expected value of the squared error loss or quadratic loss. - Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("meanSquaredError") @@ -286,9 +286,9 @@ class LinearRegressionSummary(JavaWrapper): Returns the root mean squared error, which is defined as the square root of the mean squared error. - Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("rootMeanSquaredError") @@ -301,9 +301,9 @@ class LinearRegressionSummary(JavaWrapper): .. seealso:: `Wikipedia coefficient of determination \ <http://en.wikipedia.org/wiki/Coefficient_of_determination>` - Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("r2") @@ -822,7 +822,7 @@ class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, JavaMLReada where gain is scaled by the number of instances passing through node - Normalize importances for tree to sum to 1. - Note: Feature importance for single decision trees can have high variance due to + .. note:: Feature importance for single decision trees can have high variance due to correlated predictor variables. Consider using a :py:class:`RandomForestRegressor` to determine feature importance instead. """ diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index 2036168e45..91123ace33 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -699,9 +699,9 @@ class StreamingKMeansModel(KMeansModel): * n_t+1: New number of weights. * a: Decay Factor, which gives the forgetfulness. - Note that if a is set to 1, it is the weighted mean of the previous - and new data. If it set to zero, the old centroids are completely - forgotten. + .. note:: If a is set to 1, it is the weighted mean of the previous + and new data. If it set to zero, the old centroids are completely + forgotten. :param clusterCenters: Initial cluster centers. 
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 7eaa2282cb..bde0f67be7 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -114,9 +114,9 @@ class JavaVectorTransformer(JavaModelWrapper, VectorTransformer): """ Applies transformation on a vector or an RDD[Vector]. - Note: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. note:: In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. :param vector: Vector or RDD of Vector to be transformed. """ @@ -139,9 +139,9 @@ class StandardScalerModel(JavaVectorTransformer): """ Applies standardization transformation on a vector. - Note: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. note:: In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. :param vector: Vector or RDD of Vector to be standardized. :return: Standardized vector. If the variance of a column is @@ -407,7 +407,7 @@ class HashingTF(object): Maps a sequence of terms to their term frequencies using the hashing trick. - Note: the terms must be hashable (can not be dict/set/list...). + .. note:: The terms must be hashable (can not be dict/set/list...). :param numFeatures: number of features (default: 2^20) @@ -469,9 +469,9 @@ class IDFModel(JavaVectorTransformer): the terms which occur in fewer than `minDocFreq` documents will have an entry of 0. - Note: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. note:: In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. :param x: an RDD of term frequency vectors or a term frequency vector @@ -551,7 +551,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): """ Transforms a word to its vector representation - Note: local use only + .. note:: Local use only :param word: a word :return: vector representation of word(s) @@ -570,7 +570,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): :param num: number of synonyms to find :return: array of (word, cosineSimilarity) - Note: local use only + .. note:: Local use only """ if not isinstance(word, basestring): word = _convert_to_vector(word) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index d37e715c8d..031f22c020 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -835,11 +835,12 @@ class SparseVector(Vector): class Vectors(object): """ - Factory methods for working with vectors. Note that dense vectors - are simply represented as NumPy array objects, so there is no need - to covert them for use in MLlib. For sparse vectors, the factory - methods in this class create an MLlib-compatible type, or users - can pass in SciPy's C{scipy.sparse} column vectors. + Factory methods for working with vectors. + + .. note:: Dense vectors are simply represented as NumPy array objects, + so there is no need to covert them for use in MLlib. For sparse vectors, + the factory methods in this class create an MLlib-compatible type, or users + can pass in SciPy's C{scipy.sparse} column vectors. 
""" @staticmethod diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 538cada7d1..600655c912 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -171,8 +171,9 @@ class RowMatrix(DistributedMatrix): def computeCovariance(self): """ Computes the covariance matrix, treating each row as an - observation. Note that this cannot be computed on matrices - with more than 65535 columns. + observation. + + .. note:: This cannot be computed on matrices with more than 65535 columns. >>> rows = sc.parallelize([[1, 2], [2, 1]]) >>> mat = RowMatrix(rows) @@ -185,8 +186,9 @@ class RowMatrix(DistributedMatrix): @since('2.0.0') def computeGramianMatrix(self): """ - Computes the Gramian matrix `A^T A`. Note that this cannot be - computed on matrices with more than 65535 columns. + Computes the Gramian matrix `A^T A`. + + .. note:: This cannot be computed on matrices with more than 65535 columns. >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) >>> mat = RowMatrix(rows) @@ -458,8 +460,9 @@ class IndexedRowMatrix(DistributedMatrix): @since('2.0.0') def computeGramianMatrix(self): """ - Computes the Gramian matrix `A^T A`. Note that this cannot be - computed on matrices with more than 65535 columns. + Computes the Gramian matrix `A^T A`. + + .. note:: This cannot be computed on matrices with more than 65535 columns. >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6])]) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 705022934e..1b66f5b510 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -44,7 +44,7 @@ class LabeledPoint(object): Vector of features for this point (NumPy array, list, pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix). - Note: 'label' and 'features' are accessible as class attributes. + .. note:: 'label' and 'features' are accessible as class attributes. .. versionadded:: 1.0.0 """ diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index 67d5f0e44f..49b26446db 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -164,7 +164,6 @@ class Statistics(object): of fit test of the observed data against the expected distribution, or againt the uniform distribution (by default), with each category having an expected frequency of `1 / len(observed)`. - (Note: `observed` cannot contain negative values) If `observed` is matrix, conduct Pearson's independence test on the input contingency matrix, which cannot contain negative entries or @@ -176,6 +175,8 @@ class Statistics(object): contingency matrix for which the chi-squared statistic is computed. All label and feature values must be categorical. + .. note:: `observed` cannot contain negative values + :param observed: it could be a vector containing the observed categorical counts/relative frequencies, or the contingency matrix (containing either counts or relative frequencies), diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index b3011d42e5..a6089fc8b9 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -40,9 +40,9 @@ class TreeEnsembleModel(JavaModelWrapper, JavaSaveable): Predict values for a single data point or an RDD of points using the model trained. - Note: In Python, predict cannot currently be used within an RDD - transformation or action. 
- Call predict directly on the RDD instead. + .. note:: In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -85,9 +85,9 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ Predict the label of one or more examples. - Note: In Python, predict cannot currently be used within an RDD - transformation or action. - Call predict directly on the RDD instead. + .. note:: In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. :param x: Data point (feature vector), or an RDD of data points (feature diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index f21a364df9..9e05da89af 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -417,10 +417,8 @@ class RDD(object): with replacement: expected number of times each element is chosen; fraction must be >= 0 :param seed: seed for the random number generator - .. note:: - - This is not guaranteed to provide exactly the fraction specified of the total count - of the given :class:`DataFrame`. + .. note:: This is not guaranteed to provide exactly the fraction specified of the total + count of the given :class:`DataFrame`. >>> rdd = sc.parallelize(range(100), 4) >>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14 @@ -460,8 +458,8 @@ class RDD(object): """ Return a fixed-size sampled subset of this RDD. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: This method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. >>> rdd = sc.parallelize(range(0, 10)) >>> len(rdd.takeSample(True, 20, 1)) @@ -572,7 +570,7 @@ class RDD(object): Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did. - Note that this method performs a shuffle internally. + .. note:: This method performs a shuffle internally. >>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5]) >>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8]) @@ -803,8 +801,9 @@ class RDD(object): def collect(self): """ Return a list that contains all of the elements in this RDD. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + + .. note:: This method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. """ with SCCallSiteSync(self.context) as css: port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) @@ -1251,10 +1250,10 @@ class RDD(object): """ Get the top N elements from an RDD. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: This method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. - Note: It returns the list sorted in descending order. + .. note:: It returns the list sorted in descending order. >>> sc.parallelize([10, 4, 2, 12, 3]).top(1) [12] @@ -1276,8 +1275,8 @@ class RDD(object): Get the N elements from an RDD ordered in ascending order or as specified by the optional key function. 
- Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: this method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6) [1, 2, 3, 4, 5, 6] @@ -1298,11 +1297,11 @@ class RDD(object): that partition to estimate the number of additional partitions needed to satisfy the limit. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. - Translated from the Scala implementation in RDD#take(). + .. note:: this method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. + >>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2) [2, 3] >>> sc.parallelize([2, 3, 4, 5, 6]).take(10) @@ -1366,8 +1365,9 @@ class RDD(object): def isEmpty(self): """ - Returns true if and only if the RDD contains no elements at all. Note that an RDD - may be empty even when it has at least 1 partition. + Returns true if and only if the RDD contains no elements at all. + + .. note:: an RDD may be empty even when it has at least 1 partition. >>> sc.parallelize([]).isEmpty() True @@ -1558,8 +1558,8 @@ class RDD(object): """ Return the key-value pairs in this RDD to the master as a dictionary. - Note that this method should only be used if the resulting data is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: this method should only be used if the resulting data is expected + to be small, as all the data is loaded into the driver's memory. >>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap() >>> m[1] @@ -1796,8 +1796,7 @@ class RDD(object): set of aggregation functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined - type" C. Note that V and C can be different -- for example, one might - group an RDD of type (Int, Int) into an RDD of type (Int, List[Int]). + type" C. Users provide three functions: @@ -1809,6 +1808,9 @@ class RDD(object): In addition, users can control the partitioning of the output RDD. + .. note:: V and C can be different -- for example, one might group an RDD of type + (Int, Int) into an RDD of type (Int, List[Int]). + >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) >>> def add(a, b): return a + str(b) >>> sorted(x.combineByKey(str, add, add).collect()) @@ -1880,9 +1882,9 @@ class RDD(object): Group the values for each key in the RDD into a single sequence. Hash-partitions the resulting RDD with numPartitions partitions. - Note: If you are grouping in order to perform an aggregation (such as a - sum or average) over each key, using reduceByKey or aggregateByKey will - provide much better performance. + .. note:: If you are grouping in order to perform an aggregation (such as a + sum or average) over each key, using reduceByKey or aggregateByKey will + provide much better performance. >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) >>> sorted(rdd.groupByKey().mapValues(len).collect()) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 6fe6226432..b9d90384e3 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -457,7 +457,7 @@ class DataFrame(object): def cache(self): """Persists the :class:`DataFrame` with the default storage level (C{MEMORY_AND_DISK}). - .. 
note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. + .. note:: The default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. """ self.is_cached = True self._jdf.cache() @@ -470,7 +470,7 @@ class DataFrame(object): a new storage level if the :class:`DataFrame` does not have a storage level set yet. If no storage level is specified defaults to (C{MEMORY_AND_DISK}). - .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. + .. note:: The default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. """ self.is_cached = True javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) @@ -597,10 +597,8 @@ class DataFrame(object): def sample(self, withReplacement, fraction, seed=None): """Returns a sampled subset of this :class:`DataFrame`. - .. note:: - - This is not guaranteed to provide exactly the fraction specified of the total count - of the given :class:`DataFrame`. + .. note:: This is not guaranteed to provide exactly the fraction specified of the total + count of the given :class:`DataFrame`. >>> df.sample(False, 0.5, 42).count() 2 @@ -866,8 +864,8 @@ class DataFrame(object): This include count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical or string columns. - .. note:: This function is meant for exploratory data analysis, as we make no \ - guarantee about the backward compatibility of the schema of the resulting DataFrame. + .. note:: This function is meant for exploratory data analysis, as we make no + guarantee about the backward compatibility of the schema of the resulting DataFrame. >>> df.describe(['age']).show() +-------+------------------+ @@ -900,8 +898,8 @@ class DataFrame(object): def head(self, n=None): """Returns the first ``n`` rows. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: This method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. :param n: int, default 1. Number of rows to return. :return: If n is greater than 1, return a list of :class:`Row`. @@ -1462,8 +1460,8 @@ class DataFrame(object): "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou". :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases. - .. note:: This function is meant for exploratory data analysis, as we make no \ - guarantee about the backward compatibility of the schema of the resulting DataFrame. + .. note:: This function is meant for exploratory data analysis, as we make no + guarantee about the backward compatibility of the schema of the resulting DataFrame. :param cols: Names of the columns to calculate frequent items for as a list or tuple of strings. @@ -1564,11 +1562,11 @@ class DataFrame(object): def toPandas(self): """Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. - Note that this method should only be used if the resulting Pandas's DataFrame is expected - to be small, as all the data is loaded into the driver's memory. - This is only available if Pandas is installed and available. + .. note:: This method should only be used if the resulting Pandas's DataFrame is expected + to be small, as all the data is loaded into the driver's memory. 
+ >>> df.toPandas() # doctest: +SKIP age name 0 2 Alice diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 46a092f16d..d8abafcde3 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -359,7 +359,7 @@ def grouping_id(*cols): (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) - .. note:: the list of columns should match with grouping columns exactly, or empty (means all + .. note:: The list of columns should match with grouping columns exactly, or empty (means all the grouping columns). >>> df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show() @@ -547,7 +547,7 @@ def shiftRightUnsigned(col, numBits): def spark_partition_id(): """A column for partition ID. - Note that this is indeterministic because it depends on data partitioning and task scheduling. + .. note:: This is indeterministic because it depends on data partitioning and task scheduling. >>> df.repartition(1).select(spark_partition_id().alias("pid")).collect() [Row(pid=0), Row(pid=0)] @@ -1852,9 +1852,10 @@ class UserDefinedFunction(object): @since(1.3) def udf(f, returnType=StringType()): """Creates a :class:`Column` expression representing a user defined function (UDF). - Note that the user-defined functions must be deterministic. Due to optimization, - duplicate invocations may be eliminated or the function may even be invoked more times than - it is present in the query. + + .. note:: The user-defined functions must be deterministic. Due to optimization, + duplicate invocations may be eliminated or the function may even be invoked more times than + it is present in the query. :param f: python function :param returnType: a :class:`pyspark.sql.types.DataType` object diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 0e4589be97..9c3a237699 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -90,10 +90,12 @@ class StreamingQuery(object): @since(2.0) def processAllAvailable(self): """Blocks until all available data in the source has been processed and committed to the - sink. This method is intended for testing. Note that in the case of continually arriving - data, this method may block forever. Additionally, this method is only guaranteed to block - until data that has been synchronously appended data to a stream source prior to invocation. - (i.e. `getOffset` must immediately reflect the addition). + sink. This method is intended for testing. + + .. note:: In the case of continually arriving data, this method may block forever. + Additionally, this method is only guaranteed to block until data that has been + synchronously appended data to a stream source prior to invocation. + (i.e. `getOffset` must immediately reflect the addition). """ return self._jsq.processAllAvailable() diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index ec3ad9933c..17c34f8a1c 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -304,7 +304,7 @@ class StreamingContext(object): Create an input stream from an queue of RDDs or list. In each batch, it will process either one or all of the RDDs returned by the queue. - NOTE: changes to the queue after the stream is created will not be recognized. + .. note:: Changes to the queue after the stream is created will not be recognized. @param rdds: Queue of RDDs @param oneAtATime: pick one rdd each time or pick all of them once. 
diff --git a/python/pyspark/streaming/kinesis.py b/python/pyspark/streaming/kinesis.py index 434ce83e1e..3a8d8b819f 100644 --- a/python/pyspark/streaming/kinesis.py +++ b/python/pyspark/streaming/kinesis.py @@ -42,8 +42,8 @@ class KinesisUtils(object): Create an input stream that pulls messages from a Kinesis stream. This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - Note: The given AWS credentials will get saved in DStream checkpoints if checkpointing is - enabled. Make sure that your checkpoint directory is secure. + .. note:: The given AWS credentials will get saved in DStream checkpoints if checkpointing + is enabled. Make sure that your checkpoint directory is secure. :param ssc: StreamingContext object :param kinesisAppName: Kinesis application name used by the Kinesis Client Library (KCL) to -- GitLab
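
For reference, here is a minimal sketch of the docstring convention this patch standardizes on. The class and method below are made up purely for illustration (they are not part of PySpark): a plain `Note:`/`NOTE:`/`Note that` sentence in a docstring is rewritten as a reST `.. note::` directive, which Sphinx renders as a highlighted admonition box instead of ordinary body text.

```python
# Illustrative example only: `Greeter` is a hypothetical class, not a PySpark API.
#
# Before this change, notes were written as plain prose inside the docstring:
#
#     Note: this method is not thread-safe.
#
# After the change, the same sentence uses the reST ``.. note::`` directive.
# Continuation lines are indented under the directive, matching the style used
# throughout this patch.

class Greeter(object):
    """A toy class used only to show the docstring style.

    .. note:: This method is not thread-safe.
        Continuation lines are indented under the directive so Sphinx
        keeps them inside the admonition box.
    """

    def greet(self, name):
        return "Hello, %s" % name
```

As the commit message notes, the rendered result can be checked by running `make html` under `./python/docs` and inspecting the generated HTML.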