diff --git a/python/epydoc.conf b/python/epydoc.conf
index 0b42e729f8dcc756c711584de2b2a4f071b480c5..95a6af09748065a3e1939e8662a818dcc9211d44 100644
--- a/python/epydoc.conf
+++ b/python/epydoc.conf
@@ -34,4 +34,4 @@ private: no
 
 exclude: pyspark.cloudpickle pyspark.worker pyspark.join
          pyspark.java_gateway pyspark.examples pyspark.shell pyspark.test
-         pyspark.rddsampler pyspark.daemon
+         pyspark.rddsampler pyspark.daemon pyspark.mllib._common
diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py
index f1b95acf097810a9357f661f6bc4f9ca8578e106..2b2c3a061a71d49b0bc5f9815be1bfa2d90cb191 100644
--- a/python/pyspark/__init__.py
+++ b/python/pyspark/__init__.py
@@ -20,21 +20,24 @@ PySpark is the Python API for Spark.
 
 Public classes:
 
-    - L{SparkContext<pyspark.context.SparkContext>}
-    Main entry point for Spark functionality.
-    - L{RDD<pyspark.rdd.RDD>}
-    A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
-    - L{Broadcast<pyspark.broadcast.Broadcast>}
-    A broadcast variable that gets reused across tasks.
-    - L{Accumulator<pyspark.accumulators.Accumulator>}
-    An "add-only" shared variable that tasks can only add values to.
-    - L{SparkConf<pyspark.conf.SparkConf}
-    Configuration for a Spark application.
-    - L{SparkFiles<pyspark.files.SparkFiles>}
-    Access files shipped with jobs.
-    - L{StorageLevel<pyspark.storagelevel.StorageLevel>}
-    Finer-grained cache persistence levels.
+    - L{SparkContext<pyspark.context.SparkContext>}
+        Main entry point for Spark functionality.
+    - L{RDD<pyspark.rdd.RDD>}
+        A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
+    - L{Broadcast<pyspark.broadcast.Broadcast>}
+        A broadcast variable that gets reused across tasks.
+    - L{Accumulator<pyspark.accumulators.Accumulator>}
+        An "add-only" shared variable that tasks can only add values to.
+    - L{SparkConf<pyspark.conf.SparkConf>}
+        For configuring Spark.
+    - L{SparkFiles<pyspark.files.SparkFiles>}
+        Access files shipped with jobs.
+    - L{StorageLevel<pyspark.storagelevel.StorageLevel>}
+        Finer-grained cache persistence levels.
 """
+
+
+
 import sys
 import os
 sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "python/lib/py4j0.7.egg"))
diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py
index dfdaba274f43a86f97c66f17fdb6553f1a24676a..43f40f8783bfd426cb94f031527cc11ee43059c7 100644
--- a/python/pyspark/broadcast.py
+++ b/python/pyspark/broadcast.py
@@ -45,7 +45,18 @@ def _from_id(bid):
 
 
 class Broadcast(object):
+    """
+    A broadcast variable created with
+    L{SparkContext.broadcast()<pyspark.context.SparkContext.broadcast>}.
+    Access its value through C{.value}.
+    """
+
     def __init__(self, bid, value, java_broadcast=None, pickle_registry=None):
+        """
+        Should not be called directly by users -- use
+        L{SparkContext.broadcast()<pyspark.context.SparkContext.broadcast>}
+        instead.
+        """
         self.value = value
         self.bid = bid
         self._jbroadcast = java_broadcast
diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py
index a79f348b526bd78299a88756d575964acd492ba0..cf98b0e071e8dbcb602671470783866a2f518dca 100644
--- a/python/pyspark/conf.py
+++ b/python/pyspark/conf.py
@@ -55,11 +55,11 @@ class SparkConf(object):
     parameters as key-value pairs.
 
     Most of the time, you would create a SparkConf object with
-    C{SparkConf()}, which will load values from `spark.*` Java system
-    properties and any `spark.conf` on your application's classpath.
-    In this case, system properties take priority over `spark.conf`,
-    and any parameters you set directly on the `SparkConf` object take
-    priority over both of those.
+    C{SparkConf()}, which will load values from C{spark.*} Java system
+    properties and any C{spark.conf} on your Spark classpath. In this
+    case, system properties take priority over C{spark.conf}, and any
+    parameters you set directly on the C{SparkConf} object take priority
+    over both of those.
 
     For unit tests, you can also call C{SparkConf(false)} to skip
     loading external settings and get the same configuration no matter
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 1244a1495f743e21bf3ab683f87cdb66c402af32..8b028027ebfec327c7ebf5bb17d3461edd4c20b1 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -267,7 +267,8 @@ class SparkContext(object):
 
     def broadcast(self, value):
         """
-        Broadcast a read-only variable to the cluster, returning a C{Broadcast}
+        Broadcast a read-only variable to the cluster, returning a
+        L{Broadcast<pyspark.broadcast.Broadcast>}
         object for reading it in distributed functions. The variable will be
         sent to each cluster only once.
         """
diff --git a/python/run-tests b/python/run-tests
index a0898b3c210fe4f6c95538f6f331ce22698e9238..4b71fff7c12711c856abc95dd322561f6604ecb4 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -29,7 +29,7 @@ FAILED=0
 
 rm -f unit-tests.log
 
 function run_test() {
-    $FWDIR/pyspark $1 2>&1 | tee -a unit-tests.log
+    SPARK_TESTING=0 $FWDIR/pyspark $1 2>&1 | tee -a unit-tests.log
     FAILED=$((PIPESTATUS[0]||$FAILED))
 }
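
For reference, a minimal sketch of the usage the docstrings above describe: building a SparkConf explicitly (instead of relying on spark.* system properties or a spark.conf on the classpath) and reading a broadcast variable through .value. The "local" master, application name, and broadcast contents are illustrative values, not part of this patch.

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# Settings made directly on the SparkConf take priority over spark.*
# system properties and any spark.conf on the classpath.
conf = SparkConf().setMaster("local").setAppName("broadcast-example")
sc = SparkContext(conf=conf)

# Broadcast a read-only value once; tasks read it through .value.
lookup = sc.broadcast({"a": 1, "b": 2})
counts = sc.parallelize(["a", "b", "a"]).map(lambda k: lookup.value[k]).sum()
print(counts)  # 4

sc.stop()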