From c2b105af34f7241ac0597d9c35fbf66633a3eaf6 Mon Sep 17 00:00:00 2001 From: Josh Rosen <joshrosen@eecs.berkeley.edu> Date: Fri, 28 Dec 2012 22:51:28 -0800 Subject: [PATCH] Add documentation for Python API. --- docs/_layouts/global.html | 1 + docs/api.md | 1 + docs/index.md | 11 +-- docs/python-programming-guide.md | 74 +++++++++++++++++++++ docs/quick-start.md | 40 ++++++++++- pyspark/README | 42 ------------ pyspark/{pyspark => }/examples/kmeans.py | 0 pyspark/{pyspark => }/examples/pi.py | 0 pyspark/{pyspark => }/examples/tc.py | 0 pyspark/{pyspark => }/examples/wordcount.py | 0 pyspark/pyspark/__init__.py | 6 ++ pyspark/pyspark/examples/__init__.py | 0 12 files changed, 127 insertions(+), 48 deletions(-) create mode 100644 docs/python-programming-guide.md delete mode 100644 pyspark/README rename pyspark/{pyspark => }/examples/kmeans.py (100%) rename pyspark/{pyspark => }/examples/pi.py (100%) rename pyspark/{pyspark => }/examples/tc.py (100%) rename pyspark/{pyspark => }/examples/wordcount.py (100%) delete mode 100644 pyspark/pyspark/examples/__init__.py diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index 43a5fa3e1c..96eebd9f23 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -47,6 +47,7 @@ <li><a href="quick-start.html">Quick Start</a></li> <li><a href="scala-programming-guide.html">Scala</a></li> <li><a href="java-programming-guide.html">Java</a></li> + <li><a href="python-programming-guide.html">Python</a></li> </ul> </li> diff --git a/docs/api.md b/docs/api.md index 43548b223c..b9c93ac5e8 100644 --- a/docs/api.md +++ b/docs/api.md @@ -8,3 +8,4 @@ Here you can find links to the Scaladoc generated for the Spark sbt subprojects. - [Core](api/core/index.html) - [Examples](api/examples/index.html) - [Bagel](api/bagel/index.html) +- [PySpark](api/pyspark/index.html) diff --git a/docs/index.md b/docs/index.md index ed9953a590..33ab58a962 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,11 +7,11 @@ title: Spark Overview TODO(andyk): Rewrite to make the Java API a first class part of the story. {% endcomment %} -Spark is a MapReduce-like cluster computing framework designed for low-latency iterative jobs and interactive use from an -interpreter. It provides clean, language-integrated APIs in Scala and Java, with a rich array of parallel operators. Spark can -run on top of the [Apache Mesos](http://incubator.apache.org/mesos/) cluster manager, +Spark is a MapReduce-like cluster computing framework designed for low-latency iterative jobs and interactive use from an interpreter. +It provides clean, language-integrated APIs in Scala, Java, and Python, with a rich array of parallel operators. +Spark can run on top of the [Apache Mesos](http://incubator.apache.org/mesos/) cluster manager, [Hadoop YARN](http://hadoop.apache.org/docs/r2.0.1-alpha/hadoop-yarn/hadoop-yarn-site/YARN.html), -Amazon EC2, or without an independent resource manager ("standalone mode"). +Amazon EC2, or without an independent resource manager ("standalone mode"). # Downloading @@ -59,6 +59,7 @@ of `project/SparkBuild.scala`, then rebuilding Spark (`sbt/sbt clean compile`). * [Quick Start](quick-start.html): a quick introduction to the Spark API; start here! 
* [Spark Programming Guide](scala-programming-guide.html): an overview of Spark concepts, and details on the Scala API
* [Java Programming Guide](java-programming-guide.html): using Spark from Java
+* [Python Programming Guide](python-programming-guide.html): using Spark from Python

**Deployment guides:**

@@ -72,7 +73,7 @@ of `project/SparkBuild.scala`, then rebuilding Spark (`sbt/sbt clean compile`).

* [Configuration](configuration.html): customize Spark via its configuration system
* [Tuning Guide](tuning.html): best practices to optimize performance and memory use
-* [API Docs (Scaladoc)](api/core/index.html)
+* API Docs: [Java/Scala (Scaladoc)](api/core/index.html) and [Python (Epydoc)](api/pyspark/index.html)
* [Bagel](bagel-programming-guide.html): an implementation of Google's Pregel on Spark
* [Contributing to Spark](contributing-to-spark.html)

diff --git a/docs/python-programming-guide.md b/docs/python-programming-guide.md
new file mode 100644
index 0000000000..b7c747f905
--- /dev/null
+++ b/docs/python-programming-guide.md
@@ -0,0 +1,74 @@
+---
+layout: global
+title: Python Programming Guide
+---
+
+
+The Spark Python API (PySpark) exposes most of the Spark features available in the Scala version to Python.
+To learn the basics of Spark, we recommend reading through the
+[Scala programming guide](scala-programming-guide.html) first; it should be
+easy to follow even if you don't know Scala.
+This guide will show how to use those Spark features from Python.
+
+# Key Differences in the Python API
+
+There are a few key differences between the Python and Scala APIs:
+
+* Python is dynamically typed, so RDDs can hold objects of different types.
+* PySpark does not currently support the following Spark features:
+    - Accumulators
+    - Special functions on RDDs of doubles, such as `mean` and `stdev`
+    - Approximate jobs / functions, such as `countApprox` and `sumApprox`
+    - `lookup`
+    - `mapPartitionsWithSplit`
+    - `persist` at storage levels other than `MEMORY_ONLY`
+    - `sample`
+    - `sort`
+
+
+# Installing and Configuring PySpark
+
+PySpark requires Python 2.6 or higher.
+PySpark jobs are executed using a standard CPython interpreter in order to support Python modules that use C extensions.
+We have not tested PySpark with Python 3 or with alternative Python interpreters, such as [PyPy](http://pypy.org/) or [Jython](http://www.jython.org/).
+By default, PySpark's scripts will run programs using `python`; an alternate Python executable may be specified by setting the `PYSPARK_PYTHON` environment variable in `conf/spark-env.sh`.
+
+All of PySpark's library dependencies, including [Py4J](http://py4j.sourceforge.net/), are bundled with PySpark and automatically imported.
+
+Standalone PySpark jobs should be run using the `run-pyspark` script, which automatically configures the Java and Python environment using the settings in `conf/spark-env.sh`.
+The script automatically adds the `pyspark` package to the `PYTHONPATH`.
+
+
+# Interactive Use
+
+PySpark's `pyspark-shell` script provides a simple way to learn the API:
+
+{% highlight python %}
+>>> words = sc.textFile("/usr/share/dict/words")
+>>> words.filter(lambda w: w.startswith("spar")).take(5)
+[u'spar', u'sparable', u'sparada', u'sparadrap', u'sparagrass']
+{% endhighlight %}
+
+# Standalone Use
+
+PySpark can also be used from standalone Python scripts by creating a SparkContext in your script and launching it with the `run-pyspark` script in the `pyspark` directory.
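+For example, a script that creates its own SparkContext (here called `my_job.py`, an illustrative name) could be launched from the Spark directory with:
+
+{% highlight python %}
+$ ./pyspark/run-pyspark my_job.py
+{% endhighlight %}
+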
+The Quick Start guide includes a [complete example](quick-start.html#a-standalone-job-in-python) of a standalone Python job.
+
+Code dependencies can be deployed by listing them in the `pyFiles` option in the SparkContext constructor:
+
+{% highlight python %}
+from pyspark import SparkContext
+sc = SparkContext("local", "Job Name", pyFiles=['MyFile.py', 'lib.zip', 'app.egg'])
+{% endhighlight %}
+
+Files listed here will be added to the `PYTHONPATH` and shipped to remote worker machines.
+Code dependencies can also be added to an existing SparkContext using its `addPyFile()` method.
+
+# Where to Go from Here
+
+PySpark includes several sample programs using the Python API in `pyspark/examples`.
+You can run them by passing the files to the `run-pyspark` script included in PySpark -- for example, `./run-pyspark examples/wordcount.py`.
+Each example program prints usage help when run without any arguments.
+
+We currently provide [API documentation](api/pyspark/index.html) for the Python API, generated using Epydoc.
+Many of the RDD method descriptions contain [doctests](http://docs.python.org/2/library/doctest.html) that provide additional usage examples.
diff --git a/docs/quick-start.md b/docs/quick-start.md
index defdb34836..c859c31b09 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -6,7 +6,8 @@ title: Quick Start
* This will become a table of contents (this text will be scraped).
{:toc}

-This tutorial provides a quick introduction to using Spark. We will first introduce the API through Spark's interactive Scala shell (don't worry if you don't know Scala -- you will need much for this), then show how to write standalone jobs in Scala and Java. See the [programming guide](scala-programming-guide.html) for a fuller reference.
+This tutorial provides a quick introduction to using Spark. We will first introduce the API through Spark's interactive Scala shell (don't worry if you don't know Scala -- you won't need much for this), then show how to write standalone jobs in Scala, Java, and Python.
+See the [programming guide](scala-programming-guide.html) for a more complete reference.

To follow along with this guide, you only need to have successfully built Spark on one machine. Simply go into your Spark directory and run:

@@ -230,3 +231,40 @@ Lines with a: 8422, Lines with b: 1836
{% endhighlight %}

This example only runs the job locally; for a tutorial on running jobs across several machines, see the [Standalone Mode](spark-standalone.html) documentation, and consider using a distributed input source, such as HDFS.
+
+# A Standalone Job In Python
+Now we will show how to write a standalone job using the Python API (PySpark).
+
+As an example, we'll create a simple Spark job, `SimpleJob.py`:
+
+{% highlight python %}
+"""SimpleJob.py"""
+from pyspark import SparkContext
+
+logFile = "/var/log/syslog" # Should be some file on your system
+sc = SparkContext("local", "Simple job")
+logData = sc.textFile(logFile).cache()
+
+numAs = logData.filter(lambda s: 'a' in s).count()
+numBs = logData.filter(lambda s: 'b' in s).count()
+
+print "Lines with a: %i, lines with b: %i" % (numAs, numBs)
+{% endhighlight %}
+
+
+This job simply counts the number of lines containing 'a' and the number containing 'b' in a system log file.
+As in the Scala and Java examples, we use a SparkContext to create RDDs.
+We can pass Python functions to Spark, which are automatically serialized along with any variables that they reference.
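+As a brief illustration (the function and variable names here are just for this sketch), a named function that captures a local variable works the same way as the lambdas above:
+
+{% highlight python %}
+keyword = 'error'  # a local variable captured by the function below
+
+def containsKeyword(line):
+    return keyword in line
+
+numErrors = logData.filter(containsKeyword).count()
+{% endhighlight %}
+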
+For jobs that use custom classes or third-party libraries, we can add those code dependencies to SparkContext to ensure that they will be available on remote machines; this is described in more detail in the [Python programming guide](python-programming-guide.html).
+`SimpleJob` is simple enough that we do not need to specify any code dependencies.
+
+We can run this job using the `run-pyspark` script in `$SPARK_HOME/pyspark`:
+
+{% highlight python %}
+$ cd $SPARK_HOME
+$ ./pyspark/run-pyspark SimpleJob.py
+...
+Lines with a: 8422, Lines with b: 1836
+{% endhighlight %}
+
+This example only runs the job locally; for a tutorial on running jobs across several machines, see the [Standalone Mode](spark-standalone.html) documentation, and consider using a distributed input source, such as HDFS.
diff --git a/pyspark/README b/pyspark/README
deleted file mode 100644
index d8d521c72c..0000000000
--- a/pyspark/README
+++ /dev/null
@@ -1,42 +0,0 @@
-# PySpark
-
-PySpark is a Python API for Spark.
-
-PySpark jobs are writen in Python and executed using a standard Python
-interpreter; this supports modules that use Python C extensions. The
-API is based on the Spark Scala API and uses regular Python functions
-and lambdas to support user-defined functions. PySpark supports
-interactive use through a standard Python interpreter; it can
-automatically serialize closures and ship them to worker processes.
-
-PySpark is built on top of the Spark Java API. Data is uniformly
-represented as serialized Python objects and stored in Spark Java
-processes, which communicate with PySpark worker processes over pipes.
-
-## Features
-
-PySpark supports most of the Spark API, including broadcast variables.
-RDDs are dynamically typed and can hold any Python object.
-
-PySpark does not support:
-
-- Special functions on RDDs of doubles
-- Accumulators
-
-## Examples and Documentation
-
-The PySpark source contains docstrings and doctests that document its
-API. The public classes are in `context.py` and `rdd.py`.
-
-The `pyspark/pyspark/examples` directory contains a few complete
-examples.
-
-## Installing PySpark
-#
-To use PySpark, `SPARK_HOME` should be set to the location of the Spark
-package.
-
-## Running PySpark
-
-The easiest way to run PySpark is to use the `run-pyspark` and
-`pyspark-shell` scripts, which are included in the `pyspark` directory.
diff --git a/pyspark/pyspark/examples/kmeans.py b/pyspark/examples/kmeans.py similarity index 100% rename from pyspark/pyspark/examples/kmeans.py rename to pyspark/examples/kmeans.py diff --git a/pyspark/pyspark/examples/pi.py b/pyspark/examples/pi.py similarity index 100% rename from pyspark/pyspark/examples/pi.py rename to pyspark/examples/pi.py diff --git a/pyspark/pyspark/examples/tc.py b/pyspark/examples/tc.py similarity index 100% rename from pyspark/pyspark/examples/tc.py rename to pyspark/examples/tc.py diff --git a/pyspark/pyspark/examples/wordcount.py b/pyspark/examples/wordcount.py similarity index 100% rename from pyspark/pyspark/examples/wordcount.py rename to pyspark/examples/wordcount.py diff --git a/pyspark/pyspark/__init__.py b/pyspark/pyspark/__init__.py index 549c2d2711..8f8402b62b 100644 --- a/pyspark/pyspark/__init__.py +++ b/pyspark/pyspark/__init__.py @@ -1,3 +1,9 @@ import sys import os sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg")) + + +from pyspark.context import SparkContext + + +__all__ = ["SparkContext"] diff --git a/pyspark/pyspark/examples/__init__.py b/pyspark/pyspark/examples/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 -- GitLab