Commit 5b6ea9e9 authored by Josh Rosen

Update checkpointing API docs in Python/Java.

parent d0ba80dc
@@ -307,16 +307,13 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
implicit val kcm: ClassManifest[K] = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K]]
JavaPairRDD.fromRDD(rdd.keyBy(f))
}
- /**
-  * Mark this RDD for checkpointing. The RDD will be saved to a file inside `checkpointDir`
-  * (set using setCheckpointDir()) and all references to its parent RDDs will be removed.
-  * This is used to truncate very long lineages. In the current implementation, Spark will save
-  * this RDD to a file (using saveAsObjectFile()) after the first job using this RDD is done.
-  * Hence, it is strongly recommended to use checkpoint() on RDDs when
-  * (i) checkpoint() is called before the any job has been executed on this RDD.
-  * (ii) This RDD has been made to persist in memory. Otherwise saving it on a file will
-  * require recomputation.
+ /**
+  * Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint
+  * directory set with SparkContext.setCheckpointDir() and all references to its parent
+  * RDDs will be removed. This function must be called before any job has been
+  * executed on this RDD. It is strongly recommended that this RDD is persisted in
+  * memory, otherwise saving it on a file will require recomputation.
*/
def checkpoint() = rdd.checkpoint()
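Both versions of the Javadoc above describe the same contract: checkpoint() must be called before any job has executed on the RDD, and the RDD should be persisted in memory so that writing the checkpoint file does not recompute it. A minimal PySpark sketch of that ordering (the local master, app name, and directory are placeholder choices, not part of this commit):

    from pyspark import SparkContext

    sc = SparkContext("local", "checkpoint-demo")
    sc.setCheckpointDir("/tmp/demo-checkpoints")  # placeholder; use an HDFS path on a cluster

    rdd = sc.parallelize(range(1000)).map(lambda x: x * 2)
    rdd.cache()       # persist first, so the checkpoint write reuses cached partitions
    rdd.checkpoint()  # mark for checkpointing before any job runs on this RDD
    rdd.count()       # the first job triggers the actual save into the checkpoint dir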
@@ -357,20 +357,21 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWorkaround {
}
/**
-  * Set the directory under which RDDs are going to be checkpointed. This method will
-  * create this directory and will throw an exception of the path already exists (to avoid
-  * overwriting existing files may be overwritten). The directory will be deleted on exit
-  * if indicated.
+  * Set the directory under which RDDs are going to be checkpointed. The directory must
+  * be an HDFS path if running on a cluster. If the directory does not exist, it will
+  * be created. If the directory exists and useExisting is set to true, then the
+  * existing directory will be used. Otherwise an exception will be thrown to
+  * prevent accidental overriding of checkpoint files in the existing directory.
*/
def setCheckpointDir(dir: String, useExisting: Boolean) {
sc.setCheckpointDir(dir, useExisting)
}
/**
-  * Set the directory under which RDDs are going to be checkpointed. This method will
-  * create this directory and will throw an exception of the path already exists (to avoid
-  * overwriting existing files may be overwritten). The directory will be deleted on exit
-  * if indicated.
+  * Set the directory under which RDDs are going to be checkpointed. The directory must
+  * be an HDFS path if running on a cluster. If the directory does not exist, it will
+  * be created. If the directory exists, an exception will be thrown to prevent accidental
+  * overriding of checkpoint files.
*/
def setCheckpointDir(dir: String) {
sc.setCheckpointDir(dir)
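The two overloads above differ only in useExisting. A sketch of the corresponding calls through the Python wrapper (the useExisting parameter appears in the Python signature later in this same diff; the path is a placeholder and sc is a SparkContext as in the sketch above):

    # Reuse a checkpoint directory left over from a previous run:
    sc.setCheckpointDir("/tmp/demo-checkpoints", useExisting=True)

    # Default behaviour (useExisting=False): if the directory already exists,
    # an exception is thrown rather than risk overriding old checkpoint files.
    # sc.setCheckpointDir("/tmp/demo-checkpoints")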
@@ -202,9 +202,12 @@ class SparkContext(object):
def setCheckpointDir(self, dirName, useExisting=False):
"""
-        Set the directory under which RDDs are going to be checkpointed. This
-        method will create this directory and will throw an exception of the
-        path already exists (to avoid overwriting existing files may be
-        overwritten). The directory will be deleted on exit if indicated.
+        Set the directory under which RDDs are going to be checkpointed. The
+        directory must be an HDFS path if running on a cluster.
+        If the directory does not exist, it will be created. If the directory
+        exists and C{useExisting} is set to true, then the existing directory
+        will be used. Otherwise an exception will be thrown to prevent
+        accidental overriding of checkpoint files in the existing directory.
"""
self._jsc.sc().setCheckpointDir(dirName, useExisting)
@@ -52,18 +52,11 @@ class RDD(object):
def checkpoint(self):
"""
-        Mark this RDD for checkpointing. The RDD will be saved to a file inside
-        `checkpointDir` (set using setCheckpointDir()) and all references to
-        its parent RDDs will be removed. This is used to truncate very long
-        lineages. In the current implementation, Spark will save this RDD to
-        a file (using saveAsObjectFile()) after the first job using this RDD is
-        done. Hence, it is strongly recommended to use checkpoint() on RDDs
-        when
-        (i) checkpoint() is called before the any job has been executed on this
-        RDD.
-        (ii) This RDD has been made to persist in memory. Otherwise saving it
+        Mark this RDD for checkpointing. It will be saved to a file inside the
+        checkpoint directory set with L{SparkContext.setCheckpointDir()} and
+        all references to its parent RDDs will be removed. This function must
+        be called before any job has been executed on this RDD. It is strongly
+        recommended that this RDD is persisted in memory, otherwise saving it
on a file will require recomputation.
"""
self.is_checkpointed = True
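The point of the contract in both docstrings is lineage truncation: once the first job finishes, the RDD's parent references are replaced by a read of the checkpoint data. A sketch of how one might observe this in PySpark, assuming a SparkContext sc with a checkpoint directory already set (toDebugString() output varies by version, so the exact text is not guaranteed):

    rdd = sc.parallelize(range(100))
    for _ in range(5):              # build a deliberately long lineage
        rdd = rdd.map(lambda x: x + 1)

    rdd.cache()
    rdd.checkpoint()
    print(rdd.toDebugString())      # long chain of map() parents
    rdd.count()                     # first job materializes the checkpoint
    print(rdd.toDebugString())      # lineage now rooted at the checkpoint data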