Skip to content
Snippets Groups Projects
Commit 3a8b698e authored by CodingCat's avatar CodingCat Committed by Patrick Wendell
Browse files

[SPARK-1100] prevent Spark from overwriting directory silently

Thanks for Diana Carroll to report this issue (https://spark-project.atlassian.net/browse/SPARK-1100)

the current saveAsTextFile/SequenceFile will overwrite the output directory silently if the directory already exists, this behaviour is not desirable because

overwriting the data silently is not user-friendly

if the partition number of two writing operation changed, then the output directory will contain the results generated by two runnings

My fix includes:

add some new APIs with a flag for users to define whether he/she wants to overwrite the directory:
if the flag is set to true, then the output directory is deleted first and then written into the new data to prevent the output directory contains results from multiple rounds of running;

if the flag is set to false, Spark will throw an exception if the output directory already exists

changed JavaAPI part

default behaviour is overwriting

Two questions

should we deprecate the old APIs without such a flag?

I noticed that Spark Streaming also called these APIs, I thought we don't need to change the related part in streaming? @tdas

Author: CodingCat <zhunansjtu@gmail.com>

Closes #11 from CodingCat/SPARK-1100 and squashes the following commits:

6a4e3a3 [CodingCat] code clean
ef2d43f [CodingCat] add new test cases and code clean
ac63136 [CodingCat] checkOutputSpecs not applicable to FSOutputFormat
ec490e8 [CodingCat] prevent Spark from overwriting directory silently and leaving dirty directory
parent fe195ae1
No related branches found
No related tags found
No related merge requests found
...@@ -30,18 +30,15 @@ import scala.reflect.ClassTag ...@@ -30,18 +30,15 @@ import scala.reflect.ClassTag
import com.clearspring.analytics.stream.cardinality.HyperLogLog import com.clearspring.analytics.stream.cardinality.HyperLogLog
import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.fs.Path import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.SequenceFile.CompressionType
import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf, OutputFormat} import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf, OutputFormat}
import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat} import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat, Job => NewAPIHadoopJob, RecordWriter => NewRecordWriter, JobContext, SparkHadoopMapReduceUtil}
import org.apache.hadoop.mapreduce.{Job => NewAPIHadoopJob}
import org.apache.hadoop.mapreduce.{RecordWriter => NewRecordWriter}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat => NewFileOutputFormat} import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat => NewFileOutputFormat}
// SparkHadoopWriter and SparkHadoopMapReduceUtil are actually source files defined in Spark. // SparkHadoopWriter and SparkHadoopMapReduceUtil are actually source files defined in Spark.
import org.apache.hadoop.mapred.SparkHadoopWriter import org.apache.hadoop.mapred.SparkHadoopWriter
import org.apache.hadoop.mapreduce.SparkHadoopMapReduceUtil
import org.apache.spark._ import org.apache.spark._
import org.apache.spark.Partitioner.defaultPartitioner import org.apache.spark.Partitioner.defaultPartitioner
...@@ -604,8 +601,12 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)]) ...@@ -604,8 +601,12 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
val job = new NewAPIHadoopJob(conf) val job = new NewAPIHadoopJob(conf)
job.setOutputKeyClass(keyClass) job.setOutputKeyClass(keyClass)
job.setOutputValueClass(valueClass) job.setOutputValueClass(valueClass)
val wrappedConf = new SerializableWritable(job.getConfiguration) val wrappedConf = new SerializableWritable(job.getConfiguration)
NewFileOutputFormat.setOutputPath(job, new Path(path)) val outpath = new Path(path)
NewFileOutputFormat.setOutputPath(job, outpath)
val jobFormat = outputFormatClass.newInstance
jobFormat.checkOutputSpecs(job)
val formatter = new SimpleDateFormat("yyyyMMddHHmm") val formatter = new SimpleDateFormat("yyyyMMddHHmm")
val jobtrackerID = formatter.format(new Date()) val jobtrackerID = formatter.format(new Date())
val stageId = self.id val stageId = self.id
...@@ -633,7 +634,7 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)]) ...@@ -633,7 +634,7 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
committer.commitTask(hadoopContext) committer.commitTask(hadoopContext)
return 1 return 1
} }
val jobFormat = outputFormatClass.newInstance
/* apparently we need a TaskAttemptID to construct an OutputCommitter; /* apparently we need a TaskAttemptID to construct an OutputCommitter;
* however we're only going to use this local OutputCommitter for * however we're only going to use this local OutputCommitter for
* setupJob/commitJob, so we just use a dummy "map" task. * setupJob/commitJob, so we just use a dummy "map" task.
...@@ -642,7 +643,7 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)]) ...@@ -642,7 +643,7 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId) val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId)
val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext) val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext)
jobCommitter.setupJob(jobTaskContext) jobCommitter.setupJob(jobTaskContext)
val count = self.context.runJob(self, writeShard _).sum self.context.runJob(self, writeShard _)
jobCommitter.commitJob(jobTaskContext) jobCommitter.commitJob(jobTaskContext)
} }
...@@ -696,10 +697,10 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)]) ...@@ -696,10 +697,10 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
* MapReduce job. * MapReduce job.
*/ */
def saveAsHadoopDataset(conf: JobConf) { def saveAsHadoopDataset(conf: JobConf) {
val outputFormatClass = conf.getOutputFormat val outputFormatInstance = conf.getOutputFormat
val keyClass = conf.getOutputKeyClass val keyClass = conf.getOutputKeyClass
val valueClass = conf.getOutputValueClass val valueClass = conf.getOutputValueClass
if (outputFormatClass == null) { if (outputFormatInstance == null) {
throw new SparkException("Output format class not set") throw new SparkException("Output format class not set")
} }
if (keyClass == null) { if (keyClass == null) {
...@@ -712,6 +713,12 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)]) ...@@ -712,6 +713,12 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
logDebug("Saving as hadoop file of type (" + keyClass.getSimpleName + ", " + logDebug("Saving as hadoop file of type (" + keyClass.getSimpleName + ", " +
valueClass.getSimpleName + ")") valueClass.getSimpleName + ")")
if (outputFormatInstance.isInstanceOf[FileOutputFormat[_, _]]) {
// FileOutputFormat ignores the filesystem parameter
val ignoredFs = FileSystem.get(conf)
conf.getOutputFormat.checkOutputSpecs(ignoredFs, conf)
}
val writer = new SparkHadoopWriter(conf) val writer = new SparkHadoopWriter(conf)
writer.preSetup() writer.preSetup()
......
...@@ -24,9 +24,11 @@ import scala.io.Source ...@@ -24,9 +24,11 @@ import scala.io.Source
import com.google.common.io.Files import com.google.common.io.Files
import org.apache.hadoop.io._ import org.apache.hadoop.io._
import org.apache.hadoop.io.compress.DefaultCodec import org.apache.hadoop.io.compress.DefaultCodec
import org.apache.hadoop.mapred.FileAlreadyExistsException
import org.scalatest.FunSuite import org.scalatest.FunSuite
import org.apache.spark.SparkContext._ import org.apache.spark.SparkContext._
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
class FileSuite extends FunSuite with LocalSparkContext { class FileSuite extends FunSuite with LocalSparkContext {
...@@ -208,4 +210,44 @@ class FileSuite extends FunSuite with LocalSparkContext { ...@@ -208,4 +210,44 @@ class FileSuite extends FunSuite with LocalSparkContext {
assert(rdd.count() === 3) assert(rdd.count() === 3)
assert(rdd.count() === 3) assert(rdd.count() === 3)
} }
test ("prevent user from overwriting the empty directory (old Hadoop API)") {
sc = new SparkContext("local", "test")
val tempdir = Files.createTempDir()
val randomRDD = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1)
intercept[FileAlreadyExistsException] {
randomRDD.saveAsTextFile(tempdir.getPath)
}
}
test ("prevent user from overwriting the non-empty directory (old Hadoop API)") {
sc = new SparkContext("local", "test")
val tempdir = Files.createTempDir()
val randomRDD = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1)
randomRDD.saveAsTextFile(tempdir.getPath + "/output")
assert(new File(tempdir.getPath + "/output/part-00000").exists() === true)
intercept[FileAlreadyExistsException] {
randomRDD.saveAsTextFile(tempdir.getPath + "/output")
}
}
test ("prevent user from overwriting the empty directory (new Hadoop API)") {
sc = new SparkContext("local", "test")
val tempdir = Files.createTempDir()
val randomRDD = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)
intercept[FileAlreadyExistsException] {
randomRDD.saveAsNewAPIHadoopFile[TextOutputFormat[String, String]](tempdir.getPath)
}
}
test ("prevent user from overwriting the non-empty directory (new Hadoop API)") {
sc = new SparkContext("local", "test")
val tempdir = Files.createTempDir()
val randomRDD = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)
randomRDD.saveAsTextFile(tempdir.getPath + "/output")
assert(new File(tempdir.getPath + "/output/part-00000").exists() === true)
intercept[FileAlreadyExistsException] {
randomRDD.saveAsNewAPIHadoopFile[TextOutputFormat[String, String]](tempdir.getPath)
}
}
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment