Commit b463e6d6 authored by Cheng Lian, committed by Yin Huai

[SPARK-7868] [SQL] Ignores _temporary directories in HadoopFsRelation

This ensures that partial or corrupted data files left behind by failed tasks/jobs won't affect normal data scans.

Author: Cheng Lian <lian@databricks.com>

Closes #6411 from liancheng/spark-7868 and squashes the following commits:

273ea36 [Cheng Lian] Ignores _temporary directories
parent 0c33c7b4
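The essence of the change is a pruning rule in HadoopFsRelation's recursive file listing: any directory named "_temporary" is skipped entirely. A minimal standalone sketch of the same idea against the Hadoop FileSystem API (this is not the patched Spark code itself; the object name and the example path are hypothetical):

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

    object TemporaryDirPruning {
      // Recursively collect leaf files, pruning any subtree rooted at a
      // "_temporary" directory, where failed tasks/jobs may leave partial output.
      def listLeafFiles(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
        if (status.getPath.getName.toLowerCase == "_temporary") {
          Set.empty
        } else if (status.isDir) {
          fs.listStatus(status.getPath).flatMap(listLeafFiles(fs, _)).toSet
        } else {
          Set(status)
        }
      }

      def main(args: Array[String]): Unit = {
        val fs = FileSystem.getLocal(new Configuration())
        val root = fs.getFileStatus(new Path("/tmp/table")) // hypothetical path
        // Prints only committed files; anything under _temporary is skipped.
        listLeafFiles(fs, root).foreach(s => println(s.getPath))
      }
    }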
@@ -31,7 +31,7 @@ import org.apache.spark.SerializableWritable
 import org.apache.spark.sql.{Row, _}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
-import org.apache.spark.sql.types.{StructField, StructType}
+import org.apache.spark.sql.types.StructType
 
 /**
  * ::DeveloperApi::
@@ -378,16 +378,22 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]]
 
   def refresh(): Unit = {
+    // We don't filter files/directories whose names start with "_" or "." here, as specific data
+    // sources may take advantage of them (e.g. Parquet _metadata and _common_metadata files).
+    // But "_temporary" directories are explicitly ignored since failed tasks/jobs may leave
+    // partial/corrupted data files there.
     def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
-      val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
-      val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
-      files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
+      if (status.getPath.getName.toLowerCase == "_temporary") {
+        Set.empty
+      } else {
+        val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
+        val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
+        files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
+      }
     }
 
     leafFiles.clear()
 
-    // We don't filter files/directories like _temporary/_SUCCESS here, as specific data sources
-    // may take advantage of them (e.g. Parquet _metadata and _common_metadata files).
     val statuses = paths.flatMap { path =>
      val hdfsPath = new Path(path)
      val fs = hdfsPath.getFileSystem(hadoopConf)
@@ -395,7 +401,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
       Try(fs.getFileStatus(qualified)).toOption.toArray.flatMap(listLeafFilesAndDirs(fs, _))
     }
 
-    val (dirs, files) = statuses.partition(_.isDir)
+    val files = statuses.filterNot(_.isDir)
     leafFiles ++= files.map(f => f.getPath -> f).toMap
     leafDirToChildrenFiles ++= files.groupBy(_.getPath.getParent)
   }
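Background on why "_temporary" shows up at all (general Hadoop behavior, not something introduced by this patch): Hadoop's FileOutputCommitter stages in-flight task output under a "_temporary" subdirectory of the job's output path and only moves it into place on successful commit. A failed job can therefore leave a layout roughly like the following behind (illustrative example):

    /path/to/table/part-r-00001.parquet                    <- committed data
    /path/to/table/_temporary/0/_temporary/attempt_.../    <- leftover partial output

Without the pruning above, a scan of the table would pick up the partial files under "_temporary" alongside the committed ones.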
@@ -548,4 +548,20 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest
       checkAnswer(table("t"), df.select('b, 'c, 'a).collect())
     }
   }
+
+  test("SPARK-7868: _temporary directories should be ignored") {
+    withTempPath { dir =>
+      val df = Seq("a", "b", "c").zipWithIndex.toDF()
+
+      df.write
+        .format("parquet")
+        .save(dir.getCanonicalPath)
+
+      df.write
+        .format("parquet")
+        .save(s"${dir.getCanonicalPath}/_temporary")
+
+      checkAnswer(read.format("parquet").load(dir.getCanonicalPath), df.collect())
+    }
+  }
 }