Skip to content
Snippets Groups Projects
Unverified Commit 76e9bd74 authored by uncleGen's avatar uncleGen Committed by Sean Owen
Browse files

[SPARK-18960][SQL][SS] Avoid double reading file which is being copied.

## What changes were proposed in this pull request?

In HDFS, when we copy a file into a target directory, there will be a temporary `._COPYING_` file for a period of time. The duration depends on the file size. If we do not skip this file, we may read the same data twice.

## How was this patch tested?
update unit test

Author: uncleGen <hustyugm@gmail.com>

Closes #16370 from uncleGen/SPARK-18960.
parent 67fb33e7
No related branches found
No related tags found
No related merge requests found
......@@ -439,10 +439,15 @@ object PartitioningAwareFileIndex extends Logging {
/**
 * Checks if we should filter out this path name.
 *
 * @param pathName the path name to test
 * @return `true` when the name matches one of the exclusion rules (starts with `_` and does not
 *         contain `=`, starts with `.`, or ends with `._COPYING_`) and does not start with
 *         `_common_metadata` or `_metadata`; `false` otherwise
 */
def shouldFilterOut(pathName: String): Boolean = {
// We filter everything that starts with _ and ., except _common_metadata and _metadata
// We filter out the following paths:
// 1. everything that starts with _ and ., except _common_metadata and _metadata,
// because Parquet needs to find those metadata files from leaf files returned by this method.
// We should refactor this logic to not mix metadata files with data files.
// NOTE(review): the two-line expression below looks like the pre-change body carried over from
// the diff view; as written here its value is not used, since the method result is the final
// `exclude && !include` expression. Confirm against the actual file before relying on it.
((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
!pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
// 2. everything that ends with `._COPYING_`, because this is an intermediate state of a file
// that is still being copied. We should skip such a file to avoid reading its data twice.
// Names matching any of the exclusion rules above.
val exclude = (pathName.startsWith("_") && !pathName.contains("=")) ||
pathName.startsWith(".") || pathName.endsWith("._COPYING_")
// Metadata file names that must be kept even though they start with `_`.
val include = pathName.startsWith("_common_metadata") || pathName.startsWith("_metadata")
exclude && !include
}
}
......@@ -142,6 +142,7 @@ class FileIndexSuite extends SharedSQLContext {
assert(!PartitioningAwareFileIndex.shouldFilterOut("_common_metadata"))
assert(PartitioningAwareFileIndex.shouldFilterOut("_ab_metadata"))
assert(PartitioningAwareFileIndex.shouldFilterOut("_cd_common_metadata"))
assert(PartitioningAwareFileIndex.shouldFilterOut("a._COPYING_"))
}
test("SPARK-17613 - PartitioningAwareFileIndex: base path w/o '/' at end") {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment