diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index 811e96c99a96d6b2f3362e11737d7a89c1c1adcc..cef9d4d9c7f1b333af5bb22e0c423e0395977d22 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -204,6 +204,6 @@ abstract class PartitioningAwareFileCatalog( private def isDataPath(path: Path): Boolean = { val name = path.getName - !(name.startsWith("_") || name.startsWith(".")) + !((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index f068779b3e047d563bbc6a7c8cf75584f61405c8..e03a2323c7493a8177a3f8a5be0bbd7c0bde2dd0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -364,7 +364,7 @@ object HadoopFsRelation extends Logging { // We filter everything that starts with _ and ., except _common_metadata and _metadata // because Parquet needs to find those metadata files from leaf files returned by this method. // We should refactor this logic to not mix metadata files with data files. - (pathName.startsWith("_") || pathName.startsWith(".")) && + ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 19681be60465268a2eadd0b0299ce226ef65be93..27910e2cddad826395a026d48b56448e7b2316aa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -54,7 +54,7 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { .getOrElse(sparkSession.sessionState.conf.columnNameOfCorruptRecord) val jsonFiles = files.filterNot { status => val name = status.getPath.getName - name.startsWith("_") || name.startsWith(".") + (name.startsWith("_") && !name.contains("=")) || name.startsWith(".") }.toArray val jsonSchema = InferSchema.infer( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 7794f31331a86be2b87ad5f3637f3f4d51a5896a..9c4778acf53d7681866ab00b93d8ea29d7f26f83 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -236,7 +236,8 @@ class ParquetFileFormat // Lists `FileStatus`es of all leaf nodes (files) under all base directories. val leaves = allFiles.filter { f => isSummaryFile(f.getPath) || - !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith(".")) + !((f.getPath.getName.startsWith("_") && !f.getPath.getName.contains("=")) || + f.getPath.getName.startsWith(".")) }.toArray.sortBy(_.getPath.toString) FileTypes( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index eac588fff2fc72f467e84a614bc00d3458514d1c..4fcde58833d76be92224bedf5265275ca195b5a2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import java.io.File import java.math.MathContext import java.sql.{Date, Timestamp} @@ -2637,6 +2638,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } + test("SPARK-16975: Column-partition path starting '_' should be handled correctly") { + withTempDir { dir => + val parquetDir = new File(dir, "parquet").getCanonicalPath + spark.range(10).withColumn("_col", $"id").write.partitionBy("_col").save(parquetDir) + spark.read.parquet(parquetDir) + } + } + test("SPARK-16644: Aggregate should not put aggregate expressions to constraints") { withTable("tbl") { sql("CREATE TABLE tbl(a INT, b INT) USING parquet")