From f18b905f6cace7686ef169fda7de474079d0af23 Mon Sep 17 00:00:00 2001 From: Wenchen Fan <wenchen@databricks.com> Date: Tue, 18 Jul 2017 15:56:16 -0700 Subject: [PATCH] [SPARK-21457][SQL] ExternalCatalog.listPartitions should correctly handle partition values with dot ## What changes were proposed in this pull request? When we list partitions from hive metastore with a partial partition spec, we are expecting exact matching according to the partition values. However, hive treats dot specially and matches any single character for dot. We should do an extra filter to drop unexpected partitions. ## How was this patch tested? New regression test. Author: Wenchen Fan <wenchen@databricks.com> Closes #18671 from cloud-fan/hive. --- .../sql/catalyst/catalog/ExternalCatalogUtils.scala | 12 ++++++++++++ .../spark/sql/catalyst/catalog/InMemoryCatalog.scala | 12 ------------ .../sql/catalyst/catalog/ExternalCatalogSuite.scala | 12 ++++++++++++ .../apache/spark/sql/hive/HiveExternalCatalog.scala | 12 +++++++++++- 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala index 1fc3a654cf..50f32e81d9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala @@ -159,6 +159,18 @@ object ExternalCatalogUtils { } } } + + /** + * Returns true if `spec1` is a partial partition spec w.r.t. `spec2`, e.g. PARTITION (a=1) is a + * partial partition spec w.r.t. PARTITION (a=1,b=2). 
+ */ + def isPartialPartitionSpec( + spec1: TablePartitionSpec, + spec2: TablePartitionSpec): Boolean = { + spec1.forall { + case (partitionColumn, value) => spec2(partitionColumn) == value + } + } } object CatalogUtils { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index d253c72a62..37e9eeadaa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -553,18 +553,6 @@ class InMemoryCatalog( } } - /** - * Returns true if `spec1` is a partial partition spec w.r.t. `spec2`, e.g. PARTITION (a=1) is a - * partial partition spec w.r.t. PARTITION (a=1,b=2). - */ - private def isPartialPartitionSpec( - spec1: TablePartitionSpec, - spec2: TablePartitionSpec): Boolean = { - spec1.forall { - case (partitionColumn, value) => spec2(partitionColumn) == value - } - } - override def listPartitionsByFilter( db: String, table: String, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala index 66e895a469..94593ef7ef 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala @@ -448,6 +448,18 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac assert(catalog.listPartitions("db2", "tbl2", Some(Map("a" -> "unknown"))).isEmpty) } + test("SPARK-21457: list partitions with special chars") { + val catalog = newBasicCatalog() + assert(catalog.listPartitions("db2", "tbl1").isEmpty) + + val part1 = CatalogTablePartition(Map("a" -> "1", "b" -> "i+j"), storageFormat) + val part2 = 
CatalogTablePartition(Map("a" -> "1", "b" -> "i.j"), storageFormat) + catalog.createPartitions("db2", "tbl1", Seq(part1, part2), ignoreIfExists = false) + + assert(catalog.listPartitions("db2", "tbl1", Some(part1.spec)).map(_.spec) == Seq(part1.spec)) + assert(catalog.listPartitions("db2", "tbl1", Some(part2.spec)).map(_.spec) == Seq(part2.spec)) + } + test("list partitions by filter") { val tz = TimeZone.getDefault.getID val catalog = newBasicCatalog() diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 306b38048e..70d7dd23d9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -1088,9 +1088,19 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat table: String, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] = withClient { val partColNameMap = buildLowerCasePartColNameMap(getTable(db, table)) - client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part => + val res = client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part => part.copy(spec = restorePartitionSpec(part.spec, partColNameMap)) } + + partialSpec match { + // This might be a bug of Hive: When the partition value inside the partial partition spec + // contains dot, and we ask Hive to list partitions w.r.t. the partial partition spec, Hive + // treats dot as matching any single character and may return more partitions than we + // expected. Here we do an extra filter to drop unexpected partitions. + case Some(spec) if spec.exists(_._2.contains(".")) => + res.filter(p => isPartialPartitionSpec(spec, p.spec)) + case _ => res + } } override def listPartitionsByFilter( -- GitLab