Skip to content
Snippets Groups Projects
Commit f3dea607 authored by Takeshi Yamamuro's avatar Takeshi Yamamuro Committed by gatorsmile
Browse files

[SPARK-21144][SQL] Print a warning if the data schema and partition schema...

[SPARK-21144][SQL] Print a warning if the data schema and partition schema have the duplicate columns

## What changes were proposed in this pull request?
The current master outputs unexpected results when the data schema and partition schema have the duplicate columns:
```
withTempPath { dir =>
  val basePath = dir.getCanonicalPath
  spark.range(0, 3).toDF("foo").write.parquet(new Path(basePath, "foo=1").toString)
  spark.range(0, 3).toDF("foo").write.parquet(new Path(basePath, "foo=a").toString)
  spark.read.parquet(basePath).show()
}

+---+
|foo|
+---+
|  1|
|  1|
|  a|
|  a|
|  1|
|  a|
+---+
```
This patch added code to print a warning when the duplication found.

## How was this patch tested?
Manually checked.

Author: Takeshi Yamamuro <yamamuro@apache.org>

Closes #18375 from maropu/SPARK-21144-3.
parent 5dca10b8
No related branches found
No related tags found
No related merge requests found
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.util
import org.apache.spark.internal.Logging
/**
* Utils for handling schemas.
*
* TODO: Merge this file with [[org.apache.spark.ml.util.SchemaUtils]].
*/
private[spark] object SchemaUtils extends Logging {
/**
* Checks if input column names have duplicate identifiers. Prints a warning message if
* the duplication exists.
*
* @param columnNames column names to check
* @param colType column type name, used in a warning message
* @param caseSensitiveAnalysis whether duplication checks should be case sensitive or not
*/
def checkColumnNameDuplication(
columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = {
val names = if (caseSensitiveAnalysis) {
columnNames
} else {
columnNames.map(_.toLowerCase)
}
if (names.distinct.length != names.length) {
val duplicateColumns = names.groupBy(identity).collect {
case (x, ys) if ys.length > 1 => s"`$x`"
}
logWarning(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}. " +
"You might need to assign different column names.")
}
}
}
...@@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.streaming._ ...@@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.sources._ import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{CalendarIntervalType, StructType} import org.apache.spark.sql.types.{CalendarIntervalType, StructType}
import org.apache.spark.sql.util.SchemaUtils
import org.apache.spark.util.Utils import org.apache.spark.util.Utils
/** /**
...@@ -182,6 +183,11 @@ case class DataSource( ...@@ -182,6 +183,11 @@ case class DataSource(
throw new AnalysisException( throw new AnalysisException(
s"Unable to infer schema for $format. It must be specified manually.") s"Unable to infer schema for $format. It must be specified manually.")
} }
SchemaUtils.checkColumnNameDuplication(
(dataSchema ++ partitionSchema).map(_.name), "in the data schema and the partition schema",
sparkSession.sessionState.conf.caseSensitiveAnalysis)
(dataSchema, partitionSchema) (dataSchema, partitionSchema)
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment