Skip to content
Snippets Groups Projects
Commit 7eb83fef authored by hyukjinkwon's avatar hyukjinkwon Committed by Reynold Xin
Browse files

[SPARK-13137][SQL] NullPointerException in schema inference for CSV when the first line is empty

https://issues.apache.org/jira/browse/SPARK-13137

This PR adds a filter in schema inference so that it does not throw a NullPointerException.

Also, I removed `MAX_COMMENT_LINES_IN_HEADER` and instead used monadic chaining with `filter()` and `first()`.

Lastly, I simply added a newline to an existing file rather than adding a new file, so that this case is covered by the original tests.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #11023 from HyukjinKwon/SPARK-13137.
parent b6a873d6
No related branches found
No related tags found
No related merge requests found
...@@ -75,9 +75,6 @@ private[sql] class CSVOptions( ...@@ -75,9 +75,6 @@ private[sql] class CSVOptions(
val ignoreLeadingWhiteSpaceFlag = getBool("ignoreLeadingWhiteSpace") val ignoreLeadingWhiteSpaceFlag = getBool("ignoreLeadingWhiteSpace")
val ignoreTrailingWhiteSpaceFlag = getBool("ignoreTrailingWhiteSpace") val ignoreTrailingWhiteSpaceFlag = getBool("ignoreTrailingWhiteSpace")
// Limit the number of lines we'll search for a header row that isn't comment-prefixed
val MAX_COMMENT_LINES_IN_HEADER = 10
// Parse mode flags // Parse mode flags
if (!ParseModes.isValidMode(parseMode)) { if (!ParseModes.isValidMode(parseMode)) {
logWarning(s"$parseMode is not a valid parse mode. Using ${ParseModes.DEFAULT}.") logWarning(s"$parseMode is not a valid parse mode. Using ${ParseModes.DEFAULT}.")
......
...@@ -154,12 +154,14 @@ private[csv] class CSVRelation( ...@@ -154,12 +154,14 @@ private[csv] class CSVRelation(
*/ */
private def findFirstLine(rdd: RDD[String]): String = { private def findFirstLine(rdd: RDD[String]): String = {
if (params.isCommentSet) { if (params.isCommentSet) {
rdd.take(params.MAX_COMMENT_LINES_IN_HEADER) val comment = params.comment.toString
.find(!_.startsWith(params.comment.toString)) rdd.filter { line =>
.getOrElse(sys.error(s"No uncommented header line in " + line.trim.nonEmpty && !line.startsWith(comment)
s"first ${params.MAX_COMMENT_LINES_IN_HEADER} lines")) }.first()
} else { } else {
rdd.first() rdd.filter { line =>
line.trim.nonEmpty
}.first()
} }
} }
} }
......
year,make,model,comment,blank year,make,model,comment,blank
"2012","Tesla","S","No comment", "2012","Tesla","S","No comment",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment