Skip to content
Snippets Groups Projects
Commit d60dde26 authored by windpiger's avatar windpiger Committed by Wenchen Fan
Browse files

[SPARK-19488][SQL] fix csv infer schema when the field is Nan/Inf etc

## What changes were proposed in this pull request?

When inferring a CSV schema, the user-defined CSVOptions are not used to parse the field; for example, `inf` and `-inf` should be parsed to DoubleType.

This PR adds a check against `options.nanValue`, `options.negativeInf`, and `options.positiveInf` to determine whether the field is a DoubleType.

## How was this patch tested?
unit test added

Author: windpiger <songjun@outlook.com>

Closes #16834 from windpiger/fixinferInfSchemaCsv.
parent 5a0569ce
No related branches found
No related tags found
No related merge requests found
...@@ -150,6 +150,10 @@ private[csv] object CSVInferSchema { ...@@ -150,6 +150,10 @@ private[csv] object CSVInferSchema {
} }
} }
/**
 * Tells whether `field` is exactly equal to one of the special values configured
 * on `options`: `nanValue`, `negativeInf`, or `positiveInf`.
 *
 * @param field   the raw field text to check
 * @param options the CSV options supplying the three special values
 * @return true when `field` equals any of the three configured values
 */
private def isInfOrNan(field: String, options: CSVOptions): Boolean = {
  val specialTokens = Seq(options.nanValue, options.negativeInf, options.positiveInf)
  specialTokens.contains(field)
}
private def tryParseInteger(field: String, options: CSVOptions): DataType = { private def tryParseInteger(field: String, options: CSVOptions): DataType = {
if ((allCatch opt field.toInt).isDefined) { if ((allCatch opt field.toInt).isDefined) {
IntegerType IntegerType
...@@ -185,7 +189,7 @@ private[csv] object CSVInferSchema { ...@@ -185,7 +189,7 @@ private[csv] object CSVInferSchema {
} }
private def tryParseDouble(field: String, options: CSVOptions): DataType = { private def tryParseDouble(field: String, options: CSVOptions): DataType = {
if ((allCatch opt field.toDouble).isDefined) { if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field, options)) {
DoubleType DoubleType
} else { } else {
tryParseTimestamp(field, options) tryParseTimestamp(field, options)
......
...@@ -131,4 +131,12 @@ class CSVInferSchemaSuite extends SparkFunSuite { ...@@ -131,4 +131,12 @@ class CSVInferSchemaSuite extends SparkFunSuite {
assert(CSVInferSchema.inferField(DecimalType(20, 0), "2015-12-01 00:00:00", options) assert(CSVInferSchema.inferField(DecimalType(20, 0), "2015-12-01 00:00:00", options)
== StringType) == StringType)
} }
test("DoubleType should be infered when user defined nan/inf are provided") {
  // User-supplied values for the nanValue / negativeInf / positiveInf options.
  val customTokens = Map("nanValue" -> "nan", "negativeInf" -> "-inf", "positiveInf" -> "inf")
  val options = new CSVOptions(customTokens)
  // Each configured token, starting from NullType, must be inferred as DoubleType.
  Seq("nan", "inf", "-inf").foreach { token =>
    assert(CSVInferSchema.inferField(NullType, token, options) == DoubleType)
  }
}
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment