From 9a74de18a13d84805e1a448f858bb05ce30de87e Mon Sep 17 00:00:00 2001
From: Shixiong Zhu <shixiong@databricks.com>
Date: Tue, 31 May 2016 14:50:07 -0700
Subject: [PATCH] Revert "[SPARK-11753][SQL][TEST-HADOOP2.2] Make
 allowNonNumericNumbers option work"

## What changes were proposed in this pull request?

This reverts commit c24b6b679c3efa053f7de19be73eb36dc70d9930.

Sent a PR to run Jenkins tests due to the revert conflicts of
`dev/deps/spark-deps-hadoop*`.

## How was this patch tested?

Jenkins unit tests, integration tests, manual tests.

Author: Shixiong Zhu <shixiong@databricks.com>

Closes #13417 from zsxwing/revert-SPARK-11753.
---
 dev/deps/spark-deps-hadoop-2.2 | 12 ++--
 dev/deps/spark-deps-hadoop-2.3 | 12 ++--
 dev/deps/spark-deps-hadoop-2.4 | 12 ++--
 dev/deps/spark-deps-hadoop-2.6 | 12 ++--
 dev/deps/spark-deps-hadoop-2.7 | 12 ++--
 pom.xml | 8 +--
 python/pyspark/sql/readwriter.py | 3 -
 .../apache/spark/sql/DataFrameReader.scala | 2 -
 .../datasources/json/JacksonParser.scala | 28 ++++-----
 .../json/JsonParsingOptionsSuite.scala | 59 +++++--------
 10 files changed, 58 insertions(+), 102 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index deec033c21..b5c38a6c05 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -72,13 +72,13 @@ hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
 ivy-2.4.0.jar
-jackson-annotations-2.7.3.jar
-jackson-core-2.7.3.jar
+jackson-annotations-2.6.5.jar
+jackson-core-2.6.5.jar
 jackson-core-asl-1.9.13.jar
-jackson-databind-2.7.3.jar
+jackson-databind-2.6.5.jar
 jackson-mapper-asl-1.9.13.jar
-jackson-module-paranamer-2.7.3.jar
-jackson-module-scala_2.11-2.7.3.jar
+jackson-module-paranamer-2.6.5.jar
+jackson-module-scala_2.11-2.6.5.jar
 janino-2.7.8.jar
 javassist-3.18.1-GA.jar
 javax.annotation-api-1.2.jar
@@ -128,7 +128,7 @@ objenesis-2.1.jar
 opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
-paranamer-2.8.jar
+paranamer-2.3.jar
 parquet-column-1.8.1.jar
 parquet-common-1.8.1.jar
 parquet-encoding-1.8.1.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 43c7dd3580..969df0495d 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -74,13 +74,13 @@ hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
 ivy-2.4.0.jar
-jackson-annotations-2.7.3.jar
-jackson-core-2.7.3.jar
+jackson-annotations-2.6.5.jar
+jackson-core-2.6.5.jar
 jackson-core-asl-1.9.13.jar
-jackson-databind-2.7.3.jar
+jackson-databind-2.6.5.jar
 jackson-mapper-asl-1.9.13.jar
-jackson-module-paranamer-2.7.3.jar
-jackson-module-scala_2.11-2.7.3.jar
+jackson-module-paranamer-2.6.5.jar
+jackson-module-scala_2.11-2.6.5.jar
 janino-2.7.8.jar
 java-xmlbuilder-1.0.jar
 javassist-3.18.1-GA.jar
@@ -135,7 +135,7 @@ objenesis-2.1.jar
 opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
-paranamer-2.8.jar
+paranamer-2.3.jar
 parquet-column-1.8.1.jar
 parquet-common-1.8.1.jar
 parquet-encoding-1.8.1.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 7186b305a8..f0491ece7c 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -74,13 +74,13 @@ hk2-utils-2.4.0-b34.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
 ivy-2.4.0.jar
-jackson-annotations-2.7.3.jar
-jackson-core-2.7.3.jar
+jackson-annotations-2.6.5.jar
+jackson-core-2.6.5.jar
 jackson-core-asl-1.9.13.jar
-jackson-databind-2.7.3.jar
+jackson-databind-2.6.5.jar
 jackson-mapper-asl-1.9.13.jar
-jackson-module-paranamer-2.7.3.jar
-jackson-module-scala_2.11-2.7.3.jar
+jackson-module-paranamer-2.6.5.jar
+jackson-module-scala_2.11-2.6.5.jar
 janino-2.7.8.jar
 java-xmlbuilder-1.0.jar
 javassist-3.18.1-GA.jar
@@ -135,7 +135,7 @@ objenesis-2.1.jar
 opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
-paranamer-2.8.jar
+paranamer-2.3.jar
 parquet-column-1.8.1.jar
 parquet-common-1.8.1.jar
 parquet-encoding-1.8.1.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 3e4ed74cc6..b3dced63b9 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -80,14 +80,14 @@ htrace-core-3.0.4.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
 ivy-2.4.0.jar
-jackson-annotations-2.7.3.jar
-jackson-core-2.7.3.jar
+jackson-annotations-2.6.5.jar
+jackson-core-2.6.5.jar
 jackson-core-asl-1.9.13.jar
-jackson-databind-2.7.3.jar
+jackson-databind-2.6.5.jar
 jackson-jaxrs-1.9.13.jar
 jackson-mapper-asl-1.9.13.jar
-jackson-module-paranamer-2.7.3.jar
-jackson-module-scala_2.11-2.7.3.jar
+jackson-module-paranamer-2.6.5.jar
+jackson-module-scala_2.11-2.6.5.jar
 jackson-xc-1.9.13.jar
 janino-2.7.8.jar
 java-xmlbuilder-1.0.jar
@@ -143,7 +143,7 @@ objenesis-2.1.jar
 opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
-paranamer-2.8.jar
+paranamer-2.3.jar
 parquet-column-1.8.1.jar
 parquet-common-1.8.1.jar
 parquet-encoding-1.8.1.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 6b999538a3..16f60f29ff 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -80,14 +80,14 @@ htrace-core-3.1.0-incubating.jar
 httpclient-4.5.2.jar
 httpcore-4.4.4.jar
 ivy-2.4.0.jar
-jackson-annotations-2.7.3.jar
-jackson-core-2.7.3.jar
+jackson-annotations-2.6.5.jar
+jackson-core-2.6.5.jar
 jackson-core-asl-1.9.13.jar
-jackson-databind-2.7.3.jar
+jackson-databind-2.6.5.jar
 jackson-jaxrs-1.9.13.jar
 jackson-mapper-asl-1.9.13.jar
-jackson-module-paranamer-2.7.3.jar
-jackson-module-scala_2.11-2.7.3.jar
+jackson-module-paranamer-2.6.5.jar
+jackson-module-scala_2.11-2.6.5.jar
 jackson-xc-1.9.13.jar
 janino-2.7.8.jar
 java-xmlbuilder-1.0.jar
@@ -144,7 +144,7 @@ objenesis-2.1.jar
 opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
-paranamer-2.8.jar
+paranamer-2.3.jar
 parquet-column-1.8.1.jar
 parquet-common-1.8.1.jar
 parquet-encoding-1.8.1.jar
diff --git a/pom.xml b/pom.xml
index ce9aa9aa00..fff5560afe 100644
--- a/pom.xml
+++ b/pom.xml
@@ -160,7 +160,7 @@
     <jline.version>${scala.version}</jline.version>
     <jline.groupid>org.scala-lang</jline.groupid>
     <codehaus.jackson.version>1.9.13</codehaus.jackson.version>
-    <fasterxml.jackson.version>2.7.3</fasterxml.jackson.version>
+    <fasterxml.jackson.version>2.6.5</fasterxml.jackson.version>
     <snappy.version>1.1.2.4</snappy.version>
     <netlib.java.version>1.1.2</netlib.java.version>
     <calcite.version>1.2.0-incubating</calcite.version>
@@ -180,7 +180,6 @@
     <antlr4.version>4.5.3</antlr4.version>
     <jpam.version>1.1</jpam.version>
     <selenium.version>2.52.0</selenium.version>
-    <paranamer.version>2.8</paranamer.version>
 
     <test.java.home>${java.home}</test.java.home>
     <test.exclude.tags></test.exclude.tags>
@@ -1826,11 +1825,6 @@
         <artifactId>antlr4-runtime</artifactId>
         <version>${antlr4.version}</version>
       </dependency>
-      <dependency>
-        <groupId>com.thoughtworks.paranamer</groupId>
-        <artifactId>paranamer</artifactId>
-        <version>${paranamer.version}</version>
-      </dependency>
     </dependencies>
   </dependencyManagement>
 
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 13d21d7143..73105f881b 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -193,9 +193,6 @@ class DataFrameReader(object):
                 set, it uses the default value, ``true``.
         :param allowNumericLeadingZero: allows leading zeros in numbers (e.g. 00012). If None is
                 set, it uses the default value, ``false``.
-        :param allowNonNumericNumbers: allows using non-numeric numbers such as "NaN", "Infinity",
-                "-Infinity", "INF", "-INF", which are convertd to floating
-                point numbers, ``true``.
         :param allowBackslashEscapingAnyCharacter: allows accepting quoting of all character
                 using backslash quoting mechanism. If None is
                 set, it uses the default value, ``false``.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 2057878028..88fa5cd21d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -293,8 +293,6 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * </li>
    * <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
    * (e.g. 00012)</li>
-   * <li>`allowNonNumericNumbers` (default `true`): allows using non-numeric numbers such as "NaN",
-   * "Infinity", "-Infinity", "INF", "-INF", which are convertd to floating point numbers.</li>
    * <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
    * character using backslash quoting mechanism</li>
    * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala
index cafca32318..aeee2600a1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala
@@ -129,15 +129,13 @@ object JacksonParser extends Logging {
       case (VALUE_STRING, FloatType) =>
         // Special case handling for NaN and Infinity.
         val value = parser.getText
-        if (value.equals("NaN") ||
-          value.equals("Infinity") ||
-          value.equals("+Infinity") ||
-          value.equals("-Infinity")) {
+        val lowerCaseValue = value.toLowerCase()
+        if (lowerCaseValue.equals("nan") ||
+          lowerCaseValue.equals("infinity") ||
+          lowerCaseValue.equals("-infinity") ||
+          lowerCaseValue.equals("inf") ||
+          lowerCaseValue.equals("-inf")) {
           value.toFloat
-        } else if (value.equals("+INF") || value.equals("INF")) {
-          Float.PositiveInfinity
-        } else if (value.equals("-INF")) {
-          Float.NegativeInfinity
         } else {
           throw new SparkSQLJsonProcessingException(s"Cannot parse $value as FloatType.")
         }
@@ -148,15 +146,13 @@
       case (VALUE_STRING, DoubleType) =>
         // Special case handling for NaN and Infinity.
         val value = parser.getText
-        if (value.equals("NaN") ||
-          value.equals("Infinity") ||
-          value.equals("+Infinity") ||
-          value.equals("-Infinity")) {
+        val lowerCaseValue = value.toLowerCase()
+        if (lowerCaseValue.equals("nan") ||
+          lowerCaseValue.equals("infinity") ||
+          lowerCaseValue.equals("-infinity") ||
+          lowerCaseValue.equals("inf") ||
+          lowerCaseValue.equals("-inf")) {
           value.toDouble
-        } else if (value.equals("+INF") || value.equals("INF")) {
-          Double.PositiveInfinity
-        } else if (value.equals("-INF")) {
-          Double.NegativeInfinity
         } else {
           throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.")
         }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
index 2aab955c1e..c31dffedbd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources.json
 
 import org.apache.spark.sql.QueryTest
 import org.apache.spark.sql.test.SharedSQLContext
-import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
 
 /**
  * Test cases for various [[JSONOptions]].
  */
@@ -94,51 +93,23 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {
     assert(df.first().getLong(0) == 18)
   }
 
-  test("allowNonNumericNumbers off") {
-    // non-quoted non-numeric numbers don't work if allowNonNumericNumbers is off.
-    var testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""",
-      """{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": INF}""",
-      """{"age": +INF}""", """{"age": -INF}""")
-    testCases.foreach { str =>
-      val rdd = spark.sparkContext.parallelize(Seq(str))
-      val df = spark.read.option("allowNonNumericNumbers", "false").json(rdd)
-
-      assert(df.schema.head.name == "_corrupt_record")
-    }
-
-    // quoted non-numeric numbers should still work even allowNonNumericNumbers is off.
-    testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "+Infinity"}""",
-      """{"age": "-Infinity"}""", """{"age": "INF"}""", """{"age": "+INF"}""",
-      """{"age": "-INF"}""")
-    val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity,
-      _.isNegInfinity, _.isPosInfinity, _.isPosInfinity, _.isNegInfinity)
-    val schema = StructType(StructField("age", DoubleType, true) :: Nil)
-
-    testCases.zipWithIndex.foreach { case (str, idx) =>
-      val rdd = spark.sparkContext.parallelize(Seq(str))
-      val df = spark.read.option("allowNonNumericNumbers", "false").schema(schema).json(rdd)
-
-      assert(df.schema.head.name == "age")
-      assert(tests(idx)(df.first().getDouble(0)))
-    }
+  // The following two tests are not really working - need to look into Jackson's
+  // JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS.
+ ignore("allowNonNumericNumbers off") { + val str = """{"age": NaN}""" + val rdd = spark.sparkContext.parallelize(Seq(str)) + val df = spark.read.json(rdd) + + assert(df.schema.head.name == "_corrupt_record") } - test("allowNonNumericNumbers on") { - val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", - """{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": +INF}""", - """{"age": -INF}""", """{"age": "NaN"}""", """{"age": "Infinity"}""", - """{"age": "-Infinity"}""") - val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity, - _.isNegInfinity, _.isPosInfinity, _.isNegInfinity, _.isNaN, _.isPosInfinity, - _.isNegInfinity, _.isPosInfinity, _.isNegInfinity) - val schema = StructType(StructField("age", DoubleType, true) :: Nil) - testCases.zipWithIndex.foreach { case (str, idx) => - val rdd = spark.sparkContext.parallelize(Seq(str)) - val df = spark.read.option("allowNonNumericNumbers", "true").schema(schema).json(rdd) - - assert(df.schema.head.name == "age") - assert(tests(idx)(df.first().getDouble(0))) - } + ignore("allowNonNumericNumbers on") { + val str = """{"age": NaN}""" + val rdd = spark.sparkContext.parallelize(Seq(str)) + val df = spark.read.option("allowNonNumericNumbers", "true").json(rdd) + + assert(df.schema.head.name == "age") + assert(df.first().getDouble(0).isNaN) } test("allowBackslashEscapingAnyCharacter off") { -- GitLab