From 7ab951885fd34aa8184b70a3a39b865a239e5052 Mon Sep 17 00:00:00 2001 From: Jen-Ming Chung <jenmingisme@gmail.com> Date: Thu, 17 Aug 2017 15:59:45 -0700 Subject: [PATCH] [SPARK-21677][SQL] json_tuple throws NullPointException when column is null as string type ## What changes were proposed in this pull request? ``` scala scala> Seq(("""{"Hyukjin": 224, "John": 1225}""")).toDS.selectExpr("json_tuple(value, trim(null))").show() ... java.lang.NullPointerException at ... ``` Currently a `null` field name causes a `NullPointerException`. Since a null field name cannot match any field name in the JSON, we simply output null as its column value. This PR achieves it by wrapping the evaluated foldable field names in `Option`, so that a null field name becomes `None` instead of having `toString` called on it. ## How was this patch tested? Added unit test. Author: Jen-Ming Chung <jenmingisme@gmail.com> Closes #18930 from jmchung/SPARK-21677. --- .../expressions/jsonExpressions.scala | 8 ++--- .../expressions/JsonExpressionsSuite.scala | 10 ++++++ .../sql-tests/inputs/json-functions.sql | 6 ++++ .../sql-tests/results/json-functions.sql.out | 34 ++++++++++++++++++- 4 files changed, 53 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 17b605438d..c3757373a3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -362,9 +362,9 @@ case class JsonTuple(children: Seq[Expression]) @transient private lazy val fieldExpressions: Seq[Expression] = children.tail // eagerly evaluate any foldable the field names - @transient private lazy val foldableFieldNames: IndexedSeq[String] = { + @transient private lazy val foldableFieldNames: IndexedSeq[Option[String]] = { fieldExpressions.map { - case expr if 
expr.foldable => expr.eval().asInstanceOf[UTF8String].toString + case expr if expr.foldable => Option(expr.eval()).map(_.asInstanceOf[UTF8String].toString) case _ => null }.toIndexedSeq } @@ -417,7 +417,7 @@ case class JsonTuple(children: Seq[Expression]) val fieldNames = if (constantFields == fieldExpressions.length) { // typically the user will provide the field names as foldable expressions // so we can use the cached copy - foldableFieldNames + foldableFieldNames.map(_.orNull) } else if (constantFields == 0) { // none are foldable so all field names need to be evaluated from the input row fieldExpressions.map(_.eval(input).asInstanceOf[UTF8String].toString) @@ -426,7 +426,7 @@ case class JsonTuple(children: Seq[Expression]) // prefer the cached copy when available foldableFieldNames.zip(fieldExpressions).map { case (null, expr) => expr.eval(input).asInstanceOf[UTF8String].toString - case (fieldName, _) => fieldName + case (fieldName, _) => fieldName.orNull } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index f892e80204..1cd2b4fc18 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -363,6 +363,16 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { InternalRow(UTF8String.fromString("b\nc"))) } + test("SPARK-21677: json_tuple throws NullPointException when column is null as string type") { + checkJsonTuple( + JsonTuple(Literal("""{"f1": 1, "f2": 2}""") :: + NonFoldableLiteral("f1") :: + NonFoldableLiteral("cast(NULL AS STRING)") :: + NonFoldableLiteral("f2") :: + Nil), + InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("2"))) + } + val gmtId = Option(DateTimeUtils.TimeZoneGMT.getID) 
test("from_json") { diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql index b3cc2cea51..5a46fb4321 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql @@ -20,3 +20,9 @@ select from_json('{"a":1}', 'a InvalidType'); select from_json('{"a":1}', 'a INT', named_struct('mode', 'PERMISSIVE')); select from_json('{"a":1}', 'a INT', map('mode', 1)); select from_json(); +-- json_tuple +SELECT json_tuple('{"a" : 1, "b" : 2}', CAST(NULL AS STRING), 'b', CAST(NULL AS STRING), 'a'); +CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a'); +SELECT json_tuple(jsonField, 'b', CAST(NULL AS STRING), a) FROM jsonTable; +-- Clean up +DROP VIEW IF EXISTS jsonTable; diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 22da20d9a9..ae21d00116 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 17 +-- Number of queries: 21 -- !query 0 @@ -178,3 +178,35 @@ struct<> -- !query 16 output org.apache.spark.sql.AnalysisException Invalid number of arguments for function from_json; line 1 pos 7 + + +-- !query 17 +SELECT json_tuple('{"a" : 1, "b" : 2}', CAST(NULL AS STRING), 'b', CAST(NULL AS STRING), 'a') +-- !query 17 schema +struct<c0:string,c1:string,c2:string,c3:string> +-- !query 17 output +NULL 2 NULL 1 + + +-- !query 18 +CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a') +-- !query 18 schema +struct<> +-- !query 18 output + + + +-- !query 19 +SELECT json_tuple(jsonField, 'b', CAST(NULL AS STRING), a) FROM jsonTable +-- !query 19 
schema +struct<c0:string,c1:string,c2:string> +-- !query 19 output +2 NULL 1 + + +-- !query 20 +DROP VIEW IF EXISTS jsonTable +-- !query 20 schema +struct<> +-- !query 20 output + -- GitLab