From b8410ff9ce8cef7159a7364272e4c4234c5b474f Mon Sep 17 00:00:00 2001
From: Cazen <Cazen@korea.com>
Date: Sun, 3 Jan 2016 17:01:19 -0800
Subject: [PATCH] [SPARK-12537][SQL] Add option to accept quoting of all
 character backslash quoting mechanism

We can provides the option to choose JSON parser can be enabled to accept quoting of all character or not.

Author: Cazen <Cazen@korea.com>
Author: Cazen Lee <cazen.lee@samsung.com>
Author: Cazen Lee <Cazen@korea.com>
Author: cazen.lee <cazen.lee@samsung.com>

Closes #10497 from Cazen/master.
---
 python/pyspark/sql/readwriter.py              |  2 ++
 .../apache/spark/sql/DataFrameReader.scala    |  2 ++
 .../datasources/json/JSONOptions.scala        |  9 +++++++--
 .../json/JsonParsingOptionsSuite.scala        | 19 +++++++++++++++++++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index a3d7eca04b..a2771daabe 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -160,6 +160,8 @@ class DataFrameReader(object):
                 quotes
             * ``allowNumericLeadingZeros`` (default ``false``): allows leading zeros in numbers \
                 (e.g. 00012)
+            * ``allowBackslashEscapingAnyCharacter`` (default ``false``): allows accepting quoting \
+                of all character using backslash quoting mechanism
 
         >>> df1 = sqlContext.read.json('python/test_support/sql/people.json')
         >>> df1.dtypes
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 0acea95344..6debb302d9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -258,6 +258,8 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging {
    * </li>
    * <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
    * (e.g. 00012)</li>
+   * <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
+   * character using backslash quoting mechanism</li>
    *
    * @since 1.6.0
    */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala
index c132ead20e..f805c00925 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala
@@ -31,7 +31,8 @@ case class JSONOptions(
     allowUnquotedFieldNames: Boolean = false,
     allowSingleQuotes: Boolean = true,
     allowNumericLeadingZeros: Boolean = false,
-    allowNonNumericNumbers: Boolean = false) {
+    allowNonNumericNumbers: Boolean = false,
+    allowBackslashEscapingAnyCharacter: Boolean = false) {
 
   /** Sets config options on a Jackson [[JsonFactory]]. */
   def setJacksonOptions(factory: JsonFactory): Unit = {
@@ -40,6 +41,8 @@ case class JSONOptions(
     factory.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, allowSingleQuotes)
     factory.configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, allowNumericLeadingZeros)
     factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers)
+    factory.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
+      allowBackslashEscapingAnyCharacter)
   }
 }
 
@@ -59,6 +62,8 @@ object JSONOptions {
     allowNumericLeadingZeros =
       parameters.get("allowNumericLeadingZeros").map(_.toBoolean).getOrElse(false),
     allowNonNumericNumbers =
-      parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true)
+      parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true),
+    allowBackslashEscapingAnyCharacter =
+      parameters.get("allowBackslashEscapingAnyCharacter").map(_.toBoolean).getOrElse(false)
   )
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
index 4cc0a3a958..1742df31bb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
@@ -111,4 +111,23 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {
     assert(df.schema.head.name == "age")
     assert(df.first().getDouble(0).isNaN)
   }
+
+  test("allowBackslashEscapingAnyCharacter off") {
+    val str = """{"name": "Cazen Lee", "price": "\$10"}"""
+    val rdd = sqlContext.sparkContext.parallelize(Seq(str))
+    val df = sqlContext.read.option("allowBackslashEscapingAnyCharacter", "false").json(rdd)
+
+    assert(df.schema.head.name == "_corrupt_record")
+  }
+
+  test("allowBackslashEscapingAnyCharacter on") {
+    val str = """{"name": "Cazen Lee", "price": "\$10"}"""
+    val rdd = sqlContext.sparkContext.parallelize(Seq(str))
+    val df = sqlContext.read.option("allowBackslashEscapingAnyCharacter", "true").json(rdd)
+
+    assert(df.schema.head.name == "name")
+    assert(df.schema.last.name == "price")
+    assert(df.first().getString(0) == "Cazen Lee")
+    assert(df.first().getString(1) == "$10")
+  }
 }
-- 
GitLab