diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index a3d7eca04b616aae8aa19668e1505687f0fc55fa..a2771daabe331f64b43c9e74e5f89a5ebc9fb522 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -160,6 +160,8 @@ class DataFrameReader(object): quotes * ``allowNumericLeadingZeros`` (default ``false``): allows leading zeros in numbers \ (e.g. 00012) + * ``allowBackslashEscapingAnyCharacter`` (default ``false``): allows accepting quoting \ + of all character using backslash quoting mechanism >>> df1 = sqlContext.read.json('python/test_support/sql/people.json') >>> df1.dtypes diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 0acea95344c22e85d36c2a953c828eb961dfe2b9..6debb302d9ecf9a6c0102972d740586e95e1ab9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -258,6 +258,8 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { * </li> * <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers * (e.g. 00012)</li> + * <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all + * character using backslash quoting mechanism</li> * * @since 1.6.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala index c132ead20e7d6b5edbb6fa8dfb55779fbccc618d..f805c0092585caa4c3cf0cfb7e0c1941ea6c552b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala @@ -31,7 +31,8 @@ case class JSONOptions( allowUnquotedFieldNames: Boolean = false, allowSingleQuotes: Boolean = true, allowNumericLeadingZeros: Boolean = false, - allowNonNumericNumbers: Boolean = false) { + allowNonNumericNumbers: Boolean = false, + allowBackslashEscapingAnyCharacter: Boolean = false) { /** Sets config options on a Jackson [[JsonFactory]]. */ def setJacksonOptions(factory: JsonFactory): Unit = { @@ -40,6 +41,8 @@ case class JSONOptions( factory.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, allowSingleQuotes) factory.configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, allowNumericLeadingZeros) factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers) + factory.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, + allowBackslashEscapingAnyCharacter) } } @@ -59,6 +62,8 @@ object JSONOptions { allowNumericLeadingZeros = parameters.get("allowNumericLeadingZeros").map(_.toBoolean).getOrElse(false), allowNonNumericNumbers = - parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true) + parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true), + allowBackslashEscapingAnyCharacter = + parameters.get("allowBackslashEscapingAnyCharacter").map(_.toBoolean).getOrElse(false) ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index 4cc0a3a9585d9380086321c4c24be8a84cad2af5..1742df31bba9a05c8cb491c26939a63619adce1f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -111,4 +111,23 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { assert(df.schema.head.name == "age") assert(df.first().getDouble(0).isNaN) } + + test("allowBackslashEscapingAnyCharacter off") { + val str = """{"name": "Cazen Lee", "price": "\$10"}""" + val rdd = sqlContext.sparkContext.parallelize(Seq(str)) + val df = sqlContext.read.option("allowBackslashEscapingAnyCharacter", "false").json(rdd) + + assert(df.schema.head.name == "_corrupt_record") + } + + test("allowBackslashEscapingAnyCharacter on") { + val str = """{"name": "Cazen Lee", "price": "\$10"}""" + val rdd = sqlContext.sparkContext.parallelize(Seq(str)) + val df = sqlContext.read.option("allowBackslashEscapingAnyCharacter", "true").json(rdd) + + assert(df.schema.head.name == "name") + assert(df.schema.last.name == "price") + assert(df.first().getString(0) == "Cazen Lee") + assert(df.first().getString(1) == "$10") + } }