From 38cf8f2a50068f80350740ac28e31c8accd20634 Mon Sep 17 00:00:00 2001 From: Jurriaan Pruis <email@jurriaanpruis.nl> Date: Fri, 8 Jul 2016 11:45:41 -0700 Subject: [PATCH] [SPARK-13638][SQL] Add quoteAll option to CSV DataFrameWriter ## What changes were proposed in this pull request? Adds an quoteAll option for writing CSV which will quote all fields. See https://issues.apache.org/jira/browse/SPARK-13638 ## How was this patch tested? Added a test to verify the output columns are quoted for all fields in the Dataframe Author: Jurriaan Pruis <email@jurriaanpruis.nl> Closes #13374 from jurriaan/csv-quote-all. --- python/pyspark/sql/readwriter.py | 7 +++-- .../apache/spark/sql/DataFrameWriter.scala | 2 ++ .../datasources/csv/CSVOptions.scala | 2 ++ .../execution/datasources/csv/CSVParser.scala | 2 +- .../execution/datasources/csv/CSVSuite.scala | 26 +++++++++++++++++++ 5 files changed, 36 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 78d992e415..f7c354f513 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -633,7 +633,7 @@ class DataFrameWriter(OptionUtils): @since(2.0) def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None, - header=None, nullValue=None, escapeQuotes=None): + header=None, nullValue=None, escapeQuotes=None, quoteAll=None): """Saves the content of the :class:`DataFrame` in CSV format at the specified path. :param path: the path in any Hadoop supported file system @@ -658,6 +658,9 @@ class DataFrameWriter(OptionUtils): :param escapeQuotes: A flag indicating whether values containing quotes should always be enclosed in quotes. If None is set, it uses the default value ``true``, escaping all values containing a quote character. + :param quoteAll: A flag indicating whether all values should always be enclosed in + quotes. If None is set, it uses the default value ``false``, + only escaping values containing a quote character. :param header: writes the names of columns as the first line. If None is set, it uses the default value, ``false``. :param nullValue: sets the string representation of a null value. If None is set, it uses @@ -667,7 +670,7 @@ class DataFrameWriter(OptionUtils): """ self.mode(mode) self._set_opts(compression=compression, sep=sep, quote=quote, escape=escape, header=header, - nullValue=nullValue, escapeQuotes=escapeQuotes) + nullValue=nullValue, escapeQuotes=escapeQuotes, quoteAll=quoteAll) self._jwrite.csv(path) @since(1.5) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index f77af76d2b..12b304623d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -537,6 +537,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * <li>`escapeQuotes` (default `true`): a flag indicating whether values containing * quotes should always be enclosed in quotes. Default is to escape all values containing * a quote character.</li> + * <li>`quoteAll` (default `false`): A flag indicating whether all values should always be + * enclosed in quotes. Default is to only escape values containing a quote character.</li> * <li>`header` (default `false`): writes the names of columns as the first line.</li> * <li>`nullValue` (default empty string): sets the string representation of a null value.</li> * <li>`compression` (default `null`): compression codec to use when saving to file. This can be diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index 581eda7e09..22fb8163b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -115,6 +115,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str val maxMalformedLogPerPartition = getInt("maxMalformedLogPerPartition", 10) + val quoteAll = getBool("quoteAll", false) + val inputBufferSize = 128 val isCommentSet = this.comment != '\u0000' diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala index bf62732dd4..13ae76d498 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala @@ -78,7 +78,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten writerSettings.setNullValue(params.nullValue) writerSettings.setEmptyValue(params.nullValue) writerSettings.setSkipEmptyLines(true) - writerSettings.setQuoteAllFields(false) + writerSettings.setQuoteAllFields(params.quoteAll) writerSettings.setHeaders(headers: _*) writerSettings.setQuoteEscapingEnabled(params.escapeQuotes) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index f170065132..311f1fa8d2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -366,6 +366,32 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } } + test("save csv with quoteAll enabled") { + withTempDir { dir => + val csvDir = new File(dir, "csv").getCanonicalPath + + val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well")) + val df = spark.createDataFrame(data) + + // escapeQuotes should be true by default + df.coalesce(1).write + .format("csv") + .option("quote", "\"") + .option("escape", "\"") + .option("quoteAll", "true") + .save(csvDir) + + val results = spark.read + .format("text") + .load(csvDir) + .collect() + + val expected = "\"test \"\"quote\"\"\",\"123\",\"it \"\"works\"\"!\",\"\"\"very\"\" well\"" + + assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected))) + } + } + test("save csv with quote escaping enabled") { withTempDir { dir => val csvDir = new File(dir, "csv").getCanonicalPath -- GitLab