Commit 80091b8a authored by Davies Liu, committed by Cheng Lian

[SPARK-14031][SQL] speedup CSV writer

## What changes were proposed in this pull request?

Currently, we create a CSVWriter for every row, which is very expensive and memory hungry: it takes about 15 seconds to write out 1 million rows (two columns).

This PR writes the rows in batch mode, creating a CSVWriter for every 1k rows, which can write out 1 million rows in about 1 second (15X faster).
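
The idea is to keep a single univocity CsvWriter writing into an in-memory buffer and only hand the accumulated text to the output every fixed number of rows, instead of constructing a fresh writer per row. A minimal standalone sketch of that pattern (illustrative only; `BatchedCsvWriter`, `batchSize`, and `sink` are names invented for this sketch, not part of the patch):

```scala
import java.io.{ByteArrayOutputStream, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings}

// Illustrative sketch of the batched-writer idea (not the Spark code itself).
class BatchedCsvWriter(settings: CsvWriterSettings, batchSize: Int = 1024) {
  private var buffer = new ByteArrayOutputStream()
  private var writer = newWriter()
  private var pending = 0L

  private def newWriter(): CsvWriter =
    new CsvWriter(new OutputStreamWriter(buffer, StandardCharsets.UTF_8), settings)

  // Append one row; only materialize output every `batchSize` rows.
  def write(row: Seq[String], sink: String => Unit): Unit = {
    writer.writeRow(row.toArray: _*)
    pending += 1
    if (pending % batchSize == 0) flush(sink)
  }

  // Close the current writer, hand the accumulated text to the sink,
  // and start a fresh buffer/writer pair for the next batch.
  def flush(sink: String => Unit): Unit = {
    writer.close()
    val lines = buffer.toString.stripLineEnd
    if (lines.nonEmpty) sink(lines)
    buffer = new ByteArrayOutputStream()
    writer = newWriter()
  }
}
```

Here `sink` stands in for whatever actually persists a chunk of CSV text (in the patch, Hadoop's record writer); the caller is responsible for one final flush when it finishes, which is why the patch also calls flush() from close().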

## How was this patch tested?

Benchmarked manually.
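
A rough sketch of this kind of manual benchmark (not the exact script behind the numbers above; the output path and column names are arbitrary):

```scala
import org.apache.spark.sql.SparkSession

object CsvWriteBenchmark {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("csv-write-benchmark")
      .getOrCreate()
    import spark.implicits._

    // 1 million rows with two columns, as in the numbers quoted above.
    val df = spark.range(1000000L).select($"id", ($"id" * 2).as("doubled"))

    val start = System.nanoTime()
    df.write.mode("overwrite").csv("/tmp/csv_write_bench")  // arbitrary path
    val elapsedSec = (System.nanoTime() - start) / 1e9
    println(f"Wrote 1M rows in $elapsedSec%.2f s")

    spark.stop()
  }
}
```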

Author: Davies Liu <davies@databricks.com>

Closes #13229 from davies/csv_writer.
parent dafcb05c
@@ -76,17 +76,26 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten
   writerSettings.setQuoteAllFields(false)
   writerSettings.setHeaders(headers: _*)
 
-  def writeRow(row: Seq[String], includeHeader: Boolean): String = {
-    val buffer = new ByteArrayOutputStream()
-    val outputWriter = new OutputStreamWriter(buffer, StandardCharsets.UTF_8)
-    val writer = new CsvWriter(outputWriter, writerSettings)
+  private var buffer = new ByteArrayOutputStream()
+  private var writer = new CsvWriter(
+    new OutputStreamWriter(buffer, StandardCharsets.UTF_8),
+    writerSettings)
 
+  def writeRow(row: Seq[String], includeHeader: Boolean): Unit = {
     if (includeHeader) {
       writer.writeHeaders()
     }
     writer.writeRow(row.toArray: _*)
+  }
 
+  def flush(): String = {
     writer.close()
-    buffer.toString.stripLineEnd
+    val lines = buffer.toString.stripLineEnd
+    buffer = new ByteArrayOutputStream()
+    writer = new CsvWriter(
+      new OutputStreamWriter(buffer, StandardCharsets.UTF_8),
+      writerSettings)
+    lines
   }
 }

@@ -176,8 +176,8 @@ private[sql] class CsvOutputWriter(
     }.getRecordWriter(context)
   }
 
-  private var firstRow: Boolean = params.headerFlag
-
+  private val FLUSH_BATCH_SIZE = 1024L
+  private var records: Long = 0L
   private val csvWriter = new LineCsvWriter(params, dataSchema.fieldNames.toSeq)
 
   private def rowToString(row: Seq[Any]): Seq[String] = row.map { field =>
@@ -191,16 +191,23 @@ private[sql] class CsvOutputWriter(
   override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal")
 
   override protected[sql] def writeInternal(row: InternalRow): Unit = {
-    // TODO: Instead of converting and writing every row, we should use the univocity buffer
-    val resultString = csvWriter.writeRow(rowToString(row.toSeq(dataSchema)), firstRow)
-    if (firstRow) {
-      firstRow = false
+    csvWriter.writeRow(rowToString(row.toSeq(dataSchema)), records == 0L && params.headerFlag)
+    records += 1
+    if (records % FLUSH_BATCH_SIZE == 0) {
+      flush()
+    }
+  }
+
+  private def flush(): Unit = {
+    val lines = csvWriter.flush()
+    if (lines.nonEmpty) {
+      text.set(lines)
+      recordWriter.write(NullWritable.get(), text)
     }
-    text.set(resultString)
-    recordWriter.write(NullWritable.get(), text)
   }
 
   override def close(): Unit = {
+    flush()
     recordWriter.close(context)
   }
 }