Skip to content
Snippets Groups Projects
Commit 02aa499d authored by hyukjinkwon's avatar hyukjinkwon Committed by Reynold Xin
Browse files

[SPARK-13509][SPARK-13507][SQL] Support for writing CSV with a single function call

https://issues.apache.org/jira/browse/SPARK-13507
https://issues.apache.org/jira/browse/SPARK-13509

## What changes were proposed in this pull request?
This PR adds the support to write CSV data directly by a single call to the given path.

Several unit tests were added for each piece of functionality.
## How was this patch tested?

This was tested with unit tests and with `dev/run_tests` for coding style checks.

Author: hyukjinkwon <gurwls223@gmail.com>
Author: Hyukjin Kwon <gurwls223@gmail.com>

Closes #11389 from HyukjinKwon/SPARK-13507-13509.
parent 916fc34f
No related branches found
No related tags found
No related merge requests found
...@@ -233,6 +233,23 @@ class DataFrameReader(object): ...@@ -233,6 +233,23 @@ class DataFrameReader(object):
paths = [paths] paths = [paths]
return self._df(self._jreader.text(self._sqlContext._sc._jvm.PythonUtils.toSeq(paths))) return self._df(self._jreader.text(self._sqlContext._sc._jvm.PythonUtils.toSeq(paths)))
@since(2.0)
def csv(self, paths):
    """Loads a CSV file and returns the result as a :class:`DataFrame`.

    This function goes through the input once to determine the input schema. To avoid going
    through the entire data once, specify the schema explicitly using :func:`schema`.

    :param paths: string, or list of strings, for input path(s).

    >>> df = sqlContext.read.csv('python/test_support/sql/ages.csv')
    >>> df.dtypes
    [('C0', 'string'), ('C1', 'string')]
    """
    # Accept a single path string by normalizing it to a one-element list.
    if isinstance(paths, basestring):
        paths = [paths]
    # Convert the Python list to a Scala Seq before delegating to the JVM reader,
    # then wrap the returned Java DataFrame in the Python DataFrame class.
    return self._df(self._jreader.csv(self._sqlContext._sc._jvm.PythonUtils.toSeq(paths)))
@since(1.5) @since(1.5)
def orc(self, path): def orc(self, path):
"""Loads an ORC file, returning the result as a :class:`DataFrame`. """Loads an ORC file, returning the result as a :class:`DataFrame`.
...@@ -448,6 +465,11 @@ class DataFrameWriter(object): ...@@ -448,6 +465,11 @@ class DataFrameWriter(object):
* ``ignore``: Silently ignore this operation if data already exists. * ``ignore``: Silently ignore this operation if data already exists.
* ``error`` (default case): Throw an exception if data already exists. * ``error`` (default case): Throw an exception if data already exists.
You can set the following JSON-specific option(s) for writing JSON files:
* ``compression`` (default ``None``): compression codec to use when saving to file.
This can be one of the known case-insensitive shorten names
(``bzip2``, ``gzip``, ``lz4``, and ``snappy``).
>>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data')) >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
""" """
self.mode(mode)._jwrite.json(path) self.mode(mode)._jwrite.json(path)
...@@ -476,11 +498,39 @@ class DataFrameWriter(object): ...@@ -476,11 +498,39 @@ class DataFrameWriter(object):
def text(self, path): def text(self, path):
"""Saves the content of the DataFrame in a text file at the specified path. """Saves the content of the DataFrame in a text file at the specified path.
:param path: the path in any Hadoop supported file system
The DataFrame must have only one column that is of string type. The DataFrame must have only one column that is of string type.
Each row becomes a new line in the output file. Each row becomes a new line in the output file.
You can set the following option(s) for writing text files:
* ``compression`` (default ``None``): compression codec to use when saving to file.
This can be one of the known case-insensitive shorten names
(``bzip2``, ``gzip``, ``lz4``, and ``snappy``).
""" """
self._jwrite.text(path) self._jwrite.text(path)
@since(2.0)
def csv(self, path, mode=None):
    """Saves the content of the :class:`DataFrame` in CSV format at the specified path.

    :param path: the path in any Hadoop supported file system
    :param mode: specifies the behavior of the save operation when data already exists.

        * ``append``: Append contents of this :class:`DataFrame` to existing data.
        * ``overwrite``: Overwrite existing data.
        * ``ignore``: Silently ignore this operation if data already exists.
        * ``error`` (default case): Throw an exception if data already exists.

    You can set the following CSV-specific option(s) for writing CSV files:
        * ``compression`` (default ``None``): compression codec to use when saving to file.
          This can be one of the known case-insensitive shortened names
          (``bzip2``, ``gzip``, ``lz4``, and ``snappy``).

    >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
    """
    # Apply the requested save mode first, then delegate the write to the JVM writer.
    self.mode(mode)._jwrite.csv(path)
@since(1.5) @since(1.5)
def orc(self, path, mode=None, partitionBy=None): def orc(self, path, mode=None, partitionBy=None):
"""Saves the content of the :class:`DataFrame` in ORC format at the specified path. """Saves the content of the :class:`DataFrame` in ORC format at the specified path.
......
Joe,20
Tom,30
Hyukjin,25
...@@ -453,6 +453,10 @@ final class DataFrameWriter private[sql](df: DataFrame) { ...@@ -453,6 +453,10 @@ final class DataFrameWriter private[sql](df: DataFrame) {
* format("json").save(path) * format("json").save(path)
* }}} * }}}
* *
* You can set the following JSON-specific option(s) for writing JSON files:
* <li>`compression` (default `null`): compression codec to use when saving to file. This can be
* one of the known case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). </li>
*
* @since 1.4.0 * @since 1.4.0
*/ */
def json(path: String): Unit = format("json").save(path) def json(path: String): Unit = format("json").save(path)
...@@ -492,10 +496,29 @@ final class DataFrameWriter private[sql](df: DataFrame) { ...@@ -492,10 +496,29 @@ final class DataFrameWriter private[sql](df: DataFrame) {
* df.write().text("/path/to/output") * df.write().text("/path/to/output")
* }}} * }}}
* *
* You can set the following option(s) for writing text files:
* <li>`compression` (default `null`): compression codec to use when saving to file. This can be
* one of the known case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). </li>
*
* @since 1.6.0 * @since 1.6.0
*/ */
def text(path: String): Unit = format("text").save(path) def text(path: String): Unit = format("text").save(path)
/**
 * Saves the content of the [[DataFrame]] in CSV format at the specified path.
 * This is equivalent to:
 * {{{
 *   format("csv").save(path)
 * }}}
 *
 * You can set the following CSV-specific option(s) for writing CSV files:
 * <ul>
 * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
 * one of the known case-insensitive shortened names (`bzip2`, `gzip`, `lz4`, and `snappy`).</li>
 * </ul>
 *
 * @param path the path in any Hadoop supported file system
 * @since 2.0.0
 */
def csv(path: String): Unit = format("csv").save(path)
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Builder pattern config options // Builder pattern config options
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
......
...@@ -48,10 +48,7 @@ private[sql] class JSONOptions( ...@@ -48,10 +48,7 @@ private[sql] class JSONOptions(
parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true) parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true)
val allowBackslashEscapingAnyCharacter = val allowBackslashEscapingAnyCharacter =
parameters.get("allowBackslashEscapingAnyCharacter").map(_.toBoolean).getOrElse(false) parameters.get("allowBackslashEscapingAnyCharacter").map(_.toBoolean).getOrElse(false)
val compressionCodec = { val compressionCodec = parameters.get("compression").map(CompressionCodecs.getCodecClassName)
val name = parameters.get("compression").orElse(parameters.get("codec"))
name.map(CompressionCodecs.getCodecClassName)
}
/** Sets config options on a Jackson [[JsonFactory]]. */ /** Sets config options on a Jackson [[JsonFactory]]. */
def setJacksonOptions(factory: JsonFactory): Unit = { def setJacksonOptions(factory: JsonFactory): Unit = {
......
...@@ -115,10 +115,7 @@ private[sql] class TextRelation( ...@@ -115,10 +115,7 @@ private[sql] class TextRelation(
/** Write path. */ /** Write path. */
override def prepareJobForWrite(job: Job): OutputWriterFactory = { override def prepareJobForWrite(job: Job): OutputWriterFactory = {
val conf = job.getConfiguration val conf = job.getConfiguration
val compressionCodec = { val compressionCodec = parameters.get("compression").map(CompressionCodecs.getCodecClassName)
val name = parameters.get("compression").orElse(parameters.get("codec"))
name.map(CompressionCodecs.getCodecClassName)
}
compressionCodec.foreach { codec => compressionCodec.foreach { codec =>
CompressionCodecs.setCodecConfiguration(conf, codec) CompressionCodecs.setCodecConfiguration(conf, codec)
} }
......
...@@ -268,9 +268,8 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { ...@@ -268,9 +268,8 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
.load(testFile(carsFile)) .load(testFile(carsFile))
cars.coalesce(1).write cars.coalesce(1).write
.format("csv")
.option("header", "true") .option("header", "true")
.save(csvDir) .csv(csvDir)
val carsCopy = sqlContext.read val carsCopy = sqlContext.read
.format("csv") .format("csv")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment