From e1ac553402ab82bbc72fd64e5943b71c16b4b37d Mon Sep 17 00:00:00 2001 From: Liwei Lin <lwlin7@gmail.com> Date: Tue, 14 Mar 2017 22:30:16 -0700 Subject: [PATCH] [SPARK-19817][SS] Make it clear that `timeZone` is a general option in DataStreamReader/Writer ## What changes were proposed in this pull request? As timezone setting can also affect partition values, it works for all formats, we should make it clear. ## How was this patch tested? N/A Author: Liwei Lin <lwlin7@gmail.com> Closes #17299 from lw-lin/timezone. --- python/pyspark/sql/readwriter.py | 8 ++--- python/pyspark/sql/streaming.py | 32 ++++++++++++++----- .../apache/spark/sql/DataFrameReader.scala | 6 ++-- .../apache/spark/sql/DataFrameWriter.scala | 6 ++-- .../sql/streaming/DataStreamReader.scala | 22 ++++++++++--- .../sql/streaming/DataStreamWriter.scala | 18 +++++++++++ 6 files changed, 70 insertions(+), 22 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 705803791d..122e17f202 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -112,7 +112,7 @@ class DataFrameReader(OptionUtils): You can set the following option(s) for reading files: * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps - in the JSON/CSV datasources or parttion values. + in the JSON/CSV datasources or partition values. If it isn't set, it uses the default value, session local timezone. """ self._jreader = self._jreader.option(key, to_str(value)) @@ -124,7 +124,7 @@ class DataFrameReader(OptionUtils): You can set the following option(s) for reading files: * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps - in the JSON/CSV datasources or parttion values. + in the JSON/CSV datasources or partition values. If it isn't set, it uses the default value, session local timezone. """ for k in options: @@ -530,7 +530,7 @@ class DataFrameWriter(OptionUtils): You can set the following option(s) for writing files: * ``timeZone``: sets the string that indicates a timezone to be used to format - timestamps in the JSON/CSV datasources or parttion values. + timestamps in the JSON/CSV datasources or partition values. If it isn't set, it uses the default value, session local timezone. """ self._jwrite = self._jwrite.option(key, to_str(value)) @@ -542,7 +542,7 @@ class DataFrameWriter(OptionUtils): You can set the following option(s) for writing files: * ``timeZone``: sets the string that indicates a timezone to be used to format - timestamps in the JSON/CSV datasources or parttion values. + timestamps in the JSON/CSV datasources or partition values. If it isn't set, it uses the default value, session local timezone. """ for k in options: diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 625fb9ba38..288cc1e4f6 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -373,6 +373,11 @@ class DataStreamReader(OptionUtils): def option(self, key, value): """Adds an input option for the underlying data source. + You can set the following option(s) for reading files: + * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps + in the JSON/CSV datasources or partition values. + If it isn't set, it uses the default value, session local timezone. + .. note:: Experimental. >>> s = spark.readStream.option("x", 1) @@ -384,6 +389,11 @@ class DataStreamReader(OptionUtils): def options(self, **options): """Adds input options for the underlying data source. + You can set the following option(s) for reading files: + * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps + in the JSON/CSV datasources or partition values. + If it isn't set, it uses the default value, session local timezone. + .. note:: Experimental. >>> s = spark.readStream.options(x="1", y=2) @@ -429,7 +439,7 @@ class DataStreamReader(OptionUtils): allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None, allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None, - timeZone=None, wholeFile=None): + wholeFile=None): """ Loads a JSON file stream and returns the results as a :class:`DataFrame`. @@ -486,8 +496,6 @@ class DataStreamReader(OptionUtils): formats follow the formats at ``java.text.SimpleDateFormat``. This applies to timestamp type. If None is set, it uses the default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSZZ``. - :param timeZone: sets the string that indicates a timezone to be used to parse timestamps. - If None is set, it uses the default value, session local timezone. :param wholeFile: parse one record, which may span multiple lines, per file. If None is set, it uses the default value, ``false``. @@ -503,7 +511,7 @@ class DataStreamReader(OptionUtils): allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero, allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter, mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat, - timestampFormat=timestampFormat, timeZone=timeZone, wholeFile=wholeFile) + timestampFormat=timestampFormat, wholeFile=wholeFile) if isinstance(path, basestring): return self._df(self._jreader.json(path)) else: @@ -561,7 +569,7 @@ class DataStreamReader(OptionUtils): comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, - maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, timeZone=None, + maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, wholeFile=None): """Loads a CSV file stream and returns the result as a :class:`DataFrame`. @@ -619,8 +627,6 @@ class DataStreamReader(OptionUtils): ``-1`` meaning unlimited length. :param mode: allows a mode for dealing with corrupt records during parsing. If None is set, it uses the default value, ``PERMISSIVE``. - :param timeZone: sets the string that indicates a timezone to be used to parse timestamps. - If None is set, it uses the default value, session local timezone. * ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted \ record, and puts the malformed string into a field configured by \ @@ -653,7 +659,7 @@ class DataStreamReader(OptionUtils): nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf, dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns, maxCharsPerColumn=maxCharsPerColumn, - maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, timeZone=timeZone, + maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, wholeFile=wholeFile) if isinstance(path, basestring): return self._df(self._jreader.csv(path)) @@ -721,6 +727,11 @@ class DataStreamWriter(object): def option(self, key, value): """Adds an output option for the underlying data source. + You can set the following option(s) for writing files: + * ``timeZone``: sets the string that indicates a timezone to be used to format + timestamps in the JSON/CSV datasources or partition values. + If it isn't set, it uses the default value, session local timezone. + .. note:: Experimental. """ self._jwrite = self._jwrite.option(key, to_str(value)) @@ -730,6 +741,11 @@ class DataStreamWriter(object): def options(self, **options): """Adds output options for the underlying data source. + You can set the following option(s) for writing files: + * ``timeZone``: sets the string that indicates a timezone to be used to format + timestamps in the JSON/CSV datasources or partition values. + If it isn't set, it uses the default value, session local timezone. + .. note:: Experimental. """ for k in options: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 309654c804..88fbfb4c92 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -73,7 +73,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * You can set the following option(s): * <ul> * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone - * to be used to parse timestamps in the JSON/CSV datasources or parttion values.</li> + * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li> * </ul> * * @since 1.4.0 @@ -110,7 +110,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * You can set the following option(s): * <ul> * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone - * to be used to parse timestamps in the JSON/CSV datasources or parttion values.</li> + * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li> * </ul> * * @since 1.4.0 @@ -126,7 +126,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * You can set the following option(s): * <ul> * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone - * to be used to parse timestamps in the JSON/CSV datasources or parttion values.</li> + * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li> * </ul> * * @since 1.4.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 608160a214..deaa800694 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -93,7 +93,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * You can set the following option(s): * <ul> * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone - * to be used to format timestamps in the JSON/CSV datasources or parttion values.</li> + * to be used to format timestamps in the JSON/CSV datasources or partition values.</li> * </ul> * * @since 1.4.0 @@ -130,7 +130,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * You can set the following option(s): * <ul> * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone - * to be used to format timestamps in the JSON/CSV datasources or parttion values.</li> + * to be used to format timestamps in the JSON/CSV datasources or partition values.</li> * </ul> * * @since 1.4.0 @@ -146,7 +146,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * You can set the following option(s): * <ul> * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone - * to be used to format timestamps in the JSON/CSV datasources or parttion values.</li> + * to be used to format timestamps in the JSON/CSV datasources or partition values.</li> * </ul> * * @since 1.4.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index aed8074a64..388ef182ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -61,6 +61,12 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo /** * Adds an input option for the underlying data source. * + * You can set the following option(s): + * <ul> + * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone + * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li> + * </ul> + * * @since 2.0.0 */ def option(key: String, value: String): DataStreamReader = { @@ -92,6 +98,12 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo /** * (Scala-specific) Adds input options for the underlying data source. * + * You can set the following option(s): + * <ul> + * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone + * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li> + * </ul> + * * @since 2.0.0 */ def options(options: scala.collection.Map[String, String]): DataStreamReader = { @@ -102,6 +114,12 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo /** * Adds input options for the underlying data source. * + * You can set the following option(s): + * <ul> + * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone + * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li> + * </ul> + * * @since 2.0.0 */ def options(options: java.util.Map[String, String]): DataStreamReader = { @@ -186,8 +204,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSZZ`): sets the string that * indicates a timestamp format. Custom date formats follow the formats at * `java.text.SimpleDateFormat`. This applies to timestamp type.</li> - * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone - * to be used to parse timestamps.</li> * <li>`wholeFile` (default `false`): parse one record, which may span multiple lines, * per file</li> * </ul> @@ -239,8 +255,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSZZ`): sets the string that * indicates a timestamp format. Custom date formats follow the formats at * `java.text.SimpleDateFormat`. This applies to timestamp type.</li> - * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone - * to be used to parse timestamps.</li> * <li>`maxColumns` (default `20480`): defines a hard limit of how many columns * a record can have.</li> * <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index c8fda8cd83..fe52013bad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -145,6 +145,12 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { /** * Adds an output option for the underlying data source. * + * You can set the following option(s): + * <ul> + * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone + * to be used to format timestamps in the JSON/CSV datasources or partition values.</li> + * </ul> + * * @since 2.0.0 */ def option(key: String, value: String): DataStreamWriter[T] = { @@ -176,6 +182,12 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { /** * (Scala-specific) Adds output options for the underlying data source. * + * You can set the following option(s): + * <ul> + * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone + * to be used to format timestamps in the JSON/CSV datasources or partition values.</li> + * </ul> + * * @since 2.0.0 */ def options(options: scala.collection.Map[String, String]): DataStreamWriter[T] = { @@ -186,6 +198,12 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { /** * Adds output options for the underlying data source. * + * You can set the following option(s): + * <ul> + * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone + * to be used to format timestamps in the JSON/CSV datasources or partition values.</li> + * </ul> + * * @since 2.0.0 */ def options(options: java.util.Map[String, String]): DataStreamWriter[T] = { -- GitLab