From 44c8bfda793b7655e2bd1da5e9915a09ed9d42ce Mon Sep 17 00:00:00 2001 From: Felix Cheung <felixcheung_m@hotmail.com> Date: Wed, 26 Oct 2016 23:06:11 -0700 Subject: [PATCH] [SQL][DOC] updating doc for JSON source to link to jsonlines.org ## What changes were proposed in this pull request? API and programming guide doc changes for Scala, Python and R. ## How was this patch tested? manual test Author: Felix Cheung <felixcheung_m@hotmail.com> Closes #15629 from felixcheung/jsondoc. --- R/pkg/R/DataFrame.R | 3 ++- R/pkg/R/SQLContext.R | 3 ++- docs/sparkr.md | 2 +- docs/sql-programming-guide.md | 22 +++++++++++-------- python/pyspark/sql/readwriter.py | 5 +++-- python/pyspark/sql/streaming.py | 3 ++- .../apache/spark/sql/DataFrameReader.scala | 14 +++++++----- .../apache/spark/sql/DataFrameWriter.scala | 3 ++- .../sql/streaming/DataStreamReader.scala | 3 ++- 9 files changed, 35 insertions(+), 23 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index be34e4b32f..1df8bbf9fe 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -761,7 +761,8 @@ setMethod("toJSON", #' Save the contents of SparkDataFrame as a JSON file #' -#' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out +#' Save the contents of a SparkDataFrame as a JSON file (\href{http://jsonlines.org/}{ +#' JSON Lines text format or newline-delimited JSON}). Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). #' #' @param x A SparkDataFrame diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 0d6a229e63..216ca51666 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -324,7 +324,8 @@ setMethod("toDF", signature(x = "RDD"), #' Create a SparkDataFrame from a JSON file. #' -#' Loads a JSON file (one object per line), returning the result as a SparkDataFrame +#' Loads a JSON file (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON} +#' ), returning the result as a SparkDataFrame #' It goes through the entire dataset once to determine the schema. #' #' @param path Path of file to read. A vector of multiple paths is allowed. diff --git a/docs/sparkr.md b/docs/sparkr.md index c1829efd18..f30bd4026f 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -135,7 +135,7 @@ sparkR.session(sparkPackages = "com.databricks:spark-avro_2.11:3.0.0") {% endhighlight %} </div> -We can see how to use data sources using an example JSON input file. Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail. +We can see how to use data sources using an example JSON input file. Note that the file that is used here is _not_ a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. For more information, please see [JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a consequence, a regular multi-line JSON file will most often fail. <div data-lang="r" markdown="1"> {% highlight r %} diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 064af41965..b9be7a7545 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -316,7 +316,7 @@ Serializable and has getters and setters for all of its fields. Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes. 
Rows are constructed by passing a list of key/value pairs as kwargs to the Row class. The keys of this list define the column names of the table, -and the types are inferred by sampling the whole datase, similar to the inference that is performed on JSON files. +and the types are inferred by sampling the whole dataset, similar to the inference that is performed on JSON files. {% include_example schema_inferring python/sql/basic.py %} </div> @@ -832,8 +832,9 @@ This conversion can be done using `SparkSession.read.json()` on either an RDD of or a JSON file. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} </div> @@ -844,8 +845,9 @@ This conversion can be done using `SparkSession.read().json()` on either an RDD or a JSON file. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %} </div> @@ -855,8 +857,9 @@ Spark SQL can automatically infer the schema of a JSON dataset and load it as a This conversion can be done using `SparkSession.read.json` on a JSON file. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. {% include_example json_dataset python/sql/datasource.py %} </div> @@ -867,8 +870,9 @@ the `read.json()` function, which loads data from a directory of JSON files wher files is a JSON object. Note that the file that is offered as _a json file_ is not a typical JSON file. Each -line must contain a separate, self-contained valid JSON object. As a consequence, -a regular multi-line JSON file will most often fail. +line must contain a separate, self-contained valid JSON object. For more information, please see +[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a +consequence, a regular multi-line JSON file will most often fail. 
{% include_example json_dataset r/RSparkSQLExample.R %} diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 91c2b17049..bc786ef95e 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -160,8 +160,9 @@ class DataFrameReader(OptionUtils): allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None): """ - Loads a JSON file (one object per line) or an RDD of Strings storing JSON objects - (one object per record) and returns the result as a :class`DataFrame`. + Loads a JSON file (`JSON Lines text format or newline-delimited JSON + <http://jsonlines.org/>`_) or an RDD of Strings storing JSON objects (one object per + record) and returns the result as a :class`DataFrame`. If the ``schema`` parameter is not specified, this function goes through the input once to determine the input schema. diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 35fc469291..559647bbab 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -640,7 +640,8 @@ class DataStreamReader(OptionUtils): mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None): """ - Loads a JSON file stream (one object per line) and returns a :class`DataFrame`. + Loads a JSON file stream (`JSON Lines text format or newline-delimited JSON + <http://jsonlines.org/>`_) and returns a :class`DataFrame`. If the ``schema`` parameter is not specified, this function goes through the input once to determine the input schema. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b7b2203cdd..a77937efd7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -239,7 +239,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]]. + * Loads a JSON file ([[http://jsonlines.org/ JSON Lines text format or newline-delimited JSON]]) + * and returns the result as a [[DataFrame]]. * See the documentation on the overloaded `json()` method with varargs for more details. * * @since 1.4.0 @@ -250,7 +251,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } /** - * Loads a JSON file (one object per line) and returns the result as a [[DataFrame]]. + * Loads a JSON file ([[http://jsonlines.org/ JSON Lines text format or newline-delimited JSON]]) + * and returns the result as a [[DataFrame]]. * * This function goes through the input once to determine the input schema. If you know the * schema in advance, use the version that specifies the schema to avoid the extra scan. @@ -295,8 +297,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def json(paths: String*): DataFrame = format("json").load(paths : _*) /** - * Loads a `JavaRDD[String]` storing JSON objects (one object per record) and - * returns the result as a [[DataFrame]]. + * Loads a `JavaRDD[String]` storing JSON objects ([[http://jsonlines.org/ JSON Lines text format + * or newline-delimited JSON]]) and returns the result as a [[DataFrame]]. * * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. 
@@ -307,8 +309,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { def json(jsonRDD: JavaRDD[String]): DataFrame = json(jsonRDD.rdd) /** - * Loads an `RDD[String]` storing JSON objects (one object per record) and - * returns the result as a [[DataFrame]]. + * Loads an `RDD[String]` storing JSON objects ([[http://jsonlines.org/ JSON Lines text format or + * newline-delimited JSON]]) and returns the result as a [[DataFrame]]. * * Unless the schema is specified using [[schema]] function, this function goes through the * input once to determine the input schema. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 5be3277651..4b5f0246b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -434,7 +434,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } /** - * Saves the content of the [[DataFrame]] in JSON format at the specified path. + * Saves the content of the [[DataFrame]] in JSON format ([[http://jsonlines.org/ JSON Lines text + * format or newline-delimited JSON]]) at the specified path. * This is equivalent to: * {{{ * format("json").save(path) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 87b7306218..40b482e4c0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -134,7 +134,8 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } /** - * Loads a JSON file stream (one object per line) and returns the result as a [[DataFrame]]. + * Loads a JSON file stream ([[http://jsonlines.org/ JSON Lines text format or newline-delimited + * JSON]]) and returns the result as a [[DataFrame]]. * * This function goes through the input once to determine the input schema. If you know the * schema in advance, use the version that specifies the schema to avoid the extra scan. -- GitLab
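
For readers of this patch, here is a minimal PySpark sketch of the behavior the updated docs describe: the input must be JSON Lines (one self-contained JSON object per line), and `write.json()` emits data in the same format. The file name `people.jsonl`, its records, and the output path `people_out` are illustrative placeholders, not part of this change.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("json-lines-example").getOrCreate()

# people.jsonl -- each line is a separate, self-contained JSON object:
#   {"name": "Alice", "age": 29}
#   {"name": "Bob", "age": 35}
# A pretty-printed multi-line JSON document would instead be parsed line by line
# and typically surface as malformed rows (a _corrupt_record column).
df = spark.read.json("people.jsonl")
df.printSchema()
df.show()

# Writing back out also produces one JSON object per line in each part file.
df.write.mode("overwrite").json("people_out")
```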
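The `DataStreamReader` hunks touch the streaming path; a corresponding streaming sketch is below. The directory `input_dir/` and the console sink are placeholders, and the schema is supplied explicitly because streaming file sources do not scan the input to infer one by default.

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.appName("json-lines-stream").getOrCreate()

# Explicit schema: streaming file sources do not infer one from existing data.
schema = StructType([
    StructField("name", StringType()),
    StructField("age", IntegerType()),
])

# Every file that appears in input_dir/ is expected to be JSON Lines,
# exactly as in the batch case.
stream_df = spark.readStream.schema(schema).json("input_dir/")

query = stream_df.writeStream.format("console").outputMode("append").start()
# query.awaitTermination()
```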