From 8d6ef895ee492b8febbaac7ab2ef2c907b48fa4a Mon Sep 17 00:00:00 2001
From: Felix Cheung <felixcheung_m@hotmail.com>
Date: Thu, 2 Mar 2017 01:02:38 -0800
Subject: [PATCH] [SPARK-18352][DOCS] wholeFile JSON update doc and programming guide

## What changes were proposed in this pull request?

Update doc for R, programming guide. Clarify default behavior for all languages.

## How was this patch tested?

Manually.

Author: Felix Cheung <felixcheung_m@hotmail.com>

Closes #17128 from felixcheung/jsonwholefiledoc.
---
 R/pkg/R/SQLContext.R                           | 10 ++++---
 docs/sql-programming-guide.md                  | 26 +++++++++++--------
 python/pyspark/sql/readwriter.py               |  4 +--
 python/pyspark/sql/streaming.py                |  4 +--
 .../apache/spark/sql/DataFrameReader.scala     |  4 +--
 .../sql/streaming/DataStreamReader.scala       |  4 +--
 6 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index e771a057e2..8354f705f6 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -332,8 +332,10 @@ setMethod("toDF", signature(x = "RDD"),
 #' Create a SparkDataFrame from a JSON file.
 #'
-#' Loads a JSON file (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
-#' ), returning the result as a SparkDataFrame
+#' Loads a JSON file, returning the result as a SparkDataFrame.
+#' By default, \href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
+#' is supported. For JSON (one record per file), set a named parameter \code{wholeFile} to
+#' \code{TRUE}.
 #' It goes through the entire dataset once to determine the schema.
 #'
 #' @param path Path of file to read. A vector of multiple paths is allowed.
@@ -346,6 +348,7 @@ setMethod("toDF", signature(x = "RDD"),
 #' sparkR.session()
 #' path <- "path/to/file.json"
 #' df <- read.json(path)
+#' df <- read.json(path, wholeFile = TRUE)
 #' df <- jsonFile(path)
 #' }
 #' @name read.json
@@ -778,6 +781,7 @@ dropTempView <- function(viewName) {
 #' @return SparkDataFrame
 #' @rdname read.df
 #' @name read.df
+#' @seealso \link{read.json}
 #' @export
 #' @examples
 #'\dontrun{
@@ -785,7 +789,7 @@ dropTempView <- function(viewName) {
 #' df1 <- read.df("path/to/file.json", source = "json")
 #' schema <- structType(structField("name", "string"),
 #'                      structField("info", "map<string,double>"))
-#' df2 <- read.df(mapTypeJsonPath, "json", schema)
+#' df2 <- read.df(mapTypeJsonPath, "json", schema, wholeFile = TRUE)
 #' df3 <- loadDF("data/test_table", "parquet", mergeSchema = "true")
 #' }
 #' @name read.df
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 2dd1ab6ef3..b077575155 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -386,8 +386,8 @@ For example:
 The [built-in DataFrames functions](api/scala/index.html#org.apache.spark.sql.functions$) provide
 common aggregations such as `count()`, `countDistinct()`, `avg()`, `max()`, `min()`, etc.
-While those functions are designed for DataFrames, Spark SQL also has type-safe versions for some of them in 
-[Scala](api/scala/index.html#org.apache.spark.sql.expressions.scalalang.typed$) and 
+While those functions are designed for DataFrames, Spark SQL also has type-safe versions for some of them in
+[Scala](api/scala/index.html#org.apache.spark.sql.expressions.scalalang.typed$) and
 [Java](api/java/org/apache/spark/sql/expressions/javalang/typed.html) to work with strongly typed Datasets.
 Moreover, users are not limited to the predefined aggregate functions and can create their own.
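As a quick illustration of the built-in aggregate functions the guide text above mentions, here is a minimal Scala sketch (not part of the patch; the dataset and column names are invented for illustration):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{avg, countDistinct, max}

object BuiltinAggSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("builtin-agg-sketch")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // Made-up data; column names are placeholders.
    val df = Seq(("Alice", 34, 5000.0), ("Bob", 41, 6200.0), ("Alice", 34, 5100.0))
      .toDF("name", "age", "salary")

    // Untyped built-in aggregates from org.apache.spark.sql.functions.
    df.agg(avg($"salary"), max($"age"), countDistinct($"name")).show()

    spark.stop()
  }
}
```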
@@ -397,7 +397,7 @@ Moreover, users are not limited to the predefined aggregate functions and can cr
 
 <div data-lang="scala" markdown="1">
 
-Users have to extend the [UserDefinedAggregateFunction](api/scala/index.html#org.apache.spark.sql.expressions.UserDefinedAggregateFunction) 
+Users have to extend the [UserDefinedAggregateFunction](api/scala/index.html#org.apache.spark.sql.expressions.UserDefinedAggregateFunction)
 abstract class to implement a custom untyped aggregate function. For example, a user-defined
 average can look like:
 
@@ -888,8 +888,9 @@ or a JSON file.
 
 Note that the file that is offered as _a json file_ is not a typical JSON file. Each
 line must contain a separate, self-contained valid JSON object. For more information, please see
-[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a
-consequence, a regular multi-line JSON file will most often fail.
+[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/).
+
+For a regular multi-line JSON file, set the `wholeFile` option to `true`.
 
 {% include_example json_dataset scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
 </div>
 
@@ -901,8 +902,9 @@ or a JSON file.
 
 Note that the file that is offered as _a json file_ is not a typical JSON file. Each
 line must contain a separate, self-contained valid JSON object. For more information, please see
-[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a
-consequence, a regular multi-line JSON file will most often fail.
+[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/).
+
+For a regular multi-line JSON file, set the `wholeFile` option to `true`.
 
 {% include_example json_dataset java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
 </div>
 
@@ -913,8 +915,9 @@ This conversion can be done using `SparkSession.read.json` on a JSON file.
 
 Note that the file that is offered as _a json file_ is not a typical JSON file. Each
 line must contain a separate, self-contained valid JSON object. For more information, please see
-[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a
-consequence, a regular multi-line JSON file will most often fail.
+[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/).
+
+For a regular multi-line JSON file, set the `wholeFile` parameter to `True`.
 
 {% include_example json_dataset python/sql/datasource.py %}
 </div>
 
@@ -926,8 +929,9 @@ files is a JSON object.
 
 Note that the file that is offered as _a json file_ is not a typical JSON file. Each
 line must contain a separate, self-contained valid JSON object. For more information, please see
-[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/). As a
-consequence, a regular multi-line JSON file will most often fail.
+[JSON Lines text format, also called newline-delimited JSON](http://jsonlines.org/).
+
+For a regular multi-line JSON file, set a named parameter `wholeFile` to `TRUE`.
 
 {% include_example json_dataset r/RSparkSQLExample.R %}
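The guide sections above all document the same behavior: JSON Lines input is the default, and the `wholeFile` option switches the reader to one-record-per-file parsing. A minimal Scala sketch of both modes, assuming a local `SparkSession` and placeholder file paths:

```scala
import org.apache.spark.sql.SparkSession

object WholeFileJsonSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("wholeFile-json-sketch")
      .master("local[*]")
      .getOrCreate()

    // Default: each input line must be a separate, self-contained JSON object.
    val jsonLinesDF = spark.read.json("path/to/people.jsonl")

    // wholeFile = true: each file is parsed as a single JSON record, so a
    // regular pretty-printed multi-line JSON file parses instead of failing.
    val multiLineDF = spark.read
      .option("wholeFile", true)
      .json("path/to/people.json")

    jsonLinesDF.printSchema()
    multiLineDF.printSchema()

    spark.stop()
  }
}
```

As the guide sections above note, the same switch is spelled `wholeFile=True` in Python and `wholeFile = TRUE` in R.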
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index ec47618e73..45fb9b7591 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -163,8 +163,8 @@ class DataFrameReader(OptionUtils):
         """
         Loads a JSON file and returns the results as a :class:`DataFrame`.
 
-        Both JSON (one record per file) and `JSON Lines <http://jsonlines.org/>`_
-        (newline-delimited JSON) are supported and can be selected with the `wholeFile` parameter.
+        `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
+        For JSON (one record per file), set the ``wholeFile`` parameter to ``True``.
 
         If the ``schema`` parameter is not specified, this function goes
         through the input once to determine the input schema.
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 7587875cb9..625fb9ba38 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -433,8 +433,8 @@ class DataStreamReader(OptionUtils):
         """
         Loads a JSON file stream and returns the results as a :class:`DataFrame`.
 
-        Both JSON (one record per file) and `JSON Lines <http://jsonlines.org/>`_
-        (newline-delimited JSON) are supported and can be selected with the `wholeFile` parameter.
+        `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
+        For JSON (one record per file), set the ``wholeFile`` parameter to ``True``.
 
         If the ``schema`` parameter is not specified, this function goes
         through the input once to determine the input schema.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 63be1e5302..41470ae6aa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -263,8 +263,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
   /**
    * Loads a JSON file and returns the results as a `DataFrame`.
    *
-   * Both JSON (one record per file) and <a href="http://jsonlines.org/">JSON Lines</a>
-   * (newline-delimited JSON) are supported and can be selected with the `wholeFile` option.
+   * <a href="http://jsonlines.org/">JSON Lines</a> (newline-delimited JSON) is supported by
+   * default. For JSON (one record per file), set the `wholeFile` option to `true`.
    *
    * This function goes through the input once to determine the input schema. If you know the
    * schema in advance, use the version that specifies the schema to avoid the extra scan.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
index 6a275281d8..aed8074a64 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -143,8 +143,8 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
   /**
    * Loads a JSON file stream and returns the results as a `DataFrame`.
    *
-   * Both JSON (one record per file) and <a href="http://jsonlines.org/">JSON Lines</a>
-   * (newline-delimited JSON) are supported and can be selected with the `wholeFile` option.
+   * <a href="http://jsonlines.org/">JSON Lines</a> (newline-delimited JSON) is supported by
+   * default. For JSON (one record per file), set the `wholeFile` option to `true`.
    *
    * This function goes through the input once to determine the input schema. If you know the
    * schema in advance, use the version that specifies the schema to avoid the extra scan.
--
GitLab
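The `DataStreamReader` documented above takes the same `wholeFile` option as the batch reader. A minimal Scala sketch of a streaming JSON source, assuming a placeholder input directory and a made-up schema (streaming file sources require an explicit schema by default):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object WholeFileJsonStreamSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("wholeFile-json-stream-sketch")
      .master("local[*]")
      .getOrCreate()

    // Invented schema; streaming readers do not infer one by default.
    val schema = StructType(Seq(
      StructField("name", StringType),
      StructField("city", StringType)))

    // Each new file dropped into the directory is parsed as one JSON record.
    val stream = spark.readStream
      .schema(schema)
      .option("wholeFile", true)
      .json("path/to/json/dir")

    val query = stream.writeStream
      .format("console")
      .start()

    query.awaitTermination()
  }
}
```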