From ee3b1715620d48b8d22d086ddeef49ad7ff249d2 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Mon, 9 May 2016 09:58:36 -0700
Subject: [PATCH] [MINOR] [SPARKR] Update data-manipulation.R to use native csv
 reader

## What changes were proposed in this pull request?
* Since Spark has supported native csv reader, it does not necessary to use the third party ```spark-csv``` in ```examples/src/main/r/data-manipulation.R```. Meanwhile, remove all ```spark-csv``` usage in SparkR.
* Running R applications through ```sparkR``` is not supported as of Spark 2.0, so we change to use ```./bin/spark-submit``` to run the example.
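
For reference, a minimal sketch of the updated workflow, assuming the pre-2.0 `sqlContext`-style API used by this example (the CSV path and the `appName` value are illustrative placeholders):

```r
# Launch the example through spark-submit (the sparkR launcher is no longer used):
#   ./bin/spark-submit examples/src/main/r/data-manipulation.R /path/to/flights.csv

library(SparkR)

# Create the contexts; no --packages com.databricks:spark-csv is needed anymore.
sc <- sparkR.init(appName = "SparkR-data-manipulation-example")
sqlContext <- sparkRSQL.init(sc)

# Read the CSV file with the native reader by passing source = "csv".
flightsDF <- read.df(sqlContext, "/path/to/flights.csv", source = "csv", header = "true")
printSchema(flightsDF)
```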

## How was this patch tested?
Offline test.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13005 from yanboliang/r-df-examples.
---
 R/pkg/R/sparkR.R                        | 3 +--
 R/pkg/inst/tests/testthat/test_client.R | 6 ++----
 docs/sparkr.md                          | 4 ++--
 examples/src/main/r/data-manipulation.R | 7 +++----
 4 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index c187869fdf..04a8b1e1f3 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -103,8 +103,7 @@ sparkR.stop <- function() {
 #'                  list(spark.executor.memory="4g"),
 #'                  list(LD_LIBRARY_PATH="/directory of JVM libraries (libjvm.so) on workers/"),
 #'                  c("one.jar", "two.jar", "three.jar"),
-#'                  c("com.databricks:spark-avro_2.10:2.0.1",
-#'                    "com.databricks:spark-csv_2.10:1.3.0"))
+#'                  c("com.databricks:spark-avro_2.10:2.0.1"))
 #'}
 
 sparkR.init <- function(
diff --git a/R/pkg/inst/tests/testthat/test_client.R b/R/pkg/inst/tests/testthat/test_client.R
index 28276a020d..0cf25fe1db 100644
--- a/R/pkg/inst/tests/testthat/test_client.R
+++ b/R/pkg/inst/tests/testthat/test_client.R
@@ -37,9 +37,7 @@ test_that("multiple packages don't produce a warning", {
 
 test_that("sparkJars sparkPackages as character vectors", {
   args <- generateSparkSubmitArgs("", "", c("one.jar", "two.jar", "three.jar"), "",
-                                  c("com.databricks:spark-avro_2.10:2.0.1",
-                                    "com.databricks:spark-csv_2.10:1.3.0"))
+                                  c("com.databricks:spark-avro_2.10:2.0.1"))
   expect_match(args, "--jars one.jar,two.jar,three.jar")
-  expect_match(args,
-    "--packages com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.3.0")
+  expect_match(args, "--packages com.databricks:spark-avro_2.10:2.0.1")
 })
diff --git a/docs/sparkr.md b/docs/sparkr.md
index 760534ae14..9b5eaa1ec7 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -115,13 +115,13 @@ head(df)
 
 SparkR supports operating on a variety of data sources through the `DataFrame` interface. This section describes the general methods for loading and saving data using Data Sources. You can check the Spark SQL programming guide for more [specific options](sql-programming-guide.html#manually-specifying-options) that are available for the built-in data sources.
 
-The general method for creating DataFrames from data sources is `read.df`. This method takes in the `SQLContext`, the path for the file to load and the type of data source. SparkR supports reading JSON and Parquet files natively and through [Spark Packages](http://spark-packages.org/) you can find data source connectors for popular file formats like [CSV](http://spark-packages.org/package/databricks/spark-csv) and [Avro](http://spark-packages.org/package/databricks/spark-avro). These packages can either be added by
+The general method for creating DataFrames from data sources is `read.df`. This method takes in the `SQLContext`, the path for the file to load and the type of data source. SparkR supports reading JSON, CSV and Parquet files natively and through [Spark Packages](http://spark-packages.org/) you can find data source connectors for popular file formats like [Avro](http://spark-packages.org/package/databricks/spark-avro). These packages can either be added by
 specifying `--packages` with `spark-submit` or `sparkR` commands, or if creating context through `init`
 you can specify the packages with the `packages` argument.
 
 <div data-lang="r" markdown="1">
 {% highlight r %}
-sc <- sparkR.init(sparkPackages="com.databricks:spark-csv_2.11:1.0.3")
+sc <- sparkR.init(sparkPackages="com.databricks:spark-avro_2.11:2.0.1")
 sqlContext <- sparkRSQL.init(sc)
 {% endhighlight %}
 </div>
diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
index 594bf49d60..58a30135aa 100644
--- a/examples/src/main/r/data-manipulation.R
+++ b/examples/src/main/r/data-manipulation.R
@@ -20,8 +20,7 @@
 # The data set is made up of 227,496 rows x 14 columns. 
 
 # To run this example use
-# ./bin/sparkR --packages com.databricks:spark-csv_2.10:1.0.3
-#     examples/src/main/r/data-manipulation.R <path_to_csv>
+# ./bin/spark-submit examples/src/main/r/data-manipulation.R <path_to_csv>
 
 # Load SparkR library into your R session
 library(SparkR)
@@ -29,7 +28,7 @@ library(SparkR)
 args <- commandArgs(trailing = TRUE)
 
 if (length(args) != 1) {
-  print("Usage: data-manipulation.R <path-to-flights.csv")
+  print("Usage: data-manipulation.R <path-to-flights.csv>")
   print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv")
   q("no")
 }
@@ -53,7 +52,7 @@ SFO_df <- flights_df[flights_df$dest == "SFO", ]
 SFO_DF <- createDataFrame(sqlContext, SFO_df)
 
 #  Directly create a SparkDataFrame from the source data
-flightsDF <- read.df(sqlContext, flightsCsvPath, source = "com.databricks.spark.csv", header = "true")
+flightsDF <- read.df(sqlContext, flightsCsvPath, source = "csv", header = "true")
 
 # Print the schema of this SparkDataFrame
 printSchema(flightsDF)
-- 
GitLab