Skip to content
Snippets Groups Projects
Commit a0ff6d16 authored by Yanbo Liang's avatar Yanbo Liang Committed by Joseph K. Bradley
Browse files

[SPARK-11978][ML] Move dataset_example.py to examples/ml and rename to dataframe_example.py

Since ```Dataset``` has a new meaning in Spark 1.6, we should rename it to avoid confusion.
#9873 finished the work of Scala example, here we focus on the Python one.
Move dataset_example.py to ```examples/ml``` and rename to ```dataframe_example.py```.
Additionally, this fixes minor issues that were missed in #9873.
cc mengxr

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #9957 from yanboliang/SPARK-11978.
parent aea676ca
No related branches found
No related tags found
No related merge requests found
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
# #
""" """
An example of how to use DataFrame as a dataset for ML. Run with:: An example of how to use DataFrame for ML. Run with::
bin/spark-submit examples/src/main/python/mllib/dataset_example.py bin/spark-submit examples/src/main/python/ml/dataframe_example.py <input>
""" """
from __future__ import print_function from __future__ import print_function
...@@ -28,36 +28,48 @@ import shutil ...@@ -28,36 +28,48 @@ import shutil
from pyspark import SparkContext from pyspark import SparkContext
from pyspark.sql import SQLContext from pyspark.sql import SQLContext
from pyspark.mllib.util import MLUtils
from pyspark.mllib.stat import Statistics from pyspark.mllib.stat import Statistics
def summarize(dataset):
    """Print a short statistical summary of a labeled dataset.

    Prints the dataset's JSON schema, the mean of the ``label`` column,
    and the per-column means of the ``features`` vectors.

    :param dataset: distributed dataset whose rows expose ``.label``
        (numeric) and ``.features`` (vector) attributes.
        # NOTE(review): assumes a Spark SchemaRDD/DataFrame-like object
        # with .schema(), .map(), and RDD .mean() -- confirm against caller.
    """
    print("schema: %s" % dataset.schema().json())
    # Mean of the numeric label column.
    labels = dataset.map(lambda r: r.label)
    print("label average: %f" % labels.mean())
    # Column-wise statistics over the feature vectors.
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("features average: %r" % summary.mean())
if __name__ == "__main__":
    # Accept at most one positional argument: the LIBSVM input path.
    if len(sys.argv) > 2:
        print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="DataFrameExample")
    sqlContext = SQLContext(sc)
    if len(sys.argv) == 2:
        input = sys.argv[1]
    else:
        # Default sample dataset shipped with the Spark distribution.
        input = "data/mllib/sample_libsvm_data.txt"

    # Load input data as a DataFrame using the "libsvm" data source.
    print("Loading LIBSVM file with UDT from " + input + ".")
    df = sqlContext.read.format("libsvm").load(input).cache()
    print("Schema from LIBSVM:")
    df.printSchema()
    print("Loaded training data as a DataFrame with " +
          str(df.count()) + " records.")

    # Show statistical summary of labels.
    labelSummary = df.describe("label")
    labelSummary.show()

    # Convert features column to an RDD of vectors and compute column stats.
    features = df.select("features").map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("Selected features column with average values:\n" +
          str(summary.mean()))

    # Save the records to a temporary Parquet file; the temp file is
    # unlinked first so the path is free for Spark to create a directory.
    tempdir = tempfile.NamedTemporaryFile(delete=False).name
    os.unlink(tempdir)
    print("Saving to " + tempdir + " as Parquet file.")
    df.write.parquet(tempdir)

    # Load the records back and verify the round-tripped schema.
    print("Loading Parquet file with UDT from " + tempdir)
    newDF = sqlContext.read.parquet(tempdir)
    print("Schema from Parquet:")
    newDF.printSchema()

    # Clean up the temporary output and shut down the SparkContext.
    shutil.rmtree(tempdir)
    sc.stop()
...@@ -44,10 +44,10 @@ object DataFrameExample { ...@@ -44,10 +44,10 @@ object DataFrameExample {
def main(args: Array[String]) { def main(args: Array[String]) {
val defaultParams = Params() val defaultParams = Params()
val parser = new OptionParser[Params]("DatasetExample") { val parser = new OptionParser[Params]("DataFrameExample") {
head("Dataset: an example app using DataFrame as a Dataset for ML.") head("DataFrameExample: an example app using DataFrame for ML.")
opt[String]("input") opt[String]("input")
.text(s"input path to dataset") .text(s"input path to dataframe")
.action((x, c) => c.copy(input = x)) .action((x, c) => c.copy(input = x))
checkConfig { params => checkConfig { params =>
success success
...@@ -88,7 +88,7 @@ object DataFrameExample { ...@@ -88,7 +88,7 @@ object DataFrameExample {
// Save the records in a parquet file. // Save the records in a parquet file.
val tmpDir = Files.createTempDir() val tmpDir = Files.createTempDir()
tmpDir.deleteOnExit() tmpDir.deleteOnExit()
val outputDir = new File(tmpDir, "dataset").toString val outputDir = new File(tmpDir, "dataframe").toString
println(s"Saving to $outputDir as Parquet file.") println(s"Saving to $outputDir as Parquet file.")
df.write.parquet(outputDir) df.write.parquet(outputDir)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment