Commit 635888cb authored by Sean Owen, committed by Xiangrui Meng

SPARK-2363. Clean MLlib's sample data files

(Just made a PR for this; mengxr was the original reporter.)

MLlib has sample data under several folders:
1) data/mllib
2) data/
3) mllib/data/*
Per previous discussion with Matei Zaharia, we want to put them under `data/mllib` and clean outdated files.

Author: Sean Owen <sowen@cloudera.com>

Closes #1394 from srowen/SPARK-2363 and squashes the following commits:

54313dd [Sean Owen] Move ML example data from /mllib/data/ and /data/ into /data/mllib/
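
For reference, a minimal sketch of loading one of the relocated files from its new `data/mllib/` home (not part of this commit; assumes a `spark-shell` session launched from the Spark checkout root, which provides `sc`):

{% highlight scala %}
// After this change, the docs' sample-data paths resolve relative to the
// checkout root under data/mllib/, e.g. the LIBSVM sample file:
import org.apache.spark.mllib.util.MLUtils

val examples = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
println(s"loaded ${examples.count()} labeled points")
{% endhighlight %}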
parent 4c8be64e
Showing 16 additions and 16 deletions
File moved
File moved
File moved
File moved
File moved
File moved
File moved
File moved
@@ -46,7 +46,7 @@ import org.apache.spark.bagel.Bagel._
 Next, we load a sample graph from a text file as a distributed dataset and package it into `PRVertex` objects. We also cache the distributed dataset because Bagel will use it multiple times and we'd like to avoid recomputing it.
 {% highlight scala %}
-val input = sc.textFile("data/pagerank_data.txt")
+val input = sc.textFile("data/mllib/pagerank_data.txt")
 val numVerts = input.count()
@@ -193,7 +193,7 @@ import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
-val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt")
+val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 {% endhighlight %}
 </div>
@@ -207,7 +207,7 @@ import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.api.java.JavaRDD;
 JavaRDD<LabeledPoint> examples =
-  MLUtils.loadLibSVMFile(jsc.sc(), "mllib/data/sample_libsvm_data.txt").toJavaRDD();
+  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
 {% endhighlight %}
 </div>
@@ -218,7 +218,7 @@ examples stored in LIBSVM format.
 {% highlight python %}
 from pyspark.mllib.util import MLUtils
-examples = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt")
+examples = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 {% endhighlight %}
 </div>
 </div>
@@ -51,7 +51,7 @@ import org.apache.spark.mllib.clustering.KMeans
 import org.apache.spark.mllib.linalg.Vectors
 // Load and parse the data
-val data = sc.textFile("data/kmeans_data.txt")
+val data = sc.textFile("data/mllib/kmeans_data.txt")
 val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
 // Cluster the data into two classes using KMeans
@@ -86,7 +86,7 @@ from numpy import array
 from math import sqrt
 # Load and parse the data
-data = sc.textFile("data/kmeans_data.txt")
+data = sc.textFile("data/mllib/kmeans_data.txt")
 parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
 # Build the model (cluster the data)
@@ -58,7 +58,7 @@ import org.apache.spark.mllib.recommendation.ALS
 import org.apache.spark.mllib.recommendation.Rating
 // Load and parse the data
-val data = sc.textFile("mllib/data/als/test.data")
+val data = sc.textFile("data/mllib/als/test.data")
 val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
   Rating(user.toInt, item.toInt, rate.toDouble)
 })
@@ -112,7 +112,7 @@ from pyspark.mllib.recommendation import ALS
 from numpy import array
 # Load and parse the data
-data = sc.textFile("mllib/data/als/test.data")
+data = sc.textFile("data/mllib/als/test.data")
 ratings = data.map(lambda line: array([float(x) for x in line.split(',')]))
 # Build the recommendation model using Alternating Least Squares
@@ -122,7 +122,7 @@ import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.impurity.Gini
 // Load and parse the data file
-val data = sc.textFile("mllib/data/sample_tree_data.csv")
+val data = sc.textFile("data/mllib/sample_tree_data.csv")
 val parsedData = data.map { line =>
   val parts = line.split(',').map(_.toDouble)
   LabeledPoint(parts(0), Vectors.dense(parts.tail))
@@ -161,7 +161,7 @@ import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.impurity.Variance
 // Load and parse the data file
-val data = sc.textFile("mllib/data/sample_tree_data.csv")
+val data = sc.textFile("data/mllib/sample_tree_data.csv")
 val parsedData = data.map { line =>
   val parts = line.split(',').map(_.toDouble)
   LabeledPoint(parts(0), Vectors.dense(parts.tail))
@@ -187,7 +187,7 @@ import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.MLUtils
 // Load training data in LIBSVM format.
-val data = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt")
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 // Split data into training (60%) and test (40%).
 val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
@@ -259,7 +259,7 @@ def parsePoint(line):
     values = [float(x) for x in line.split(' ')]
     return LabeledPoint(values[0], values[1:])
-data = sc.textFile("mllib/data/sample_svm_data.txt")
+data = sc.textFile("data/mllib/sample_svm_data.txt")
 parsedData = data.map(parsePoint)
 # Build the model
@@ -309,7 +309,7 @@ import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.linalg.Vectors
 // Load and parse the data
-val data = sc.textFile("mllib/data/ridge-data/lpsa.data")
+val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
 val parsedData = data.map { line =>
   val parts = line.split(',')
   LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
@@ -356,7 +356,7 @@ def parsePoint(line):
     values = [float(x) for x in line.replace(',', ' ').split(' ')]
     return LabeledPoint(values[0], values[1:])
-data = sc.textFile("mllib/data/ridge-data/lpsa.data")
+data = sc.textFile("data/mllib/ridge-data/lpsa.data")
 parsedData = data.map(parsePoint)
 # Build the model
@@ -40,7 +40,7 @@ import org.apache.spark.mllib.classification.NaiveBayes
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
-val data = sc.textFile("mllib/data/sample_naive_bayes_data.txt")
+val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
 val parsedData = data.map { line =>
   val parts = line.split(',')
   LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
@@ -214,7 +214,7 @@ import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.mllib.classification.LogisticRegressionModel
-val data = MLUtils.loadLibSVMFile(sc, "mllib/data/sample_libsvm_data.txt")
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 val numFeatures = data.take(1)(0).features.size
 // Split data into training (60%) and test (40%).