diff --git a/data/mllib/pic_data.txt b/data/mllib/pic_data.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fcfef8cd19131f664b938c1ef18fcb98601a6bd7
--- /dev/null
+++ b/data/mllib/pic_data.txt
@@ -0,0 +1,19 @@
+0 1 1.0
+0 2 1.0
+0 3 1.0
+1 2 1.0
+1 3 1.0
+2 3 1.0
+3 4 0.1
+4 5 1.0
+4 15 1.0
+5 6 1.0
+6 7 1.0
+7 8 1.0
+8 9 1.0
+9 10 1.0
+10 11 1.0
+11 12 1.0
+12 13 1.0
+13 14 1.0
+14 15 1.0
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index dcaa3784be874f36ab4286947e98189b0efb91af..3aad4149f99db9ec81451be5cf8d609715fd727d 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -327,11 +327,17 @@ which contains the computed clustering assignments.
 import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
 import org.apache.spark.mllib.linalg.Vectors
 
-val similarities: RDD[(Long, Long, Double)] = ...
+// Load and parse the data
+val data = sc.textFile("data/mllib/pic_data.txt")
+val similarities = data.map { line =>
+  val parts = line.split(' ')
+  (parts(0).toLong, parts(1).toLong, parts(2).toDouble)
+}
 
+// Cluster the data into two classes using PowerIterationClustering
 val pic = new PowerIterationClustering()
-  .setK(3)
-  .setMaxIterations(20)
+  .setK(2)
+  .setMaxIterations(10)
 val model = pic.run(similarities)
 
 model.assignments.foreach { a =>
@@ -363,11 +369,22 @@ import scala.Tuple2;
 import scala.Tuple3;
 
 import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
 import org.apache.spark.mllib.clustering.PowerIterationClustering;
 import org.apache.spark.mllib.clustering.PowerIterationClusteringModel;
 
-JavaRDD<Tuple3<Long, Long, Double>> similarities = ...
+// Load and parse the data
+JavaRDD<String> data = sc.textFile("data/mllib/pic_data.txt");
+JavaRDD<Tuple3<Long, Long, Double>> similarities = data.map(
+  new Function<String, Tuple3<Long, Long, Double>>() {
+    public Tuple3<Long, Long, Double> call(String line) {
+      String[] parts = line.split(" ");
+      return new Tuple3<>(new Long(parts[0]), new Long(parts[1]), new Double(parts[2]));
+    }
+  }
+);
 
+// Cluster the data into two classes using PowerIterationClustering
 PowerIterationClustering pic = new PowerIterationClustering()
   .setK(2)
   .setMaxIterations(10);
@@ -383,6 +400,35 @@ PowerIterationClusteringModel sameModel = PowerIterationClusteringModel.load(sc.
 {% endhighlight %}
 </div>
 
+<div data-lang="python" markdown="1">
+
+[`PowerIterationClustering`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClustering)
+implements the PIC algorithm.
+It takes an `RDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the
+affinity matrix.
+Calling `PowerIterationClustering.train` returns a
+[`PowerIterationClusteringModel`](api/python/pyspark.mllib.html#pyspark.mllib.clustering.PowerIterationClusteringModel),
+which contains the computed clustering assignments.
+
+{% highlight python %}
+from __future__ import print_function
+from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel
+
+# Load and parse the data
+data = sc.textFile("data/mllib/pic_data.txt")
+similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))
+
+# Cluster the data into two classes using PowerIterationClustering
+model = PowerIterationClustering.train(similarities, 2, 10)
+
+model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster)))
+
+# Save and load model
+model.save(sc, "myModelPath")
+sameModel = PowerIterationClusteringModel.load(sc, "myModelPath")
+{% endhighlight %}
+</div>
+
 </div>
 
 ## Latent Dirichlet allocation (LDA)
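
The doc snippets in this patch assume an existing `SparkContext` bound to `sc`, as provided by the `spark-shell` and `pyspark` REPLs. As a rough sketch only (the script name and `appName` below are invented for illustration, not part of the patch), the new Python example can also be run as a standalone script:

    from __future__ import print_function

    from pyspark import SparkContext
    from pyspark.mllib.clustering import PowerIterationClustering

    if __name__ == "__main__":
        # Outside the REPL, a SparkContext must be created explicitly
        sc = SparkContext(appName="PythonPICExample")  # appName is an arbitrary choice

        # Parse "srcId dstId similarity" triples, as in the docs snippet
        data = sc.textFile("data/mllib/pic_data.txt")
        similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))

        # Cluster into two classes with 10 iterations, matching the docs snippet
        model = PowerIterationClustering.train(similarities, 2, 10)

        # Collect the assignments to the driver before printing
        for a in model.assignments().collect():
            print(str(a.id) + " -> " + str(a.cluster))

        sc.stop()

Saved as, say, pic_example.py, it would be launched with bin/spark-submit pic_example.py. Collecting the assignments first keeps the printed output on the driver; the foreach in the docs snippet prints from the executors instead.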