Skip to content
Snippets Groups Projects
Commit 8038da23 authored by Ankur Dave's avatar Ankur Dave
Browse files

Merge pull request #2 from jegonzal/GraphXCCIssue

Improving documentation and identifying potential bug in CC calculation.
parents 97cd27e3 80e4d98d
No related branches found
No related tags found
No related merge requests found
...@@ -84,7 +84,8 @@ import org.apache.spark.graphx._ ...@@ -84,7 +84,8 @@ import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
{% endhighlight %} {% endhighlight %}
If you are not using the Spark shell you will also need a Spark context. If you are not using the Spark shell you will also need a `SparkContext`. To learn more about
getting started with Spark refer to the [Spark Quick Start Guide](quick-start.html).
# The Property Graph # The Property Graph
<a name="property_graph"></a> <a name="property_graph"></a>
...@@ -190,7 +191,7 @@ and `graph.edges` members respectively. ...@@ -190,7 +191,7 @@ and `graph.edges` members respectively.
{% highlight scala %} {% highlight scala %}
val graph: Graph[(String, String), String] // Constructed from above val graph: Graph[(String, String), String] // Constructed from above
// Count all users which are postdocs // Count all users which are postdocs
graph.vertices.filter { case (id, (name, pos)) => pos == "postdoc"}.count graph.vertices.filter { case (id, (name, pos)) => pos == "postdoc" }.count
// Count all the edges where src > dst // Count all the edges where src > dst
graph.edges.filter(e => e.srcId > e.dstId).count graph.edges.filter(e => e.srcId > e.dstId).count
{% endhighlight %} {% endhighlight %}
...@@ -258,8 +259,10 @@ val graph: Graph[(String, String), String] ...@@ -258,8 +259,10 @@ val graph: Graph[(String, String), String]
val indDegrees: VertexRDD[Int] = graph.inDegrees val indDegrees: VertexRDD[Int] = graph.inDegrees
{% endhighlight %} {% endhighlight %}
The reason for differentiating between core graph operations and GraphOps is to be able to support The reason for differentiating between core graph operations and [`GraphOps`][GraphOps] is to be
various graph representations in the future. able to support different graph representations in the future. Each graph representation must
provide implementations of the core operations and reuse many of the useful operations defined in
[`GraphOps`][GraphOps].
## Property Operators ## Property Operators
...@@ -334,14 +337,32 @@ interest or eliminate broken links. For example in the following code we remove ...@@ -334,14 +337,32 @@ interest or eliminate broken links. For example in the following code we remove
[Graph.subgraph]: api/graphx/index.html#org.apache.spark.graphx.Graph@subgraph((EdgeTriplet[VD,ED])⇒Boolean,(VertexID,VD)⇒Boolean):Graph[VD,ED] [Graph.subgraph]: api/graphx/index.html#org.apache.spark.graphx.Graph@subgraph((EdgeTriplet[VD,ED])⇒Boolean,(VertexID,VD)⇒Boolean):Graph[VD,ED]
{% highlight scala %} {% highlight scala %}
val users: RDD[(VertexId, (String, String))] // Create an RDD for the vertices
val edges: RDD[Edge[String]] val users: RDD[(VertexID, (String, String))] =
sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
(5L, ("franklin", "prof")), (2L, ("istoica", "prof")),
(4L, ("peter", "student"))))
// Create an RDD for edges
val relationships: RDD[Edge[String]] =
sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"),
Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"),
Edge(4L, 0L, "student"), Edge(5L, 0L, "colleague")))
// Define a default user in case there are relationship with missing user // Define a default user in case there are relationship with missing user
val defaultUser = ("John Doe", "Missing") val defaultUser = ("John Doe", "Missing")
// Build the initial Graph // Build the initial Graph
val graph = Graph(users, relationships, defaultUser) val graph = Graph(users, relationships, defaultUser)
// Notice that there is a user 0 (for which we have no information) connecting users
// 4 (peter) and 5 (franklin).
graph.triplets.map(
triplet => triplet.srcAttr._1 + " is the " + triplet.attr + " of " + triplet.dstAttr._1
).collect.foreach(println(_))
// Remove missing vertices as well as the edges to connected to them // Remove missing vertices as well as the edges to connected to them
val validGraph = graph.subgraph(vpred = (id, attr) => attr._2 != "Missing") val validGraph = graph.subgraph(vpred = (id, attr) => attr._2 != "Missing")
// The valid subgraph will disconnect users 4 and 5 by removing user 0
validGraph.vertices.collect.foreach(println(_))
validGraph.triplets.map(
triplet => triplet.srcAttr._1 + " is the " + triplet.attr + " of " + triplet.dstAttr._1
).collect.foreach(println(_))
{% endhighlight %} {% endhighlight %}
> Note in the above example only the vertex predicate is provided. The `subgraph` operator defaults > Note in the above example only the vertex predicate is provided. The `subgraph` operator defaults
......
...@@ -325,8 +325,8 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) { ...@@ -325,8 +325,8 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) {
* *
* @see [[org.apache.spark.graphx.lib.ConnectedComponents]] * @see [[org.apache.spark.graphx.lib.ConnectedComponents]]
*/ */
def connectedComponents(): Graph[VertexID, ED] = { def connectedComponents(undirected: Boolean = true): Graph[VertexID, ED] = {
ConnectedComponents.run(graph) ConnectedComponents.run(graph, undirected)
} }
/** /**
......
...@@ -14,26 +14,42 @@ object ConnectedComponents { ...@@ -14,26 +14,42 @@ object ConnectedComponents {
* @tparam ED the edge attribute type (preserved in the computation) * @tparam ED the edge attribute type (preserved in the computation)
* *
* @param graph the graph for which to compute the connected components * @param graph the graph for which to compute the connected components
* @param undirected compute reachability ignoring edge direction.
* *
* @return a graph with vertex attributes containing the smallest vertex in each * @return a graph with vertex attributes containing the smallest vertex in each
* connected component * connected component
*/ */
def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Graph[VertexID, ED] = { def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], undirected: Boolean = true):
Graph[VertexID, ED] = {
val ccGraph = graph.mapVertices { case (vid, _) => vid } val ccGraph = graph.mapVertices { case (vid, _) => vid }
if (undirected) {
def sendMessage(edge: EdgeTriplet[VertexID, ED]) = { def sendMessage(edge: EdgeTriplet[VertexID, ED]) = {
if (edge.srcAttr < edge.dstAttr) { if (edge.srcAttr < edge.dstAttr) {
Iterator((edge.dstId, edge.srcAttr)) Iterator((edge.dstId, edge.srcAttr))
} else if (edge.srcAttr > edge.dstAttr) { } else if (edge.srcAttr > edge.dstAttr) {
Iterator((edge.srcId, edge.dstAttr)) Iterator((edge.srcId, edge.dstAttr))
} else { } else {
Iterator.empty Iterator.empty
}
}
val initialMessage = Long.MaxValue
Pregel(ccGraph, initialMessage, activeDirection = EdgeDirection.Both)(
vprog = (id, attr, msg) => math.min(attr, msg),
sendMsg = sendMessage,
mergeMsg = (a, b) => math.min(a, b))
} else {
def sendMessage(edge: EdgeTriplet[VertexID, ED]) = {
if (edge.srcAttr < edge.dstAttr) {
Iterator((edge.dstId, edge.srcAttr))
} else {
Iterator.empty
}
} }
val initialMessage = Long.MaxValue
Pregel(ccGraph, initialMessage, activeDirection = EdgeDirection.Out)(
vprog = (id, attr, msg) => math.min(attr, msg),
sendMsg = sendMessage,
mergeMsg = (a, b) => math.min(a, b))
} }
val initialMessage = Long.MaxValue
Pregel(ccGraph, initialMessage)(
vprog = (id, attr, msg) => math.min(attr, msg),
sendMsg = sendMessage,
mergeMsg = (a, b) => math.min(a, b))
} // end of connectedComponents } // end of connectedComponents
} }
...@@ -80,4 +80,34 @@ class ConnectedComponentsSuite extends FunSuite with LocalSparkContext { ...@@ -80,4 +80,34 @@ class ConnectedComponentsSuite extends FunSuite with LocalSparkContext {
} }
} // end of reverse chain connected components } // end of reverse chain connected components
test("Connected Components on a Toy Connected Graph") {
withSpark { sc =>
// Create an RDD for the vertices
val users: RDD[(VertexID, (String, String))] =
sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
(5L, ("franklin", "prof")), (2L, ("istoica", "prof")),
(4L, ("peter", "student"))))
// Create an RDD for edges
val relationships: RDD[Edge[String]] =
sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"),
Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"),
Edge(4L, 0L, "student"), Edge(5L, 0L, "colleague")))
// Edges are:
// 2 ---> 5 ---> 3
// | \
// V \|
// 4 ---> 0 7
//
// Define a default user in case there are relationship with missing user
val defaultUser = ("John Doe", "Missing")
// Build the initial Graph
val graph = Graph(users, relationships, defaultUser)
val ccGraph = graph.connectedComponents(undirected = true)
val vertices = ccGraph.vertices.collect
for ( (id, cc) <- vertices ) {
assert(cc == 0)
}
}
} // end of toy connected components
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment