diff --git a/README.md b/README.md
index ea3fe042f2f839641a1751250981d41da30fa162..139bdc070c5b7fcfaa09b1cd04a4a56a3c67cc87 100644
--- a/README.md
+++ b/README.md
@@ -114,5 +114,3 @@ submitting any copyrighted material via pull request, email, or other means
 you agree to license the material under the project's open source license and
 warrant that you have the legal authority to do so.
 
-
-
diff --git a/core/src/main/scala/org/apache/spark/rdd/IndexedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/IndexedRDD.scala
index 79a007a939fe680d037b740f9caca6bd6adfe7fb..5f95559f15122707987b5e64a23e1bd67eb6f3c1 100644
--- a/core/src/main/scala/org/apache/spark/rdd/IndexedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/IndexedRDD.scala
@@ -24,6 +24,8 @@ import java.util.{HashMap => JHashMap, BitSet => JBitSet, HashSet => JHashSet}
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.BitSet
+
 
 import org.apache.spark._
 import org.apache.spark.rdd._
@@ -53,9 +55,9 @@ class RDDIndex[@specialized K: ClassManifest](private[spark] val rdd: RDD[BlockI
     rdd.persist(newLevel)
     return this
   }
-}
-
 
+  /** The partitioner used to distribute the keys of this index (and any associated values). */
+  def partitioner: Partitioner = rdd.partitioner.get
+}
 
 
 
@@ -69,14 +71,16 @@ class RDDIndex[@specialized K: ClassManifest](private[spark] val rdd: RDD[BlockI
  */
 class IndexedRDD[K: ClassManifest, V: ClassManifest](
     @transient val index:  RDDIndex[K],
-    @transient val valuesRDD: RDD[ Seq[Seq[V]] ])
+    @transient val valuesRDD: RDD[ (IndexedSeq[V], BitSet) ])
   extends RDD[(K, V)](index.rdd.context, 
     List(new OneToOneDependency(index.rdd), new OneToOneDependency(valuesRDD)) ) {
 
+
   /**
    * An internal representation which joins the block indices with the values
    */
-  protected[spark] val tuples = new ZippedRDD(index.rdd.context, index.rdd, valuesRDD)
+  protected[spark] val tuples = 
+    new ZippedRDD(index.rdd.context, index.rdd, valuesRDD)
 
 
   /**
@@ -90,6 +94,7 @@ class IndexedRDD[K: ClassManifest, V: ClassManifest](
    */
   override def getPartitions: Array[Partition] = tuples.getPartitions 
   
+
   /**
    * The preferred locations are computed based on the preferred locations of the tuples.
    */
@@ -107,6 +112,316 @@ class IndexedRDD[K: ClassManifest, V: ClassManifest](
   }
 
 
+  /**
+   * Pass each value in the key-value pair RDD through a map function without changing the keys;
+   * this also retains the original RDD's partitioning.
+   */
+  def mapValues[U: ClassManifest](f: V => U): IndexedRDD[K, U] = {
+    val cleanF = index.rdd.context.clean(f)
+    val newValuesRDD: RDD[ (IndexedSeq[U], BitSet) ] = 
+      valuesRDD.mapPartitions(iter => iter.map{ 
+        case (values, bs) => 
+          val newValues = new Array[U](values.size)
+          for ( ind <- bs ) {
+            newValues(ind) = cleanF(values(ind))
+          }
+          (newValues.toIndexedSeq, bs)
+        }, preservesPartitioning = true)
+    new IndexedRDD[K,U](index, newValuesRDD)
+  }
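+  // Illustrative sketch (comment only, assuming a SparkContext `sc`): mapValues rewrites the
+  // values array in place and reuses the existing index, e.g.
+  //   val ratings = IndexedRDD(sc.parallelize(Seq((1, 2.0), (2, 3.0))))
+  //   val scaled  = ratings.mapValues(_ * 10.0)  // shares ratings.index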
+
+
+  /**
+   * Pass each value in the key-value pair RDD, along with its key, through a map function
+   * without changing the keys; this also retains the original RDD's partitioning.
+   */
+  def mapValuesWithKeys[U: ClassManifest](f: (K, V) => U): IndexedRDD[K, U] = {
+    val cleanF = index.rdd.context.clean(f)
+    val newValues: RDD[ (IndexedSeq[U], BitSet) ] = 
+      index.rdd.zipPartitions(valuesRDD){ (keysIter, valuesIter) => 
+        val index = keysIter.next()
+        assert(!keysIter.hasNext)
+        val (oldValues, bs) = valuesIter.next()
+        assert(!valuesIter.hasNext)
+        // Allocate the array to store the results into
+        val newValues: Array[U] = new Array[U](oldValues.size)
+        // Apply f to the key and old value at every position that is set in the bitset
+        for( (k,i) <- index ) {
+          if (bs(i)) { newValues(i) = cleanF(k, oldValues(i)) }
+        }
+        Iterator((newValues.toIndexedSeq, bs))
+      }
+    new IndexedRDD[K,U](index, newValues)
+  }
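+  // Illustrative sketch (comment only): the user function also receives the key, e.g.
+  //   ratings.mapValuesWithKeys((vid, r) => vid + r)
+  // where `ratings` is a hypothetical IndexedRDD[Int, Double]; the result reuses the same index.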
+
+
+  def zipJoin[W: ClassManifest](other: IndexedRDD[K,W]): IndexedRDD[K,(V,W)] = {
+    if(index != other.index) {
+      throw new SparkException("A zipJoin can only be applied to RDDs with the same index!")
+    }
+    val newValuesRDD: RDD[ (IndexedSeq[(V,W)], BitSet) ] = valuesRDD.zipPartitions(other.valuesRDD){
+      (thisIter, otherIter) => 
+      val (thisValues, thisBS) = thisIter.next()
+      assert(!thisIter.hasNext)
+      val (otherValues, otherBS) = otherIter.next()
+      assert(!otherIter.hasNext)
+      val newBS = thisBS & otherBS
+      val newValues = thisValues.view.zip(otherValues)
+      Iterator((newValues.toIndexedSeq, newBS))
+    }
+    new IndexedRDD(index, newValuesRDD)
+  }
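+  // Illustrative sketch (comment only, assuming a SparkContext `sc`): both sides must share the
+  // same RDDIndex object, e.g.
+  //   val a = sc.parallelize(Seq((1, "x"), (2, "y"))).indexed()
+  //   val b = sc.parallelize(Seq((1, 10), (2, 20))).indexed(a.index)
+  //   val ab = a.zipJoin(b)  // IndexedRDD[Int, (String, Int)]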
+
+
+  def leftZipJoin[W: ClassManifest](other: IndexedRDD[K,W]): IndexedRDD[K,(V,Option[W])] = {
+    if(index != other.index) {
+      throw new SparkException("A zipJoin can only be applied to RDDs with the same index!")
+    }
+    val newValuesRDD: RDD[ (IndexedSeq[(V,Option[W])], BitSet) ] = valuesRDD.zipPartitions(other.valuesRDD){
+      (thisIter, otherIter) => 
+      val (thisValues, thisBS) = thisIter.next()
+      assert(!thisIter.hasNext)
+      val (otherValues, otherBS) = otherIter.next()
+      assert(!otherIter.hasNext)
+      val otherOption = otherValues.view.zipWithIndex
+        .map{ (x: (W, Int)) => if(otherBS(x._2)) Option(x._1) else None }
+      val newValues = thisValues.view.zip(otherOption)
+      Iterator((newValues.toIndexedSeq, thisBS))
+    }
+    new IndexedRDD(index, newValuesRDD)
+  }
+
+
+
+  def leftJoin[W: ClassManifest](
+    other: RDD[(K,W)], merge: (W,W) => W = (a:W, b:W) => a):
+    IndexedRDD[K, (V, Option[W]) ] = {
+    val cleanMerge = index.rdd.context.clean(merge)
+
+    other match {
+      case other: IndexedRDD[_, _] if index == other.index => {
+        leftZipJoin(other.asInstanceOf[IndexedRDD[K,W]])
+      }
+      case _ => {
+        // Get the partitioner from the index
+        val partitioner = index.rdd.partitioner match {
+          case Some(p) => p
+          case None => throw new SparkException("An index must have a partitioner.")
+        }
+        // Shuffle the other RDD using the partitioner for this index
+        val otherShuffled = 
+          if (other.partitioner == Some(partitioner)) other 
+          else other.partitionBy(partitioner)
+        val newValues: RDD[ (IndexedSeq[(V,Option[W])], BitSet) ] = 
+          index.rdd.zipPartitions(valuesRDD, otherShuffled) {
+          (thisIndexIter, thisIter, tuplesIter) =>
+          val index = thisIndexIter.next()
+          assert(!thisIndexIter.hasNext)
+          val (thisValues, thisBS) = thisIter.next()
+          assert(!thisIter.hasNext)
+          val newW = new Array[W](thisValues.size)
+          // track which values are matched with values in other
+          val wBS = new BitSet(thisValues.size)
+          for( (k, w) <- tuplesIter if index.contains(k) ) {
+            val ind = index.get(k)
+            if(thisBS(ind)) {
+              if(wBS(ind)) {
+                newW(ind) = cleanMerge(newW(ind), w) 
+              } else {
+                newW(ind) = w
+                wBS(ind) = true
+              }
+            }
+          }
+
+          val otherOption = newW.view.zipWithIndex
+            .map{ (x: (W, Int)) => if(wBS(x._2)) Option(x._1) else None }
+          val newValues = thisValues.view.zip(otherOption)
+
+          Iterator((newValues.toIndexedSeq, thisBS))
+        } // end of newValues
+        new IndexedRDD(index, newValues) 
+      }
+    }
+  }
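+  // Illustrative sketch (comment only): `other` may be any RDD[(K, W)]; duplicate keys in
+  // `other` are collapsed with `merge` (defaults to keeping the first value), e.g.
+  //   val users  = sc.parallelize(Seq((1, "alice"), (2, "bob"))).indexed()
+  //   val logins = sc.parallelize(Seq((1, 3), (1, 2), (3, 7)))
+  //   users.leftJoin(logins, (a: Int, b: Int) => a + b)  // (1,("alice",Some(5))), (2,("bob",None))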
+
+
+
+  // 
+  // def zipJoinToRDD[W: ClassManifest](other: IndexedRDD[K,W]): RDD[(K,(V,W))] = {
+  //   if(index != other.index) {
+  //     throw new SparkException("ZipJoinRDD can only be applied to RDDs with the same index!")
+  //   }
+  //   index.rdd.zipPartitions(valuesRDD, other.valuesRDD){
+  //     (thisIndexIter, thisIter, otherIter) => 
+  //     val index = thisIndexIter.next()
+  //     assert(!thisIndexIter.hasNext)
+  //     val (thisValues, thisBS) = thisIter.next()
+  //     assert(!thisIter.hasNext)
+  //     val (otherValues, otherBS) = otherIter.next()
+  //     assert(!otherIter.hasNext)
+  //     val newBS = thisBS & otherBS
+  //     index.iterator.filter{ case (k,i) => newBS(i) }.map{ 
+  //       case (k,i) => (k, (thisValues(i), otherValues(i)))
+  //     }
+  //   }
+  // }
+
+
+/*  This is probably useful but we are not using it
+  def zipJoinWithKeys[W: ClassManifest, Z: ClassManifest](
+    other: RDD[(K,W)])(
+    f: (K, V, W) => Z, 
+    merge: (Z,Z) => Z = (a:Z, b:Z) => a):
+    IndexedRDD[K,Z] = {
+    val cleanF = index.rdd.context.clean(f)
+    val cleanMerge = index.rdd.context.clean(merge)
+    other match {
+      case other: IndexedRDD[_, _] if index == other.index => {
+        val newValues = index.rdd.zipPartitions(valuesRDD, other.valuesRDD){
+          (thisIndexIter, thisIter, otherIter) => 
+          val index = thisIndexIter.next()
+          assert(!thisIndexIter.hasNext)
+          val (thisValues, thisBS) = thisIter.next()
+          assert(!thisIter.hasNext)
+          val (otherValues, otherBS) = otherIter.next()
+          assert(!otherIter.hasNext)
+          val newValues = new Array[Z](thisValues.size)
+          val newBS = thisBS & otherBS
+          for( (k,i) <- index ) {
+            if (newBS(i)) { 
+              newValues(i) = cleanF(k, thisValues(i), otherValues(i))
+            }       
+          }
+          List((newValues, newBS)).iterator
+        }
+        new IndexedRDD(index, newValues) 
+      }
+    
+      case _ => {
+        // Get the partitioner from the index
+        val partitioner = index.rdd.partitioner match {
+          case Some(p) => p
+          case None => throw new SparkException("An index must have a partitioner.")
+        }
+        // Shuffle the other RDD using the partitioner for this index
+        val otherShuffled = 
+          if (other.partitioner == Some(partitioner)) other 
+          else other.partitionBy(partitioner)
+
+        val newValues = index.rdd.zipPartitions(valuesRDD, other) {
+          (thisIndexIter, thisIter, tuplesIter) =>
+          val index = thisIndexIter.next()
+          assert(!thisIndexIter.hasNext)
+          val (thisValues, thisBS) = thisIter.next()
+          assert(!thisIter.hasNext)
+
+          val newValues = new Array[Z](thisValues.size)
+          // track which values are matched with values in other
+          val tempBS = new BitSet(thisValues.size)
+
+          for( (k, w) <- tuplesIter if index.contains(k) ) {
+            val ind = index.get(k)
+            if(thisBS(ind)) {
+              val result = cleanF(k, thisValues(ind), w)
+              if(tempBS(ind)) {
+                newValues(ind) = cleanMerge(newValues(ind), result) 
+              } else {
+                newValues(ind) = result
+                tempBS(ind) = true
+              }
+            }
+          } 
+          List((newValues, tempBS)).iterator
+        } // end of newValues
+        new IndexedRDD(index, newValues) 
+      }
+    }
+  }
+*/
+
+/*
+  def zipJoinLeftWithKeys[W: ClassManifest, Z: ClassManifest](
+    other: RDD[(K,W)])(
+    f: (K, V, Option[W]) => Z, 
+    merge: (Z,Z) => Z = (a:Z, b:Z) => a):
+    IndexedRDD[K,Z] = {
+    val cleanF = index.rdd.context.clean(f)
+    val cleanMerge = index.rdd.context.clean(merge)
+    other match {
+      case other: IndexedRDD[_, _] if index == other.index => {
+        val newValues = index.rdd.zipPartitions(valuesRDD, other.valuesRDD){
+          (thisIndexIter, thisIter, otherIter) => 
+          val index = thisIndexIter.next()
+          assert(!thisIndexIter.hasNext)
+          val (thisValues, thisBS) = thisIter.next()
+          assert(!thisIter.hasNext)
+          val (otherValues, otherBS) = otherIter.next()
+          assert(!otherIter.hasNext)
+          val newValues = new Array[Z](thisValues.size)
+          for( (k,i) <- index ) {
+            if (thisBS(i)) { 
+              val otherVal = if(otherBS(i)) Some(otherValues(i)) else None
+              newValues(i) = cleanF(k, thisValues(i), otherVal)
+            }       
+          }
+          List((newValues, thisBS)).iterator
+        }
+        new IndexedRDD(index, newValues) 
+      }
+    
+      case _ => {
+        // Get the partitioner from the index
+        val partitioner = index.rdd.partitioner match {
+          case Some(p) => p
+          case None => throw new SparkException("An index must have a partitioner.")
+        }
+        // Shuffle the other RDD using the partitioner for this index
+        val otherShuffled = 
+          if (other.partitioner == Some(partitioner)) other 
+          else other.partitionBy(partitioner)
+        val newValues = index.rdd.zipPartitions(valuesRDD, other) {
+          (thisIndexIter, thisIter, tuplesIter) =>
+          val index = thisIndexIter.next()
+          assert(!thisIndexIter.hasNext)
+          val (thisValues, thisBS) = thisIter.next()
+          assert(!thisIter.hasNext)
+
+          val newValues = new Array[Z](thisValues.size)
+          // track which values are matched with values in other
+          val tempBS = new BitSet(thisValues.size)
+
+          for( (k, w) <- tuplesIter if index.contains(k) ) {
+            val ind = index.get(k)
+            if(thisBS(ind)) {
+              val result = cleanF(k, thisValues(ind), Option(w))
+              if(tempBS(ind)) {
+                newValues(ind) = cleanMerge(newValues(ind), result) 
+              } else {
+                newValues(ind) = result
+                tempBS(ind) = true
+              }
+            }
+          } 
+
+          // Process the remaining keys for the left join
+          for( (k,ind) <- index if thisBS(ind) && !tempBS(ind)) {
+            newValues(ind) = cleanF(k, thisValues(ind), None)
+          }
+          List((newValues, thisBS)).iterator
+        } // end of newValues
+        new IndexedRDD(index, newValues) 
+      }
+    }
+  }
+
+*/
+
+
+  /**
+   * IndexedRDD provides its own optimized version of the PairRDDFunctions.
+   */
   override def pairRDDFunctions[K1, V1](
       implicit t: (K, V) <:< (K1,V1), k: ClassManifest[K1], v: ClassManifest[V1]): 
     PairRDDFunctions[K1, V1] = {
@@ -114,22 +429,39 @@ class IndexedRDD[K: ClassManifest, V: ClassManifest](
   }
 
 
- 
+  override def filter(f: Tuple2[K,V] => Boolean): RDD[(K,V)] = {
+    val cleanF = index.rdd.context.clean(f)
+    val newValues = index.rdd.zipPartitions(valuesRDD){ (keysIter, valuesIter) => 
+      val index = keysIter.next()
+      assert(!keysIter.hasNext)
+      val (oldValues, bs) = valuesIter.next()
+      assert(!valuesIter.hasNext)
+      // Allocate a new bitset recording which entries pass the predicate
+      val newBS = new BitSet(oldValues.size)
+      // An entry survives only if it was present and satisfies the predicate
+      for( (k,i) <- index ) {
+        newBS(i) = bs(i) && cleanF( (k, oldValues(i)) ) 
+      }
+      Iterator((oldValues, newBS))
+    }
+    new IndexedRDD[K,V](index, newValues)
+  }
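+  // Illustrative sketch (comment only): filtering only clears bits in the per-partition bitset,
+  // so the result still shares this RDD's index, e.g.
+  //   val adults = people.filter { case (id, age) => age >= 18 }
+  // where `people` is a hypothetical IndexedRDD[Long, Int].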
 
 
   /**
    * Provide the RDD[(K,V)] equivalent output. 
    */
   override def compute(part: Partition, context: TaskContext): Iterator[(K, V)] = {
-    tuples.compute(part, context).flatMap { case (indexMap, values) => 
+    tuples.compute(part, context).flatMap { case (indexMap, (values, bs) ) => 
       // Walk the index to construct the key, value pairs
-      indexMap.iterator
+      indexMap.iterator 
         // Extract rows with key value pairs and indicators
-        .map{ case (k, ind) => (k, values(ind))  }
+        .map{ case (k, ind) => (bs(ind), k, ind)  }
         // Remove tuples that aren't actually present in the array
-        .filter{ case (_, valar) => valar != null && !valar.isEmpty()}
+        .filter( _._1 )
         // Extract the pair (removing the indicator from the tuple)
-        .flatMap{ case (k, valar) =>  valar.map(v => (k,v))}
+        .map( x => (x._2, values(x._3) ) )
     }
   }
 
@@ -139,74 +471,143 @@ class IndexedRDD[K: ClassManifest, V: ClassManifest](
 
 
 object IndexedRDD {
-  def apply[K: ClassManifest, V: ClassManifest](
-    tbl: RDD[(K,V)],
-    existingIndex: RDDIndex[K] = null ): IndexedRDD[K, V] = {
 
-    if (existingIndex == null) {
-      // Shuffle the table (if necessary)
-      val shuffledTbl =
-        if (tbl.partitioner.isEmpty) {
-          new ShuffledRDD[K, V, (K,V)](tbl, Partitioner.defaultPartitioner(tbl))
-        } else { tbl }
 
-      val groups = shuffledTbl.mapPartitions( iter => {
-        val indexMap = new BlockIndex[K]()
-        val values = new ArrayBuffer[Seq[V]]()
-        for ((k,v) <- iter){
-          if(!indexMap.contains(k)) {
-            val ind = indexMap.size
-            indexMap.put(k, ind)
-            values.append(ArrayBuffer.empty[V])
-          }
+  def apply[K: ClassManifest, V: ClassManifest](rdd: RDD[(K,V)]): IndexedRDD[K,V] = 
+    apply(rdd, (a:V, b:V) => a )
+
+  def apply[K: ClassManifest, V: ClassManifest](
+    rdd: RDD[(K,V)], reduceFunc: (V, V) => V): IndexedRDD[K,V] = {
+    // Pre-aggregate within each partition, then shuffle by key
+    val aggregator = new Aggregator[K, V, V](v => v, reduceFunc, reduceFunc)
+    val partitioner = new HashPartitioner(rdd.partitions.size)
+    val preAgg = rdd.mapPartitions(aggregator.combineValuesByKey).partitionBy(partitioner)
+
+    val groups = preAgg.mapPartitions( iter => {
+      val indexMap = new BlockIndex[K]()
+      val values = new ArrayBuffer[V]
+      val bs = new BitSet
+      for ((k,v) <- iter) {
+        if(!indexMap.contains(k)) {
+          val ind = indexMap.size
+          indexMap.put(k, ind)
+          values.append(v)
+          bs(ind) = true
+        } else {
           val ind = indexMap.get(k)
-          values(ind).asInstanceOf[ArrayBuffer[V]].append(v)
+          values(ind) = reduceFunc(values(ind), v)
         }
-        List((indexMap, values.toSeq)).iterator
-        }, true).cache
-      // extract the index and the values
-      val index = groups.mapPartitions(_.map{ case (kMap,vAr) => kMap }, true)
-      val values = groups.mapPartitions(_.map{ case (kMap,vAr) => vAr }, true)
-      new IndexedRDD[K,V](new RDDIndex(index), values)
-    } else {
-      val index = existingIndex
-      val partitioner = index.rdd.partitioner match {
-        case Some(p) => p
-        case None => throw new SparkException("An index must have a partitioner.")
       }
+      Iterator( (indexMap, (values.toIndexedSeq, bs)) )
+      }, true).cache
+    // extract the index and the values
+    val index = groups.mapPartitions(_.map{ case (kMap, vAr) => kMap }, true)
+    val values: RDD[(IndexedSeq[V], BitSet)] = 
+      groups.mapPartitions(_.map{ case (kMap,vAr) => vAr }, true)
+    new IndexedRDD[K,V](new RDDIndex(index), values)
+  }
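+  // Illustrative sketch (comment only, assuming a SparkContext `sc`): duplicate keys are
+  // collapsed with reduceFunc while the index is built, e.g.
+  //   IndexedRDD(sc.parallelize(Seq(("a", 1), ("b", 1), ("a", 1))), (x: Int, y: Int) => x + y)
+  //   // contains ("a", 2) and ("b", 1)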
 
-      // Shuffle the table according to the index (if necessary)
-      val shuffledTbl = 
-        if (tbl.partitioner == Some(partitioner)) {
-          tbl
-        } else {
-          new ShuffledRDD[K, V, (K,V)](tbl, partitioner)
-        }
 
-      // Use the index to build the new values table
-      val values = index.rdd.zipPartitions(shuffledTbl)(
-        (indexIter, tblIter) => {
-          // There is only one map
-          val index = indexIter.next()
-          assert(!indexIter.hasNext())
-          val values = new Array[Seq[V]](index.size)
-          for ((k,v) <- tblIter) {
-            if (!index.contains(k)) {
-              throw new SparkException("Error: Trying to bind an external index " +
-                "to an RDD which contains keys that are not in the index.")
-            }
-            val ind = index(k)
-            if (values(ind) == null) {
-              values(ind) = ArrayBuffer.empty[V]
-            }
-            values(ind).asInstanceOf[ArrayBuffer[V]].append(v)
-          }
-          List(values.toSeq).iterator
-          })
 
-      new IndexedRDD[K,V](index, values)
+  def apply[K: ClassManifest, V: ClassManifest](
+    rdd: RDD[(K,V)], index: RDDIndex[K]): IndexedRDD[K,V] = 
+    apply(rdd, index, (a:V,b:V) => a)
+
+
+  def apply[K: ClassManifest, V: ClassManifest](
+    rdd: RDD[(K,V)], index: RDDIndex[K], 
+    reduceFunc: (V, V) => V): IndexedRDD[K,V] = 
+    apply(rdd,index, (v:V) => v, reduceFunc, reduceFunc)
+  // {
+  //   // Get the index Partitioner
+  //   val partitioner = index.rdd.partitioner match {
+  //     case Some(p) => p
+  //     case None => throw new SparkException("An index must have a partitioner.")
+  //   }
+  //   // Preaggregate and shuffle if necessary
+  //   val partitioned = 
+  //     if (rdd.partitioner != Some(partitioner)) {
+  //       // Preaggregation.
+  //       val aggregator = new Aggregator[K, V, V](v => v, reduceFunc, reduceFunc)
+  //       rdd.mapPartitions(aggregator.combineValuesByKey).partitionBy(partitioner)
+  //     } else {
+  //       rdd
+  //     }
+
+  //   // Use the index to build the new values table
+  //   val values = index.rdd.zipPartitions(partitioned)( (indexIter, tblIter) => {
+  //     // There is only one map
+  //     val index = indexIter.next()
+  //     assert(!indexIter.hasNext())
+  //     val values = new Array[V](index.size)
+  //     val bs = new BitSet(index.size)
+  //     for ((k,v) <- tblIter) {
+  //       if (!index.contains(k)) {
+  //         throw new SparkException("Error: Trying to bind an external index " +
+  //           "to an RDD which contains keys that are not in the index.")
+  //       }
+  //       val ind = index(k)
+  //       if (bs(ind)) { 
+  //         values(ind) = reduceFunc(values(ind), v) 
+  //       } else {
+  //         values(ind) = v
+  //         bs(ind) = true
+  //       }
+  //     }
+  //     List((values, bs)).iterator
+  //   })
+  //   new IndexedRDD[K,V](index, values)
+  // } // end of apply
+
+
+  def apply[K: ClassManifest, V: ClassManifest, C: ClassManifest](
+    rdd: RDD[(K,V)], 
+    index: RDDIndex[K],
+    createCombiner: V => C,
+    mergeValue: (C, V) => C,
+    mergeCombiners: (C, C) => C): IndexedRDD[K,C] = {
+    // Get the index Partitioner
+    val partitioner = index.rdd.partitioner match {
+      case Some(p) => p
+      case None => throw new SparkException("An index must have a partitioner.")
     }
-  }
+    // Preaggregate and shuffle if necessary
+    val partitioned = 
+      if (rdd.partitioner != Some(partitioner)) {
+        // Preaggregation.
+        val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, 
+          mergeCombiners)
+        rdd.mapPartitions(aggregator.combineValuesByKey).partitionBy(partitioner)
+      } else {
+        rdd.mapValues(x => createCombiner(x))
+      }
+
+    // Use the index to build the new values table
+    val values: RDD[ (IndexedSeq[C], BitSet) ] = index.rdd.zipPartitions(partitioned)( (indexIter, tblIter) => {
+      // There is only one map
+      val index = indexIter.next()
+      assert(!indexIter.hasNext())
+      val values = new Array[C](index.size)
+      val bs = new BitSet(index.size)
+      for ((k,c) <- tblIter) {
+        if (!index.contains(k)) {
+          throw new SparkException("Error: Trying to bind an external index " +
+            "to an RDD which contains keys that are not in the index.")
+        }
+        val ind = index(k)
+        if (bs(ind)) { 
+          values(ind) = mergeCombiners(values(ind), c) 
+        } else {
+          values(ind) = c
+          bs(ind) = true
+        }
+      }
+      Iterator((values, bs))
+    })
+    new IndexedRDD(index, values)
+  } // end of apply
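+  // Illustrative sketch (comment only): binding a pair RDD to an existing index with
+  // combiner functions; every key in `rdd` must already appear in the index, e.g.
+  //   val idx = sc.parallelize(1 to 6).makeIndex()
+  //   IndexedRDD(sc.parallelize(Seq((1, 1), (1, 1), (2, 1))), idx,
+  //              (v: Int) => v, (c: Int, v: Int) => c + v, (a: Int, b: Int) => a + b)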
+
 
   /**
    * Construct an index of the unique values in a given RDD.
@@ -226,9 +627,7 @@ object IndexedRDD {
       }
       case Some(partitioner) => 
         tbl.partitionBy(partitioner)
-//        new ShuffledRDD[K, Boolean](tbl, partitioner)
     }
-   
 
     val index = shuffledTbl.mapPartitions( iter => {
       val indexMap = new BlockIndex[K]()
@@ -238,12 +637,12 @@ object IndexedRDD {
           indexMap.put(k, ind)   
         }
       }
-      List(indexMap).iterator
+      Iterator(indexMap)
       }, true).cache
     new RDDIndex(index)
   }
 
-}
+} // end of object IndexedRDD
 
 
 
diff --git a/core/src/main/scala/org/apache/spark/IndexedRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/IndexedRDDFunctions.scala
similarity index 61%
rename from core/src/main/scala/org/apache/spark/IndexedRDDFunctions.scala
rename to core/src/main/scala/org/apache/spark/rdd/IndexedRDDFunctions.scala
index 65c6963b71565946a7401e0e89a4448b59211b9e..fd7c16089d69e2ed41a5ebdfe8f5177ed5a7b982 100644
--- a/core/src/main/scala/org/apache/spark/IndexedRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/IndexedRDDFunctions.scala
@@ -22,6 +22,8 @@ import java.util.{HashMap => JHashMap, BitSet => JBitSet, HashSet => JHashSet}
 import scala.collection.JavaConversions._
 import scala.collection.mutable.ArrayBuffer
 
+import scala.collection.mutable.BitSet
+
 import org.apache.spark._
 
 
@@ -35,19 +37,22 @@ class IndexedRDDFunctions[K: ClassManifest, V: ClassManifest](self: IndexedRDD[K
    def reindex(): IndexedRDD[K,V] = IndexedRDD(self)
 
 
-  /**
-   * Pass each value in the key-value pair RDD through a map function without changing the keys;
-   * this also retains the original RDD's partitioning.
-   */
-  override def mapValues[U: ClassManifest](f: V => U): RDD[(K, U)] = {
-    val cleanF = self.index.rdd.context.clean(f)
-    val newValues = self.valuesRDD.mapPartitions(_.map(values => values.map{ 
-        case null => null 
-        case row => row.map(x => f(x))
-      }), true)
-    new IndexedRDD[K,U](self.index, newValues)
-  }
-
+  // /**
+  //  * Pass each value in the key-value pair RDD through a map function without changing the keys;
+  //  * this also retains the original RDD's partitioning.
+  //  */
+  // override def mapValues[U: ClassManifest](f: V => U): RDD[(K, U)] = {
+  //   val cleanF = self.index.rdd.context.clean(f)
+  //   val newValuesRDD = self.valuesRDD.mapPartitions(iter => iter.map{ 
+  //     case (values, bs) => 
+  //       val newValues = new Array[U](values.size)
+  //       for ( ind <- bs ) {
+  //         newValues(ind) = f(values(ind))
+  //       }
+  //       (newValues.toSeq, bs)
+  //     }, preservesPartitioning = true)
+  //   new IndexedRDD[K,U](self.index, newValuesRDD)
+  // }
 
   /**
    * Pass each value in the key-value pair RDD through a flatMap function without changing the
@@ -55,11 +60,20 @@ class IndexedRDDFunctions[K: ClassManifest, V: ClassManifest](self: IndexedRDD[K
    */
   override def flatMapValues[U: ClassManifest](f: V => TraversableOnce[U]): RDD[(K,U)] = {
     val cleanF = self.index.rdd.context.clean(f)
-    val newValues = self.valuesRDD.mapPartitions(_.map(values => values.map{
-        case null => null 
-        case row => row.flatMap(x => f(x))
-      }), true)
-    new IndexedRDD[K,U](self.index, newValues)
+    val newValuesRDD: RDD[(IndexedSeq[U], BitSet)] = self.valuesRDD.mapPartitions(iter => iter.map{ 
+      case (values, bs) => 
+        val newValues = new Array[U](values.size)
+        val newBS = new BitSet(values.size)
+        for ( ind <- bs ) {
+          val res = cleanF(values(ind))
+          if(!res.isEmpty) {
+            // Each key has a single value slot, so only the first element of the result is kept
+            newValues(ind) = res.toIterator.next()
+            newBS(ind) = true
+          }
+        }
+        (newValues.toIndexedSeq, newBS)
+      }, preservesPartitioning = true)
+    new IndexedRDD[K,U](self.index, newValuesRDD)
   }
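+  // Illustrative sketch (comment only): with one value slot per key, flatMapValues behaves like
+  // a combined map-and-filter; an empty result clears the key's bit, e.g.
+  //   pairs.flatMapValues(v => if (v > 0) Seq(v * 2) else Seq.empty[Int])
+  // where `pairs` is a hypothetical IndexedRDD[Int, Int].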
 
 
@@ -79,31 +93,18 @@ class IndexedRDDFunctions[K: ClassManifest, V: ClassManifest](self: IndexedRDD[K
       partitioner: Partitioner,
       mapSideCombine: Boolean = true,
       serializerClass: String = null): RDD[(K, C)] = {
-    val newValues = self.valuesRDD.mapPartitions(
-      _.map{ groups: Seq[Seq[V]] => 
-        groups.map{ group: Seq[V] => 
-          if (group != null && !group.isEmpty) {
-            val c: C = createCombiner(group.head)
-            val sum: C = group.tail.foldLeft(c)(mergeValue)
-            Seq(sum)
-          } else {
-            null
-          }
-        }
-      }, true)
-    new IndexedRDD[K,C](self.index, newValues)
+    // Each key holds at most one value, so combining within a key reduces to createCombiner
+    mapValues(createCombiner)
   }
 
  
-
-  /**
-   * Group the values for each key in the RDD into a single sequence. Hash-partitions the
-   * resulting RDD with the existing partitioner/parallelism level.
-   */
-  override def groupByKey(partitioner: Partitioner): RDD[(K, Seq[V])] = {
-    val newValues = self.valuesRDD.mapPartitions(_.map{ar => ar.map{s => Seq(s)} }, true)
-    new IndexedRDD[K, Seq[V]](self.index, newValues)
-  }
+  // /**
+  //  * Group the values for each key in the RDD into a single sequence. Hash-partitions the
+  //  * resulting RDD with the existing partitioner/parallelism level.
+  //  */
+  // override def groupByKey(partitioner: Partitioner): RDD[(K, Seq[V])] = {
+  //   val newValues = self.valuesRDD.mapPartitions(_.map{ar => ar.map{s => Seq(s)} }, true)
+  //   new IndexedRDD[K, Seq[V]](self.index, newValues)
+  // }
 
 
   /**
@@ -119,24 +120,25 @@ class IndexedRDDFunctions[K: ClassManifest, V: ClassManifest](self: IndexedRDD[K
         // then we simply merge the value RDDs. 
         // However it is possible that both RDDs are missing a value for a given key in 
         // which case the returned RDD should have a null value
-        val newValues = 
-          self.valuesRDD.zipPartitions(other.valuesRDD)(
-          (thisIter, otherIter) => {
-            val thisValues: Seq[Seq[V]] = thisIter.next()
+        val newValues: RDD[(IndexedSeq[(Seq[V], Seq[W])], BitSet)] = 
+          self.valuesRDD.zipPartitions(other.valuesRDD){
+          (thisIter, otherIter) => 
+            val (thisValues, thisBS) = thisIter.next()
             assert(!thisIter.hasNext)
-            val otherValues: Seq[Seq[W]] = otherIter.next()
-            assert(!otherIter.hasNext)   
-            // Zip the values and if both arrays are null then the key is not present and 
-            // so the resulting value must be null (not a tuple of empty sequences)
-            val tmp: Seq[Seq[(Seq[V], Seq[W])]] = thisValues.view.zip(otherValues).map{               
-              case (null, null) => null // The key is not present in either RDD
-              case (a, null) => Seq((a, Seq.empty[W]))
-              case (null, b) => Seq((Seq.empty[V], b))
-              case (a, b) => Seq((a,b))
-            }.toSeq
-            List(tmp).iterator
-          })
-        new IndexedRDD[K, (Seq[V], Seq[W])](self.index, newValues) 
+            val (otherValues, otherBS) = otherIter.next()
+            assert(!otherIter.hasNext)
+
+            val newValues = new Array[(Seq[V], Seq[W])](thisValues.size)
+            val newBS = thisBS | otherBS
+
+            for( ind <- newBS ) {
+              val a = if (thisBS(ind)) Seq(thisValues(ind)) else Seq.empty[V]
+              val b = if (otherBS(ind)) Seq(otherValues(ind)) else Seq.empty[W]
+              newValues(ind) = (a, b)
+            }
+            Iterator((newValues.toIndexedSeq, newBS))
+        }
+        new IndexedRDD(self.index, newValues) 
       }
       case other: IndexedRDD[_, _] 
         if self.index.rdd.partitioner == other.index.rdd.partitioner => {
@@ -164,33 +166,40 @@ class IndexedRDDFunctions[K: ClassManifest, V: ClassManifest](self: IndexedRDD[K
             List(newIndex).iterator
           }).cache()
         // Use the new index along with the this and the other indices to merge the values
-        val newValues = 
+        val newValues: RDD[(IndexedSeq[(Seq[V], Seq[W])], BitSet)] = 
           newIndex.zipPartitions(self.tuples, other.tuples)(
             (newIndexIter, thisTuplesIter, otherTuplesIter) => {
               // Get the new index for this partition
               val newIndex = newIndexIter.next()
               assert(!newIndexIter.hasNext)
              // Get the corresponding indices and values for this and the other IndexedRDD
-              val (thisIndex, thisValues) = thisTuplesIter.next()
+              val (thisIndex, (thisValues, thisBS)) = thisTuplesIter.next()
               assert(!thisTuplesIter.hasNext)
-              val (otherIndex, otherValues) = otherTuplesIter.next()
+              val (otherIndex, (otherValues, otherBS)) = otherTuplesIter.next()
               assert(!otherTuplesIter.hasNext)
               // Preallocate the new Values array
-              val newValues = new Array[Seq[(Seq[V],Seq[W])]](newIndex.size)
+              val newValues = new Array[(Seq[V], Seq[W])](newIndex.size)
+              val newBS = new BitSet(newIndex.size)
+
               // Lookup the sequences in both submaps
               for ((k,ind) <- newIndex) {
-                val thisSeq = if (thisIndex.contains(k)) thisValues(thisIndex.get(k)) else null
-                val otherSeq = if (otherIndex.contains(k)) otherValues(otherIndex.get(k)) else null
-                // if either of the sequences is not null then the key was in one of the two tables
-                // and so the value should appear in the returned table
-                newValues(ind) = (thisSeq, otherSeq) match {
-                  case (null, null) => null
-                  case (a, null) => Seq( (a, Seq.empty[W]) )
-                  case (null, b) => Seq( (Seq.empty[V], b) )
-                  case (a, b) => Seq( (a,b) ) 
+                // Get this RDD's value for the key (if present)
+                val a = if (thisIndex.contains(k)) {
+                  val ind = thisIndex.get(k)
+                  if(thisBS(ind)) Seq(thisValues(ind)) else Seq.empty[V]
+                } else Seq.empty[V]
+                // Get the other RDD's value for the key (if present)
+                val b = if (otherIndex.contains(k)) {
+                  val ind = otherIndex.get(k)
+                  if (otherBS(ind)) Seq(otherValues(ind)) else Seq.empty[W]
+                } else Seq.empty[W]
+                // If the key is present in at least one of the two RDDs, emit a tuple.
+                if (!a.isEmpty || !b.isEmpty) {
+                  newValues(ind) = (a, b)
+                  newBS(ind) = true                  
                 }
               }
-              List(newValues.toSeq).iterator
+              Iterator((newValues.toIndexedSeq, newBS))
             })
         new IndexedRDD(new RDDIndex(newIndex), newValues)
       }
@@ -212,49 +221,54 @@ class IndexedRDDFunctions[K: ClassManifest, V: ClassManifest](self: IndexedRDD[K
           self.tuples.zipPartitions(otherShuffled)(
           (thisTuplesIter, otherTuplesIter) => {
            // Get the corresponding indices and values for this IndexedRDD
-            val (thisIndex, thisValues) = thisTuplesIter.next()
+            val (thisIndex, (thisValues, thisBS)) = thisTuplesIter.next()
             assert(!thisTuplesIter.hasNext())
             // Construct a new index
             val newIndex = thisIndex.clone().asInstanceOf[BlockIndex[K]]
             // Construct a new array Buffer to store the values
-            val newValues = ArrayBuffer.fill[(Seq[V], Seq[W])](thisValues.size)(null)
+            val newValues = ArrayBuffer.fill[ (Seq[V], Seq[W]) ](thisValues.size)(null)
+            val newBS = new BitSet(thisValues.size)
             // populate the newValues with the values in this IndexedRDD
             for ((k,i) <- thisIndex) {
-              if (thisValues(i) != null) {
-                newValues(i) = (thisValues(i), ArrayBuffer.empty[W]) 
+              if (thisBS(i)) {
+                newValues(i) = (Seq(thisValues(i)), ArrayBuffer.empty[W]) 
+                newBS(i) = true
               }
             }
             // Now iterate through the other tuples updating the map
             for ((k,w) <- otherTuplesIter){
-              if (!newIndex.contains(k)) {
-                // update the index
-                val ind = newIndex.size
-                newIndex.put(k, ind)
-                // Update the values
-                newValues.append( (Seq.empty[V], ArrayBuffer(w) ) )               
-              } else {
+              if (newIndex.contains(k)) {
                 val ind = newIndex.get(k)
-                if(newValues(ind) == null) {
+                if(newBS(ind)) {
+                  newValues(ind)._2.asInstanceOf[ArrayBuffer[W]].append(w)
+                } else {
                   // If the other key was in the index but not in the values 
                   // of this indexed RDD then create a new values entry for it 
+                  newBS(ind) = true
                   newValues(ind) = (Seq.empty[V], ArrayBuffer(w))
-                } else {
-                  newValues(ind)._2.asInstanceOf[ArrayBuffer[W]].append(w)
-                }
+                }              
+              } else {
+                // update the index
+                val ind = newIndex.size
+                newIndex.put(k, ind)
+                newBS(ind) = true
+                // Update the values
+                newValues.append( (Seq.empty[V], ArrayBuffer(w) ) ) 
               }
             }
-            // Finalize the new values array
-            val newValuesArray: Seq[Seq[(Seq[V],Seq[W])]] = 
-              newValues.view.map{ 
-                case null => null
-                case (s, ab) => Seq((s, ab.toSeq)) 
-                }.toSeq 
-            List( (newIndex, newValuesArray) ).iterator
+            Iterator( (newIndex, (newValues.toIndexedSeq, newBS)) )
           }).cache()
 
         // Extract the index and values from the above RDD  
         val newIndex = groups.mapPartitions(_.map{ case (kMap,vAr) => kMap }, true)
-        val newValues = groups.mapPartitions(_.map{ case (kMap,vAr) => vAr }, true)
+        val newValues: RDD[(IndexedSeq[(Seq[V], Seq[W])], BitSet)] = 
+          groups.mapPartitions(_.map{ case (kMap,vAr) => vAr }, true)
           
         new IndexedRDD[K, (Seq[V], Seq[W])](new RDDIndex(newIndex), newValues)
       }
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 15099f57b37ae16e699e6dfc769ddc9825cb35a3..81bf867188ac494328063fe46486617bbbcaae01 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -265,9 +265,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](self: RDD[(K, V)])
    * pair (k, (v, None)) if no elements in `other` have key k. Uses the given Partitioner to
    * partition the output RDD.
    */
-
-  def leftOuterJoin[W: ClassManifest](other: RDD[(K, W)], partitioner: Partitioner): 
-  RDD[(K, (V, Option[W]))] = {
+  def leftOuterJoin[W: ClassManifest](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, Option[W]))] = {
     this.cogroup(other, partitioner).flatMapValues { case (vs, ws) =>
       if (ws.isEmpty) {
         vs.iterator.map(v => (v, None))
@@ -399,6 +397,15 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](self: RDD[(K, V)])
     new MappedValuesRDD(self, cleanF)
   }
 
+
+  /**
+   * Pass each key-value pair through a map function that also receives the key, without
+   * changing the keys; this also retains the original RDD's partitioning.
+   */
+  def mapValuesWithKeys[U: ClassManifest](f: (K, V) => U): RDD[(K, U)] = {
+    self.mapPartitions(_.map { case (k, v) => (k, f(k, v)) }, preservesPartitioning = true)
+  }
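+  // Illustrative sketch (comment only, assuming a SparkContext `sc`):
+  //   sc.parallelize(Seq(("a", 1), ("b", 2))).mapValuesWithKeys((k, v) => k * v)
+  //   // yields ("a", "a") and ("b", "bb")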
+
   /**
    * Pass each value in the key-value pair RDD through a flatMap function without changing the
    * keys; this also retains the original RDD's partitioning.
@@ -701,16 +708,19 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](self: RDD[(K, V)])
   def values: RDD[V] = self.map(_._2)
 
 
+
+  def indexed(): IndexedRDD[K,V] = IndexedRDD(self)
+
   def indexed(numPartitions: Int): IndexedRDD[K,V] = 
     IndexedRDD(self.partitionBy(new HashPartitioner(numPartitions)))
 
   def indexed(partitioner: Partitioner): IndexedRDD[K,V] = 
     IndexedRDD(self.partitionBy(partitioner))
 
-
-  def indexed(existingIndex: RDDIndex[K] = null): IndexedRDD[K,V] = 
+  def indexed(existingIndex: RDDIndex[K]): IndexedRDD[K,V] = 
     IndexedRDD(self, existingIndex)
 
+
   private[spark] def getKeyClass() = implicitly[ClassManifest[K]].erasure
 
   private[spark] def getValueClass() = implicitly[ClassManifest[V]].erasure
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 754c3b75a8e8db76d6b6612dea378c52a3f319f0..d14b4c60c7323c66fd25da3e0cdfa4a4763182fe 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -817,11 +817,14 @@ abstract class RDD[T: ClassManifest](
   }
 
 
+  /**
+   * Construct an index over the unique elements in this RDD.  The
+   * index can then be used to organize an RDD[(T,V)].
+   */
   def makeIndex(partitioner: Option[Partitioner] = None): RDDIndex[T] = 
     IndexedRDD.makeIndex(this, partitioner)
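+  // Illustrative sketch (comment only, assuming a SparkContext `sc`): build an index once and
+  // reuse it for several keyed RDDs, e.g.
+  //   val idx   = sc.parallelize(1 to 100).makeIndex()
+  //   val attrs = sc.parallelize(Seq((1, "a"), (2, "b"))).indexed(idx)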
 
 
-
   /**
    * Return the first element in this RDD.
    */
diff --git a/core/src/test/scala/org/apache/spark/rdd/IndexedRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/IndexedRDDSuite.scala
new file mode 100644
index 0000000000000000000000000000000000000000..3a2ce4e4da4c6510d6377dc3658d7d08450391c3
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/rdd/IndexedRDDSuite.scala
@@ -0,0 +1,461 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.rdd
+
+
+import org.scalatest.FunSuite
+import org.scalatest.prop.Checkers
+import org.scalacheck.Arbitrary._
+import org.scalacheck.Gen
+import org.scalacheck.Prop._
+
+import com.google.common.io.Files
+
+import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.HashSet
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.ShuffledRDD
+import org.apache.spark.rdd.IndexedRDD
+
+import org.apache.spark.SparkContext._
+import org.apache.spark._
+
+
+
+class IndexedRDDSuite extends FunSuite with SharedSparkContext {
+
+  def lineage(rdd: RDD[_]): collection.mutable.HashSet[RDD[_]] = {
+    val set = new collection.mutable.HashSet[RDD[_]]
+    def visit(rdd: RDD[_]) {
+      for (dep <- rdd.dependencies) {
+        set += dep.rdd
+        visit(dep.rdd)
+      }
+    }
+    visit(rdd)
+    set
+  }  
+
+  test("groupByKey") {
+    val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1))).indexed()
+    val groups = pairs.groupByKey().collect()
+    assert(groups.size === 2)
+    val valuesFor1 = groups.find(_._1 == 1).get._2
+    assert(valuesFor1.toList.sorted === List(1, 2, 3))
+    val valuesFor2 = groups.find(_._1 == 2).get._2
+    assert(valuesFor2.toList.sorted === List(1))
+  }
+
+  test("groupByKey with duplicates") {
+    val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))).indexed()
+    val groups = pairs.groupByKey().collect()
+    assert(groups.size === 2)
+    val valuesFor1 = groups.find(_._1 == 1).get._2
+    assert(valuesFor1.toList.sorted === List(1, 1, 2, 3))
+    val valuesFor2 = groups.find(_._1 == 2).get._2
+    assert(valuesFor2.toList.sorted === List(1))
+  }
+
+  test("groupByKey with negative key hash codes") {
+    val pairs = sc.parallelize(Array((-1, 1), (-1, 2), (-1, 3), (2, 1))).indexed()
+    val groups = pairs.groupByKey().collect()
+    assert(groups.size === 2)
+    val valuesForMinus1 = groups.find(_._1 == -1).get._2
+    assert(valuesForMinus1.toList.sorted === List(1, 2, 3))
+    val valuesFor2 = groups.find(_._1 == 2).get._2
+    assert(valuesFor2.toList.sorted === List(1))
+  }
+
+  test("groupByKey with many output partitions") {
+    val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1))).indexed(10)
+    val groups = pairs.groupByKey().collect()
+    assert(groups.size === 2)
+    val valuesFor1 = groups.find(_._1 == 1).get._2
+    assert(valuesFor1.toList.sorted === List(1, 2, 3))
+    val valuesFor2 = groups.find(_._1 == 2).get._2
+    assert(valuesFor2.toList.sorted === List(1))
+  }
+
+  test("reduceByKey") {
+    val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))).indexed()
+    val sums = pairs.reduceByKey(_+_).collect()
+    assert(sums.toSet === Set((1, 7), (2, 1)))
+  }
+
+  test("reduceByKey with collectAsMap") {
+    val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))).indexed()
+    val sums = pairs.reduceByKey(_+_).collectAsMap()
+    assert(sums.size === 2)
+    assert(sums(1) === 7)
+    assert(sums(2) === 1)
+  }
+
+  test("reduceByKey with many output partitons") {
+    val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))).indexed(10)
+    val sums = pairs.reduceByKey(_+_).collect()
+    assert(sums.toSet === Set((1, 7), (2, 1)))
+  }
+
+  test("reduceByKey with partitioner") {
+    val p = new Partitioner() {
+      def numPartitions = 2
+      def getPartition(key: Any) = key.asInstanceOf[Int]
+    }
+    val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 1), (0, 1))).indexed(p)
+    val sums = pairs.reduceByKey(_+_)
+    assert(sums.collect().toSet === Set((1, 4), (0, 1)))
+    assert(sums.partitioner === Some(p))
+    // count the dependencies to make sure there is only 1 ShuffledRDD
+    val deps = lineage(sums)
+    
+    assert(deps.filter(_.isInstanceOf[ShuffledRDD[_,_,_]]).size === 1) // exactly one shuffle in the lineage
+  }
+
+
+
+  test("joinIndexVsPair") {
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed()
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w')))
+    val joined = rdd1.join(rdd2).collect()
+    assert(joined.size === 4)
+    assert(joined.toSet === Set(
+      (1, (1, 'x')),
+      (1, (2, 'x')),
+      (2, (1, 'y')),
+      (2, (1, 'z'))
+    ))
+  }
+
+  test("joinIndexVsIndex") {
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed()
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))).indexed()
+    val joined = rdd1.join(rdd2).collect()
+    assert(joined.size === 4)
+    assert(joined.toSet === Set(
+      (1, (1, 'x')),
+      (1, (2, 'x')),
+      (2, (1, 'y')),
+      (2, (1, 'z'))
+    ))
+  }
+
+  test("joinSharedIndex") {
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1), (4,-4), (4, 4) )).indexed()
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))).indexed(rdd1.index)
+    val joined = rdd1.join(rdd2).collect()
+    assert(joined.size === 6)
+    assert(joined.toSet === Set(
+      (1, (1, 'x')),
+      (1, (2, 'x')),
+      (2, (1, 'y')),
+      (2, (1, 'z')),
+      (4, (-4, 'w')),
+      (4, (4, 'w'))
+    ))
+  }
+
+
+  test("join all-to-all") {
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (1, 3))).indexed()
+    val rdd2 = sc.parallelize(Array((1, 'x'), (1, 'y'))).indexed(rdd1.index)
+    val joined = rdd1.join(rdd2).collect()
+    assert(joined.size === 6)
+    assert(joined.toSet === Set(
+      (1, (1, 'x')),
+      (1, (1, 'y')),
+      (1, (2, 'x')),
+      (1, (2, 'y')),
+      (1, (3, 'x')),
+      (1, (3, 'y'))
+    ))
+  }
+
+  test("leftOuterJoinIndex") {
+    val index = sc.parallelize( 1 to 6 ).makeIndex()
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed(index)
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w')))
+    val joined = rdd1.leftOuterJoin(rdd2).collect()
+    assert(joined.size === 5)
+    assert(joined.toSet === Set(
+      (1, (1, Some('x'))),
+      (1, (2, Some('x'))),
+      (2, (1, Some('y'))),
+      (2, (1, Some('z'))),
+      (3, (1, None))
+    ))
+  }
+
+  test("leftOuterJoinIndextoIndex") {
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed()
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))).indexed()
+    val joined = rdd1.leftOuterJoin(rdd2).collect()
+    assert(joined.size === 5)
+    assert(joined.toSet === Set(
+      (1, (1, Some('x'))),
+      (1, (2, Some('x'))),
+      (2, (1, Some('y'))),
+      (2, (1, Some('z'))),
+      (3, (1, None))
+    ))
+  }
+
+  test("leftOuterJoinIndextoSharedIndex") {
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1), (4, -4))).indexed()
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))).indexed(rdd1.index)
+    val joined = rdd1.leftOuterJoin(rdd2).collect()
+    assert(joined.size === 6)
+    assert(joined.toSet === Set(
+      (1, (1, Some('x'))),
+      (1, (2, Some('x'))),
+      (2, (1, Some('y'))),
+      (2, (1, Some('z'))),
+      (4, (-4, Some('w'))),
+      (3, (1, None))
+    ))
+  }
+
+test("leftOuterJoinIndextoIndexExternal") {
+    val index = sc.parallelize( 1 to 6 ).makeIndex()
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed(index)
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))).indexed(index)
+    val joined = rdd1.leftOuterJoin(rdd2).collect()
+    assert(joined.size === 5)
+    assert(joined.toSet === Set(
+      (1, (1, Some('x'))),
+      (1, (2, Some('x'))),
+      (2, (1, Some('y'))),
+      (2, (1, Some('z'))),
+      (3, (1, None))
+    ))
+  }
+
+
+  test("rightOuterJoin") {
+    val index = sc.parallelize( 1 to 6 ).makeIndex()
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed(index)
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w')))
+    val joined = rdd1.rightOuterJoin(rdd2).collect()
+    assert(joined.size === 5)
+    assert(joined.toSet === Set(
+      (1, (Some(1), 'x')),
+      (1, (Some(2), 'x')),
+      (2, (Some(1), 'y')),
+      (2, (Some(1), 'z')),
+      (4, (None, 'w'))
+    ))
+  }
+
+  test("rightOuterJoinIndex2Index") {
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed()
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))).indexed()
+    val joined = rdd1.rightOuterJoin(rdd2).collect()
+    assert(joined.size === 5)
+    assert(joined.toSet === Set(
+      (1, (Some(1), 'x')),
+      (1, (Some(2), 'x')),
+      (2, (Some(1), 'y')),
+      (2, (Some(1), 'z')),
+      (4, (None, 'w'))
+    ))
+  }
+
+
+  test("rightOuterJoinIndex2Indexshared") {
+    val index = sc.parallelize( 1 to 6 ).makeIndex()
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed(index)
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))).indexed(index)
+    val joined = rdd1.rightOuterJoin(rdd2).collect()
+    assert(joined.size === 5)
+    assert(joined.toSet === Set(
+      (1, (Some(1), 'x')),
+      (1, (Some(2), 'x')),
+      (2, (Some(1), 'y')),
+      (2, (Some(1), 'z')),
+      (4, (None, 'w'))
+    ))
+  }
+
+
+  test("join with no matches index") {
+    val index = IndexedRDD.makeIndex( sc.parallelize( 1 to 6 ) )
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed(index)
+    val rdd2 = sc.parallelize(Array((4, 'x'), (5, 'y'), (5, 'z'), (6, 'w')))
+    val joined = rdd1.join(rdd2).collect()
+    assert(joined.size === 0)
+  }
+
+  test("join with no matches shared index") {
+    val index = IndexedRDD.makeIndex( sc.parallelize( 1 to 6 ) )
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed(index)
+    val rdd2 = sc.parallelize(Array((4, 'x'), (5, 'y'), (5, 'z'), (6, 'w'))).indexed(index)
+    val joined = rdd1.join(rdd2).collect()
+    assert(joined.size === 0)
+  }
+
+
+  test("join with many output partitions") {
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed(10)
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w')))
+    val joined = rdd1.join(rdd2).collect()
+    assert(joined.size === 4)
+    assert(joined.toSet === Set(
+      (1, (1, 'x')),
+      (1, (2, 'x')),
+      (2, (1, 'y')),
+      (2, (1, 'z'))
+    ))
+  }
+
+  test("join with many output partitions and two indices") {
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed(10)
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))).indexed(20)
+    val joined = rdd1.join(rdd2).collect()
+    assert(joined.size === 4)
+    assert(joined.toSet === Set(
+      (1, (1, 'x')),
+      (1, (2, 'x')),
+      (2, (1, 'y')),
+      (2, (1, 'z'))
+    ))
+  }
+
+
+  test("groupWith") {
+    val index = IndexedRDD.makeIndex( sc.parallelize( 1 to 6 ) )
+
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))).indexed(index)
+    val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))).indexed(index)
+    val joined = rdd1.groupWith(rdd2).collect()
+    assert(joined.size === 4)
+    assert(joined.toSet === Set(
+      (1, (ArrayBuffer(1, 2), ArrayBuffer('x'))),
+      (2, (ArrayBuffer(1), ArrayBuffer('y', 'z'))),
+      (3, (ArrayBuffer(1), ArrayBuffer())),
+      (4, (ArrayBuffer(), ArrayBuffer('w')))
+    ))
+  }
+
+  test("zero-partition RDD") {
+    val emptyDir = Files.createTempDir()
+    val file = sc.textFile(emptyDir.getAbsolutePath)
+    assert(file.partitions.size == 0)
+    assert(file.collect().toList === Nil)
+    // Test that a shuffle on the file works, because this used to be a bug
+    assert(file.map(line => (line, 1)).reduceByKey(_ + _).collect().toList === Nil)
+  }
+
+  test("keys and values") {
+    val rdd = sc.parallelize(Array((1, "a"), (2, "b"))).indexed()
+    assert(rdd.keys.collect().toList === List(1, 2))
+    assert(rdd.values.collect().toList === List("a", "b"))
+  }
+
+  test("default partitioner uses partition size") {
+    // specify 2000 partitions
+    val a = sc.makeRDD(Array(1, 2, 3, 4), 2000)
+    // do a map, which loses the partitioner
+    val b = a.map(a => (a, (a * 2).toString))
+    // then a group by, and see we didn't revert to 2 partitions
+    val c = b.groupByKey()
+    assert(c.partitions.size === 2000)
+  }
+
+  // test("default partitioner uses largest partitioner indexed to indexed") {
+  //   val a = sc.makeRDD(Array((1, "a"), (2, "b")), 2).indexed()
+  //   val b = sc.makeRDD(Array((1, "a"), (2, "b")), 2000).indexed()
+  //   val c = a.join(b)
+  //   assert(c.partitions.size === 2000)
+  // }
+
+
+
+  test("subtract") {
+    val a = sc.parallelize(Array(1, 2, 3), 2)
+    val b = sc.parallelize(Array(2, 3, 4), 4)
+    val c = a.subtract(b)
+    assert(c.collect().toSet === Set(1))
+    assert(c.partitions.size === a.partitions.size)
+  }
+
+  test("subtract with narrow dependency") {
+    // use a deterministic partitioner
+    val p = new Partitioner() {
+      def numPartitions = 5
+      def getPartition(key: Any) = key.asInstanceOf[Int]
+    }
+    // partitionBy so we have a narrow dependency
+    val a = sc.parallelize(Array((1, "a"), (2, "b"), (3, "c"))).indexed(p)
+    // more partitions/no partitioner so a shuffle dependency
+    val b = sc.parallelize(Array((2, "b"), (3, "cc"), (4, "d")), 4)
+    val c = a.subtract(b)
+    assert(c.collect().toSet === Set((1, "a"), (3, "c")))
+    // Ideally we could keep the original partitioner...
+    assert(c.partitioner === None)
+  }
+
+  test("subtractByKey") {
+
+    val a = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 2).indexed()
+    val b = sc.parallelize(Array((2, 20), (3, 30), (4, 40)), 4)
+    val c = a.subtractByKey(b)
+    assert(c.collect().toSet === Set((1, "a"), (1, "a")))
+    assert(c.partitions.size === a.partitions.size)
+  }
+
+  // test("subtractByKey with narrow dependency") {
+  //   // use a deterministic partitioner
+  //   val p = new Partitioner() {
+  //     def numPartitions = 5
+  //     def getPartition(key: Any) = key.asInstanceOf[Int]
+  //   }
+
+  //   val index = sc.parallelize( 1 to 6 ).makeIndex(Some(p))
+  //   // partitionBy so we have a narrow dependency
+  //   val a = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c"))).indexed(index)
+  //   // more partitions/no partitioner so a shuffle dependency
+  //   val b = sc.parallelize(Array((2, "b"), (3, "cc"), (4, "d")), 4).indexed(index)
+  //   val c = a.subtractByKey(b)
+  //   assert(c.collect().toSet === Set((1, "a"), (1, "a")))
+  //   assert(c.partitioner.get === p)
+  // }
+
+  test("foldByKey") {
+    val index = IndexedRDD.makeIndex( sc.parallelize( 1 to 6 ) )
+    val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))).indexed(index)
+    val sums = pairs.foldByKey(0)(_+_).collect()
+    assert(sums.toSet === Set((1, 7), (2, 1)))
+  }
+
+  test("foldByKey with mutable result type") {
+    val index = IndexedRDD.makeIndex( sc.parallelize( 1 to 6 ) )
+
+    val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))).indexed(index)
+    val bufs = pairs.mapValues(v => ArrayBuffer(v)).cache()
+    // Fold the values using in-place mutation
+    val sums = bufs.foldByKey(new ArrayBuffer[Int])(_ ++= _).collect()
+    assert(sums.toSet === Set((1, ArrayBuffer(1, 2, 3, 1)), (2, ArrayBuffer(1))))
+    // Check that the mutable objects in the original RDD were not changed
+    assert(bufs.collect().toSet === Set(
+      (1, ArrayBuffer(1)),
+      (1, ArrayBuffer(2)),
+      (1, ArrayBuffer(3)),
+      (1, ArrayBuffer(1)),
+      (2, ArrayBuffer(1))))
+  }
+}
diff --git a/graph/src/main/scala/org/apache/spark/graph/Analytics.scala b/graph/src/main/scala/org/apache/spark/graph/Analytics.scala
index 09cf81eeeb78bc0a9099e89c3ec0ad952e7b09a8..92632db491bebab139c064624503bdfe3d4d2c4f 100644
--- a/graph/src/main/scala/org/apache/spark/graph/Analytics.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/Analytics.scala
@@ -6,37 +6,6 @@ import org.apache.spark._
 
 object Analytics extends Logging {
 
-//  def main(args: Array[String]) {
-//    //pregelPagerank()
-//  }
-
-  // /**
-  //  * Compute the PageRank of a graph returning the pagerank of each vertex as an RDD
-  //  */
-  // // def pagerank[VD: Manifest, ED: Manifest](graph: Graph[VD, ED], numIter: Int) = {
-  // //   // Compute the out degree of each vertex
-  // //   val pagerankGraph = graph.updateVertices[Int, (Int, Float)](graph.outDegrees,
-  // //     (vertex, deg) => (deg.getOrElse(0), 1.0F)
-  // //   )
-  // //   GraphLab.iterateGA[(Int, Float), ED, Float](pagerankGraph)(
-  // //     (me_id, edge) => edge.src.data._2 / edge.src.data._1, // gather
-  // //     (a: Float, b: Float) => a + b, // merge
-  // //     (vertex, a: Option[Float]) => (vertex.data._1, (0.15F + 0.85F * a.getOrElse(0F))), // apply
-  // //     numIter).mapVertices{ case Vertex(id, (outDeg, r)) => Vertex(id, r) }
-  // // }
-  // def pagerank[VD: Manifest, ED: Manifest](graph: Graph[VD, ED], numIter: Int) = {
-  //   // Compute the out degree of each vertex
-  //   val pagerankGraph = graph.updateVertices[Int, (Int, Double)](graph.outDegrees,
-  //     (vertex, deg) => (deg.getOrElse(0), 1.0)
-  //   )
-  //   GraphLab.iterateGA2[(Int, Double), ED, Double](pagerankGraph)(
-  //     (me_id, edge) => edge.src.data._2 / edge.src.data._1, // gather
-  //     (a: Double, b: Double) => a + b, // merge
-  //     0.0, // default
-  //     (vertex, a: Double) => (vertex.data._1, (0.15 + 0.85 * a)), // apply
-  //     numIter).mapVertices{ case Vertex(id, (outDeg, r)) => Vertex(id, r) }
-  // }
-
   /**
    * Compute the PageRank of a graph returning the pagerank of each vertex as an RDD
    */
@@ -44,17 +13,21 @@ object Analytics extends Logging {
                                            numIter: Int,
                                            resetProb: Double = 0.15) = {
     // Compute the out degree of each vertex
-    val pagerankGraph = graph.leftJoinVertices[Int, (Int, Double)](graph.outDegrees,
-      (vertex, deg) => (deg.getOrElse(0), 1.0)
-    )
+    val pagerankGraph = graph.outerJoinVertices(graph.outDegrees){
+      (vid, vdata, deg) => (deg.getOrElse(0), 1.0)
+    }
+
+    println(pagerankGraph.statistics)
+    
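+    // Each vertex now carries (outDegree, initialRank); Pregel repeatedly gathers
+    // rank/outDegree contributions along in-edges, sums them, and applies the
+    // reset-probability update below.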
     Pregel.iterate[(Int, Double), ED, Double](pagerankGraph)(
-      (vertex, a: Double) => (vertex.data._1, (resetProb + (1.0 - resetProb) * a)), // apply
-      (me_id, edge) => Some(edge.src.data._2 / edge.src.data._1), // gather
+      (vid, data, a: Double) => (data._1, (resetProb + (1.0 - resetProb) * a)), // apply
+      (me_id, edge) => Some(edge.srcAttr._2 / edge.srcAttr._1), // gather
       (a: Double, b: Double) => a + b, // merge
       1.0,
-      numIter).mapVertices{ case Vertex(id, (outDeg, r)) => r }
+      numIter).mapVertices{ case (id, (outDeg, r)) => r }
   }
 
+
   /**
    * Compute the PageRank of a graph returning the pagerank of each vertex as an RDD
    */
@@ -63,18 +36,18 @@ object Analytics extends Logging {
                                                   maxIter: Int = Integer.MAX_VALUE,
                                                   resetProb: Double = 0.15) = {
     // Compute the out degree of each vertex
-    val pagerankGraph = graph.leftJoinVertices[Int, (Int, Double, Double)](graph.outDegrees,
-      (vertex, degIter) => (degIter.sum, 1.0, 1.0)
-    )
-
+    val pagerankGraph = graph.outerJoinVertices(graph.outDegrees){
+      (id, data, degIter) => (degIter.sum, 1.0, 1.0)
+    }
+    
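+    // Vertex state is (outDegree, rank, prevRank); the scatter phase below keeps
+    // neighbors active while |prevRank - rank| on the source exceeds tol.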
     // Run PageRank
     GraphLab.iterate(pagerankGraph)(
-      (me_id, edge) => edge.src.data._2 / edge.src.data._1, // gather
+      (me_id, edge) => edge.srcAttr._2 / edge.srcAttr._1, // gather
       (a: Double, b: Double) => a + b,
-      (vertex, a: Option[Double]) =>
-        (vertex.data._1, (resetProb + (1.0 - resetProb) * a.getOrElse(0.0)), vertex.data._2), // apply
-      (me_id, edge) => math.abs(edge.src.data._3 - edge.src.data._2) > tol, // scatter
-      maxIter).mapVertices { case Vertex(vid, data) => data._2 }
+      (id, data, a: Option[Double]) =>
+        (data._1, (resetProb + (1.0 - resetProb) * a.getOrElse(0.0)), data._2), // apply
+      (me_id, edge) => math.abs(edge.srcAttr._3 - edge.srcAttr._2) > tol, // scatter
+      maxIter).mapVertices { case (vid, data) => data._2 }
   }
 
 
@@ -84,346 +57,239 @@ object Analytics extends Logging {
    * lowest vertex id in the connected component containing
    * that vertex.
    */
-  def connectedComponents[VD: Manifest, ED: Manifest](graph: Graph[VD, ED]) = {
-    val ccGraph = graph.mapVertices { case Vertex(vid, _) => vid }
-
+  def connectedComponents[VD: Manifest, ED: Manifest](graph: Graph[VD, ED], numIter: Int) = {
+    val ccGraph = graph.mapVertices { case (vid, _) => vid }
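+    // Every vertex starts labeled with its own id; GraphLab then repeatedly
+    // propagates the minimum id along edges in both directions.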
     GraphLab.iterate(ccGraph)(
-      (me_id, edge) => edge.otherVertex(me_id).data, // gather
+      (me_id, edge) => edge.otherVertexAttr(me_id), // gather
       (a: Vid, b: Vid) => math.min(a, b), // merge
-      (v, a: Option[Vid]) => math.min(v.data, a.getOrElse(Long.MaxValue)), // apply
-      (me_id, edge) => (edge.vertex(me_id).data < edge.otherVertex(me_id).data), // scatter
+      (id, data, a: Option[Vid]) => math.min(data, a.getOrElse(Long.MaxValue)), // apply
+      (me_id, edge) => (edge.vertexAttr(me_id) < edge.otherVertexAttr(me_id)), // scatter
+      numIter,
       gatherDirection = EdgeDirection.Both, scatterDirection = EdgeDirection.Both
     )
   }
-
-  //   /**
-  //    * Compute the shortest path to a set of markers
-  //    */
-  //   def shortestPath[VD: Manifest](graph: Graph[VD, Float], sources: List[Int], numIter: Int) = {
-  //     val sourceSet = sources.toSet
-  //     val spGraph = graph.mapVertices {
-  //       case Vertex(vid, _) => Vertex(vid, (if(sourceSet.contains(vid)) 0.0F else Float.MaxValue))
-  //     }
-  //     GraphLab.iterateGA[Float, Float, Float](spGraph)(
-  //       (me_id, edge) => edge.otherVertex(me_id).data + edge.data, // gather
-  //       (a: Float, b: Float) => math.min(a, b), // merge
-  //       (v, a: Option[Float]) => math.min(v.data, a.getOrElse(Float.MaxValue)), // apply
-  //       numIter,
-  //       gatherDirection = EdgeDirection.In)
-  //   }
-
-  //   // /**
-  //   //  * Compute the connected component membership of each vertex
-  //   //  * and return an RDD with the vertex value containing the
-  //   //  * lowest vertex id in the connected component containing
-  //   //  * that vertex.
-  //   //  */
-  //   // def dynamicConnectedComponents[VD: Manifest, ED: Manifest](graph: Graph[VD, ED],
-  //   //   numIter: Int = Int.MaxValue) = {
-
-  //   //   val vertices = graph.vertices.mapPartitions(iter => iter.map { case (vid, _) => (vid, vid) })
-  //   //   val edges = graph.edges // .mapValues(v => None)
-  //   //   val ccGraph = new Graph(vertices, edges)
-
-  //   //   ccGraph.iterateDynamic(
-  //   //     (me_id, edge) => edge.otherVertex(me_id).data, // gather
-  //   //     (a: Int, b: Int) => math.min(a, b), // merge
-  //   //     Integer.MAX_VALUE,
-  //   //     (v, a: Int) => math.min(v.data, a), // apply
-  //   //     (me_id, edge) => edge.otherVertex(me_id).data > edge.vertex(me_id).data, // scatter
-  //   //     numIter,
-  //   //     gatherEdges = EdgeDirection.Both,
-  //   //     scatterEdges = EdgeDirection.Both).vertices
-  //   //   //
-  //   //   //    graph_ret.vertices.collect.foreach(println)
-  //   //   //    graph_ret.edges.take(10).foreach(println)
-  //   // }
-
-
-  //   // /**
-  //   //  * Compute the shortest path to a set of markers
-  //   //  */
-  //   //  def dynamicShortestPath[VD: Manifest, ED: Manifest](graph: Graph[VD, Float],
-  //   //   sources: List[Int], numIter: Int) = {
-  //   //   val sourceSet = sources.toSet
-  //   //   val vertices = graph.vertices.mapPartitions(
-  //   //     iter => iter.map {
-  //   //       case (vid, _) => (vid, (if(sourceSet.contains(vid)) 0.0F else Float.MaxValue) )
-  //   //       });
-
-  //   //   val edges = graph.edges // .mapValues(v => None)
-  //   //   val spGraph = new Graph(vertices, edges)
-
-  //   //   val niterations = Int.MaxValue
-  //   //   spGraph.iterateDynamic(
-  //   //     (me_id, edge) => edge.otherVertex(me_id).data + edge.data, // gather
-  //   //     (a: Float, b: Float) => math.min(a, b), // merge
-  //   //     Float.MaxValue,
-  //   //     (v, a: Float) => math.min(v.data, a), // apply
-  //   //     (me_id, edge) => edge.vertex(me_id).data + edge.data < edge.otherVertex(me_id).data, // scatter
-  //   //     numIter,
-  //   //     gatherEdges = EdgeDirection.In,
-  //   //     scatterEdges = EdgeDirection.Out).vertices
-  //   // }
-
-
-  //   // /**
-  //   //  *
-  //   //  */
-  //   // def alternatingLeastSquares[VD: ClassManifest, ED: ClassManifest](graph: Graph[VD, Double],
-  //   //   latentK: Int, lambda: Double, numIter: Int) = {
-  //   //   val vertices = graph.vertices.mapPartitions( _.map {
-  //   //       case (vid, _) => (vid,  Array.fill(latentK){ scala.util.Random.nextDouble() } )
-  //   //       }).cache
-  //   //   val maxUser = graph.edges.map(_._1).reduce(math.max(_,_))
-  //   //   val edges = graph.edges // .mapValues(v => None)
-  //   //   val alsGraph = new Graph(vertices, edges)
-  //   //   alsGraph.numVPart = graph.numVPart
-  //   //   alsGraph.numEPart = graph.numEPart
-
-  //   //   val niterations = Int.MaxValue
-  //   //   alsGraph.iterateDynamic[(Array[Double], Array[Double])](
-  //   //     (me_id, edge) => { // gather
-  //   //       val X = edge.otherVertex(me_id).data
-  //   //       val y = edge.data
-  //   //       val Xy = X.map(_ * y)
-  //   //       val XtX = (for(i <- 0 until latentK; j <- i until latentK) yield(X(i) * X(j))).toArray
-  //   //       (Xy, XtX)
-  //   //     },
-  //   //     (a, b) => {
-  //   //     // The difference between the while loop and the zip is a FACTOR OF TWO in overall
-  //   //     //  runtime
-  //   //       var i = 0
-  //   //       while(i < a._1.length) { a._1(i) += b._1(i); i += 1 }
-  //   //       i = 0
-  //   //       while(i < a._2.length) { a._2(i) += b._2(i); i += 1 }
-  //   //       a
-  //   //       // (a._1.zip(b._1).map{ case (q,r) => q+r }, a._2.zip(b._2).map{ case (q,r) => q+r })
-  //   //     },
-  //   //     (Array.empty[Double], Array.empty[Double]), // default value is empty
-  //   //     (vertex, accum) => { // apply
-  //   //       val XyArray  = accum._1
-  //   //       val XtXArray = accum._2
-  //   //       if(XyArray.isEmpty) vertex.data // no neighbors
-  //   //       else {
-  //   //         val XtX = DenseMatrix.tabulate(latentK,latentK){ (i,j) =>
-  //   //           (if(i < j) XtXArray(i + (j+1)*j/2) else XtXArray(i + (j+1)*j/2)) +
-  //   //           (if(i == j) lambda else 1.0F) //regularization
-  //   //         }
-  //   //         val Xy = DenseMatrix.create(latentK,1,XyArray)
-  //   //         val w = XtX \ Xy
-  //   //         w.data
-  //   //       }
-  //   //     },
-  //   //     (me_id, edge) => true,
-  //   //     numIter,
-  //   //     gatherEdges = EdgeDirection.Both,
-  //   //     scatterEdges = EdgeDirection.Both,
-  //   //     vertex => vertex.id < maxUser).vertices
-  //   // }
-
-  //   def main(args: Array[String]) = {
-  //     val host = args(0)
-  //     val taskType = args(1)
-  //     val fname = args(2)
-  //     val options =  args.drop(3).map { arg =>
-  //       arg.dropWhile(_ == '-').split('=') match {
-  //         case Array(opt, v) => (opt -> v)
-  //         case _ => throw new IllegalArgumentException("Invalid argument: " + arg)
-  //       }
-  //     }
-
-  //     System.setProperty("spark.serializer", "spark.KryoSerializer")
-  //     //System.setProperty("spark.shuffle.compress", "false")
-  //     System.setProperty("spark.kryo.registrator", "spark.graph.GraphKryoRegistrator")
-
-  //     taskType match {
-  //       case "pagerank" => {
-
-  //         var numIter = Int.MaxValue
-  //         var isDynamic = false
-  //         var tol:Float = 0.001F
-  //         var outFname = ""
-  //         var numVPart = 4
-  //         var numEPart = 4
-
-  //         options.foreach{
-  //           case ("numIter", v) => numIter = v.toInt
-  //           case ("dynamic", v) => isDynamic = v.toBoolean
-  //           case ("tol", v) => tol = v.toFloat
-  //           case ("output", v) => outFname = v
-  //           case ("numVPart", v) => numVPart = v.toInt
-  //           case ("numEPart", v) => numEPart = v.toInt
-  //           case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
-  //         }
-
-  //         if(!isDynamic && numIter == Int.MaxValue) {
-  //           println("Set number of iterations!")
-  //           sys.exit(1)
-  //         }
-  //         println("======================================")
-  //         println("|             PageRank               |")
-  //         println("--------------------------------------")
-  //         println(" Using parameters:")
-  //         println(" \tDynamic:  " + isDynamic)
-  //         if(isDynamic) println(" \t  |-> Tolerance: " + tol)
-  //         println(" \tNumIter:  " + numIter)
-  //         println("======================================")
-
-  //         val sc = new SparkContext(host, "PageRank(" + fname + ")")
-
-  //         val graph = Graph.textFile(sc, fname, a => 1.0F).withPartitioner(numVPart, numEPart).cache()
-
-  //         val startTime = System.currentTimeMillis
-  //         logInfo("GRAPHX: starting tasks")
-  //         logInfo("GRAPHX: Number of vertices " + graph.vertices.count)
-  //         logInfo("GRAPHX: Number of edges " + graph.edges.count)
-
-  //         val pr = Analytics.pagerank(graph, numIter)
-  //         // val pr = if(isDynamic) Analytics.dynamicPagerank(graph, tol, numIter)
-  //         //   else  Analytics.pagerank(graph, numIter)
-  //         logInfo("GRAPHX: Total rank: " + pr.vertices.map{ case Vertex(id,r) => r }.reduce(_+_) )
-  //         if (!outFname.isEmpty) {
-  //           println("Saving pageranks of pages to " + outFname)
-  //           pr.vertices.map{case Vertex(id, r) => id + "\t" + r}.saveAsTextFile(outFname)
-  //         }
-  //         logInfo("GRAPHX: Runtime:    " + ((System.currentTimeMillis - startTime)/1000.0) + " seconds")
-  //         sc.stop()
-  //       }
-
-  //      case "cc" => {
-
-  //         var numIter = Int.MaxValue
-  //         var isDynamic = false
-
-  //         options.foreach{
-  //           case ("numIter", v) => numIter = v.toInt
-  //           case ("dynamic", v) => isDynamic = v.toBoolean
-  //           case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
-  //         }
-
-  //         if(!isDynamic && numIter == Int.MaxValue) {
-  //           println("Set number of iterations!")
-  //           sys.exit(1)
-  //         }
-  //         println("======================================")
-  //         println("|      Connected Components          |")
-  //         println("--------------------------------------")
-  //         println(" Using parameters:")
-  //         println(" \tDynamic:  " + isDynamic)
-  //         println(" \tNumIter:  " + numIter)
-  //         println("======================================")
-
-  //         val sc = new SparkContext(host, "ConnectedComponents(" + fname + ")")
-  //         val graph = Graph.textFile(sc, fname, a => 1.0F)
-  //         val cc = Analytics.connectedComponents(graph, numIter)
-  //         // val cc = if(isDynamic) Analytics.dynamicConnectedComponents(graph, numIter)
-  //         //   else  Analytics.connectedComponents(graph, numIter)
-  //         println("Components: " + cc.vertices.map(_.data).distinct())
-
-  //         sc.stop()
-  //       }
-
-  //      case "shortestpath" => {
-
-  //         var numIter = Int.MaxValue
-  //         var isDynamic = true
-  //         var sources: List[Int] = List.empty
-
-  //         options.foreach{
-  //           case ("numIter", v) => numIter = v.toInt
-  //           case ("dynamic", v) => isDynamic = v.toBoolean
-  //           case ("source", v) => sources ++= List(v.toInt)
-  //           case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
-  //         }
-
-
-  //         if(!isDynamic && numIter == Int.MaxValue) {
-  //           println("Set number of iterations!")
-  //           sys.exit(1)
-  //         }
-
-  //         if(sources.isEmpty) {
-  //           println("No sources provided!")
-  //           sys.exit(1)
-  //         }
-
-  //         println("======================================")
-  //         println("|          Shortest Path             |")
-  //         println("--------------------------------------")
-  //         println(" Using parameters:")
-  //         println(" \tDynamic:  " + isDynamic)
-  //         println(" \tNumIter:  " + numIter)
-  //         println(" \tSources:  [" + sources.mkString(", ") + "]")
-  //         println("======================================")
-
-  //         val sc = new SparkContext(host, "ShortestPath(" + fname + ")")
-  //         val graph = Graph.textFile(sc, fname, a => (if(a.isEmpty) 1.0F else a(0).toFloat ) )
-  //         val sp = Analytics.shortestPath(graph, sources, numIter)
-  //         // val cc = if(isDynamic) Analytics.dynamicShortestPath(graph, sources, numIter)
-  //         //   else  Analytics.shortestPath(graph, sources, numIter)
-  //         println("Longest Path: " + sp.vertices.map(_.data).reduce(math.max(_,_)))
-
-  //         sc.stop()
-  //       }
-
-
-  //      //  case "als" => {
-
-  //      //    var numIter = 5
-  //      //    var lambda = 0.01
-  //      //    var latentK = 10
-  //      //    var usersFname = "usersFactors.tsv"
-  //      //    var moviesFname = "moviesFname.tsv"
-  //      //    var numVPart = 4
-  //      //    var numEPart = 4
-
-  //      //    options.foreach{
-  //      //      case ("numIter", v) => numIter = v.toInt
-  //      //      case ("lambda", v) => lambda = v.toDouble
-  //      //      case ("latentK", v) => latentK = v.toInt
-  //      //      case ("usersFname", v) => usersFname = v
-  //      //      case ("moviesFname", v) => moviesFname = v
-  //      //      case ("numVPart", v) => numVPart = v.toInt
-  //      //      case ("numEPart", v) => numEPart = v.toInt
-  //      //      case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
-  //      //    }
-
-  //      //    println("======================================")
-  //      //    println("|       Alternating Least Squares    |")
-  //      //    println("--------------------------------------")
-  //      //    println(" Using parameters:")
-  //      //    println(" \tNumIter:     " + numIter)
-  //      //    println(" \tLambda:      " + lambda)
-  //      //    println(" \tLatentK:     " + latentK)
-  //      //    println(" \tusersFname:  " + usersFname)
-  //      //    println(" \tmoviesFname: " + moviesFname)
-  //      //    println("======================================")
-
-  //      //    val sc = new SparkContext(host, "ALS(" + fname + ")")
-  //      //    val graph = Graph.textFile(sc, fname, a => a(0).toDouble )
-  //      //    graph.numVPart = numVPart
-  //      //    graph.numEPart = numEPart
-
-  //      //    val maxUser = graph.edges.map(_._1).reduce(math.max(_,_))
-  //      //    val minMovie = graph.edges.map(_._2).reduce(math.min(_,_))
-  //      //    assert(maxUser < minMovie)
-
-  //      //    val factors = Analytics.alternatingLeastSquares(graph, latentK, lambda, numIter).cache
-  //      //    factors.filter(_._1 <= maxUser).map(r => r._1 + "\t" + r._2.mkString("\t"))
-  //      //      .saveAsTextFile(usersFname)
-  //      //    factors.filter(_._1 >= minMovie).map(r => r._1 + "\t" + r._2.mkString("\t"))
-  //      //      .saveAsTextFile(moviesFname)
-
-  //      //    sc.stop()
-  //      //  }
-
-
-  //       case _ => {
-  //         println("Invalid task type.")
-  //       }
-  //     }
-  //   }
+  
+  def main(args: Array[String]) = {
+    val host = args(0)
+    val taskType = args(1)
+    val fname = args(2)
+    val options =  args.drop(3).map { arg =>
+      arg.dropWhile(_ == '-').split('=') match {
+        case Array(opt, v) => (opt -> v)
+        case _ => throw new IllegalArgumentException("Invalid argument: " + arg)
+      }
+    }
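+    // options now holds the remaining --key=value arguments parsed into (key, value) pairs.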
+    
+    def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = {
+      loggers.map { loggerName =>
+        val logger = org.apache.log4j.Logger.getLogger(loggerName)
+        val prevLevel = logger.getLevel()
+        logger.setLevel(level)
+        loggerName -> prevLevel
+      }.toMap
+    }
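+    // setLogLevels returns each logger's previous level so a caller could restore it later.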
+    // setLogLevels(org.apache.log4j.Level.DEBUG, Seq("org.apache.spark"))
+
+     val serializer = "org.apache.spark.serializer.KryoSerializer"
+     System.setProperty("spark.serializer", serializer)
+     //System.setProperty("spark.shuffle.compress", "false")
+     System.setProperty("spark.kryo.registrator", "org.apache.spark.graph.GraphKryoRegistrator")
+
+     taskType match {
+       case "pagerank" => {
+
+         var numIter = Int.MaxValue
+         var isDynamic = false
+         var tol:Float = 0.001F
+         var outFname = ""
+         var numVPart = 4
+         var numEPart = 4
+
+         options.foreach{
+           case ("numIter", v) => numIter = v.toInt
+           case ("dynamic", v) => isDynamic = v.toBoolean
+           case ("tol", v) => tol = v.toFloat
+           case ("output", v) => outFname = v
+           case ("numVPart", v) => numVPart = v.toInt
+           case ("numEPart", v) => numEPart = v.toInt
+           case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
+         }
+
+         if(!isDynamic && numIter == Int.MaxValue) {
+           println("Set number of iterations!")
+           sys.exit(1)
+         }
+         println("======================================")
+         println("|             PageRank               |")
+         println("--------------------------------------")
+         println(" Using parameters:")
+         println(" \tDynamic:  " + isDynamic)
+         if(isDynamic) println(" \t  |-> Tolerance: " + tol)
+         println(" \tNumIter:  " + numIter)
+         println("======================================")
+
+         val sc = new SparkContext(host, "PageRank(" + fname + ")")
+
+         val graph = GraphLoader.textFile(sc, fname, a => 1.0F, 
+          minEdgePartitions = numEPart, minVertexPartitions = numVPart).cache()
+
+         val startTime = System.currentTimeMillis
+         logInfo("GRAPHX: starting tasks")
+         logInfo("GRAPHX: Number of vertices " + graph.vertices.count)
+         logInfo("GRAPHX: Number of edges " + graph.edges.count)
+
+         val pr = Analytics.pagerank(graph, numIter)
+         // val pr = if(isDynamic) Analytics.dynamicPagerank(graph, tol, numIter)
+         //   else  Analytics.pagerank(graph, numIter)
+         logInfo("GRAPHX: Total rank: " + pr.vertices.map{ case (id,r) => r }.reduce(_+_) )
+         if (!outFname.isEmpty) {
+           println("Saving pageranks of pages to " + outFname)
+           pr.vertices.map{case (id, r) => id + "\t" + r}.saveAsTextFile(outFname)
+         }
+         logInfo("GRAPHX: Runtime:    " + ((System.currentTimeMillis - startTime)/1000.0) + " seconds")
+         sc.stop()
+       }
+
+        case "cc" => {
+
+           var numIter = Int.MaxValue
+           var numVPart = 4
+           var numEPart = 4
+           var isDynamic = false
+
+           options.foreach{
+             case ("numIter", v) => numIter = v.toInt
+             case ("dynamic", v) => isDynamic = v.toBoolean
+             case ("numEPart", v) => numEPart = v.toInt
+             case ("numVPart", v) => numVPart = v.toInt
+             case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
+           }
+
+           if(!isDynamic && numIter == Int.MaxValue) {
+             println("Set number of iterations!")
+             sys.exit(1)
+           }
+           println("======================================")
+           println("|      Connected Components          |")
+           println("--------------------------------------")
+           println(" Using parameters:")
+           println(" \tDynamic:  " + isDynamic)
+           println(" \tNumIter:  " + numIter)
+           println("======================================")
+
+           val sc = new SparkContext(host, "ConnectedComponents(" + fname + ")")
+           //val graph = GraphLoader.textFile(sc, fname, a => 1.0F)
+           val graph = GraphLoader.textFile(sc, fname, a => 1.0F, 
+            minEdgePartitions = numEPart, minVertexPartitions = numVPart).cache()
+           val cc = Analytics.connectedComponents(graph, numIter)
+           //val cc = if(isDynamic) Analytics.dynamicConnectedComponents(graph, numIter)
+           //         else Analytics.connectedComponents(graph, numIter)
+           println("Components: " + cc.vertices.map{ case (vid,data) => data}.distinct())
+
+           sc.stop()
+         }
+//
+//        case "shortestpath" => {
+//
+//           var numIter = Int.MaxValue
+//           var isDynamic = true
+//           var sources: List[Int] = List.empty
+//
+//           options.foreach{
+//             case ("numIter", v) => numIter = v.toInt
+//             case ("dynamic", v) => isDynamic = v.toBoolean
+//             case ("source", v) => sources ++= List(v.toInt)
+//             case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
+//           }
+//
+//
+//           if(!isDynamic && numIter == Int.MaxValue) {
+//             println("Set number of iterations!")
+//             sys.exit(1)
+//           }
+//
+//           if(sources.isEmpty) {
+//             println("No sources provided!")
+//             sys.exit(1)
+//           }
+//
+//           println("======================================")
+//           println("|          Shortest Path             |")
+//           println("--------------------------------------")
+//           println(" Using parameters:")
+//           println(" \tDynamic:  " + isDynamic)
+//           println(" \tNumIter:  " + numIter)
+//           println(" \tSources:  [" + sources.mkString(", ") + "]")
+//           println("======================================")
+//
+//           val sc = new SparkContext(host, "ShortestPath(" + fname + ")")
+//           val graph = GraphLoader.textFile(sc, fname, a => (if(a.isEmpty) 1.0F else a(0).toFloat ) )
+//           //val sp = Analytics.shortestPath(graph, sources, numIter)
+//           // val cc = if(isDynamic) Analytics.dynamicShortestPath(graph, sources, numIter)
+//           //   else  Analytics.shortestPath(graph, sources, numIter)
+//           println("Longest Path: " + sp.vertices.map(_.data).reduce(math.max(_,_)))
+//
+//           sc.stop()
+//         }
+
+
+      //  case "als" => {
+
+      //    var numIter = 5
+      //    var lambda = 0.01
+      //    var latentK = 10
+      //    var usersFname = "usersFactors.tsv"
+      //    var moviesFname = "moviesFname.tsv"
+      //    var numVPart = 4
+      //    var numEPart = 4
+
+      //    options.foreach{
+      //      case ("numIter", v) => numIter = v.toInt
+      //      case ("lambda", v) => lambda = v.toDouble
+      //      case ("latentK", v) => latentK = v.toInt
+      //      case ("usersFname", v) => usersFname = v
+      //      case ("moviesFname", v) => moviesFname = v
+      //      case ("numVPart", v) => numVPart = v.toInt
+      //      case ("numEPart", v) => numEPart = v.toInt
+      //      case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
+      //    }
+
+      //    println("======================================")
+      //    println("|       Alternating Least Squares    |")
+      //    println("--------------------------------------")
+      //    println(" Using parameters:")
+      //    println(" \tNumIter:     " + numIter)
+      //    println(" \tLambda:      " + lambda)
+      //    println(" \tLatentK:     " + latentK)
+      //    println(" \tusersFname:  " + usersFname)
+      //    println(" \tmoviesFname: " + moviesFname)
+      //    println("======================================")
+
+      //    val sc = new SparkContext(host, "ALS(" + fname + ")")
+      //    val graph = GraphLoader.textFile(sc, fname, a => a(0).toDouble )
+      //    graph.numVPart = numVPart
+      //    graph.numEPart = numEPart
+
+      //    val maxUser = graph.edges.map(_._1).reduce(math.max(_,_))
+      //    val minMovie = graph.edges.map(_._2).reduce(math.min(_,_))
+      //    assert(maxUser < minMovie)
+
+      //    val factors = Analytics.alternatingLeastSquares(graph, latentK, lambda, numIter).cache
+      //    factors.filter(_._1 <= maxUser).map(r => r._1 + "\t" + r._2.mkString("\t"))
+      //      .saveAsTextFile(usersFname)
+      //    factors.filter(_._1 >= minMovie).map(r => r._1 + "\t" + r._2.mkString("\t"))
+      //      .saveAsTextFile(moviesFname)
+
+      //    sc.stop()
+      //  }
+
+
+       case _ => {
+         println("Invalid task type.")
+       }
+     }
+   }
 
   // /**
   //  * Compute the PageRank of a graph returning the pagerank of each vertex as an RDD
@@ -637,7 +503,7 @@ object Analytics extends Logging {
 
   //       val sc = new SparkContext(host, "PageRank(" + fname + ")")
 
-  //       val graph = Graph.textFile(sc, fname, a => 1.0).withPartitioner(numVPart, numEPart).cache()
+  //       val graph = GraphLoader.textFile(sc, fname, a => 1.0).withPartitioner(numVPart, numEPart).cache()
 
   //       val startTime = System.currentTimeMillis
   //       logInfo("GRAPHX: starting tasks")
@@ -680,7 +546,7 @@ object Analytics extends Logging {
   //       println("======================================")
 
   //       val sc = new SparkContext(host, "ConnectedComponents(" + fname + ")")
-  //       val graph = Graph.textFile(sc, fname, a => 1.0)
+  //       val graph = GraphLoader.textFile(sc, fname, a => 1.0)
   //       val cc = Analytics.connectedComponents(graph, numIter)
   //       // val cc = if(isDynamic) Analytics.dynamicConnectedComponents(graph, numIter)
   //       //   else  Analytics.connectedComponents(graph, numIter)
@@ -723,7 +589,7 @@ object Analytics extends Logging {
   //       println("======================================")
 
   //       val sc = new SparkContext(host, "ShortestPath(" + fname + ")")
-  //       val graph = Graph.textFile(sc, fname, a => (if(a.isEmpty) 1.0 else a(0).toDouble ) )
+  //       val graph = GraphLoader.textFile(sc, fname, a => (if(a.isEmpty) 1.0 else a(0).toDouble ) )
   //       val sp = Analytics.shortestPath(graph, sources, numIter)
   //       // val cc = if(isDynamic) Analytics.dynamicShortestPath(graph, sources, numIter)
   //       //   else  Analytics.shortestPath(graph, sources, numIter)
@@ -766,7 +632,7 @@ object Analytics extends Logging {
   //    println("======================================")
 
   //    val sc = new SparkContext(host, "ALS(" + fname + ")")
-  //    val graph = Graph.textFile(sc, fname, a => a(0).toDouble )
+  //    val graph = GraphLoader.textFile(sc, fname, a => a(0).toDouble )
   //    graph.numVPart = numVPart
   //    graph.numEPart = numEPart
 
diff --git a/graph/src/main/scala/org/apache/spark/graph/Edge.scala b/graph/src/main/scala/org/apache/spark/graph/Edge.scala
index 20539b8af05b88b992d0589a1a0cb031342ffdea..67b64540177fc638f2076a1c52aebf7fc23444fa 100644
--- a/graph/src/main/scala/org/apache/spark/graph/Edge.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/Edge.scala
@@ -8,6 +8,27 @@ package org.apache.spark.graph
  * @tparam ED type of the edge attribute
  */
 case class Edge[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED] (
-  var src: Vid = 0,
-  var dst: Vid = 0,
-  var data: ED = nullValue[ED])
+  var srcId: Vid = 0,
+  var dstId: Vid = 0,
+  var attr: ED = nullValue[ED]) {
+
+  /**
+   * Given one vertex in the edge return the other vertex.
+   *
+   * @param vid the id of one of the two vertices on the edge.
+   * @return the id of the other vertex on the edge.
+   */
+  def otherVertexId(vid: Vid): Vid =
+    if (srcId == vid) dstId else { assert(dstId == vid); srcId }
+
+
+  /**
+   * Return the relative direction of the edge to the corresponding vertex.
+   *
+   * @param vid the id of one of the two vertices in the edge.
+   * @return the relative direction of the edge to the corresponding vertex.
+   */
+  def relativeDirection(vid: Vid): EdgeDirection =
+    if (vid == srcId) EdgeDirection.Out else { assert(vid == dstId); EdgeDirection.In }
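+
+  // Minimal usage sketch (hypothetical values; assumes Vid is a Long id as used
+  // throughout this package):
+  //   val e = Edge(1L, 2L, "follows")
+  //   e.otherVertexId(1L)      // 2L
+  //   e.relativeDirection(2L)  // EdgeDirection.In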
+
+}
diff --git a/graph/src/main/scala/org/apache/spark/graph/EdgeTriplet.scala b/graph/src/main/scala/org/apache/spark/graph/EdgeTriplet.scala
index 4ade1d7333d81edd5de0f3cbd6d811b378935ca4..ef3aa199bdf419a59c6c94405cd946a86b950136 100644
--- a/graph/src/main/scala/org/apache/spark/graph/EdgeTriplet.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/EdgeTriplet.scala
@@ -5,49 +5,52 @@ package org.apache.spark.graph
  *
  * @tparam VD the type of the vertex attribute.
  * @tparam ED the type of the edge attribute
+ * 
+ * @todo specialize edge triplet for basic types, though when I last tried
+ * specializing I got a warning about inheriting from a type that is not
+ * a trait.
  */
-class EdgeTriplet[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) VD,
-                  @specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED] {
+class EdgeTriplet[VD, ED] extends Edge[ED] {
+// class EdgeTriplet[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) VD: ClassManifest,
+//                   @specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED: ClassManifest] extends Edge[ED] {
+ 
+
   /**
-   * The vertex (id and attribute) corresponding to the source vertex.
+   * The source vertex attribute
    */
-  var src: Vertex[VD] = _
+  var srcAttr: VD = _ // nullValue[VD]
 
   /**
-   * The vertex (id and attribute) corresponding to the target vertex.
+   * The destination vertex attribute
    */
-  var dst: Vertex[VD] = _
+  var dstAttr: VD = _ // nullValue[VD]
 
   /**
-   * The attribute associated with the edge.
+   * Set the edge properties of this triplet.  
    */
-  var data: ED = _
+  protected[spark] def set(other: Edge[ED]): EdgeTriplet[VD,ED] = {
+    srcId = other.srcId
+    dstId = other.dstId
+    attr = other.attr
+    this
+  }
 
   /**
    * Given one vertex in the edge return the other vertex.
    *
    * @param vid the id one of the two vertices on the edge.
-   * @return the other vertex on the edge.
+   * @return the attribute for the other vertex on the edge.
    */
-  def otherVertex(vid: Vid): Vertex[VD] =
-    if (src.id == vid) dst else { assert(dst.id == vid); src }
+  def otherVertexAttr(vid: Vid): VD =
+    if (srcId == vid) dstAttr else { assert(dstId == vid); srcAttr }
 
   /**
    * Get the vertex object for the given vertex in the edge.
    *
    * @param vid the id of one of the two vertices on the edge
-   * @return the vertex object with that id.
-   */
-  def vertex(vid: Vid): Vertex[VD] =
-    if (src.id == vid) src else { assert(dst.id == vid); dst }
-
-  /**
-   * Return the relative direction of the edge to the corresponding vertex.
-   *
-   * @param vid the id of one of the two vertices in the edge.
-   * @return the relative direction of the edge to the corresponding vertex.
+   * @return the attribute of the vertex with that id.
    */
-  def relativeDirection(vid: Vid): EdgeDirection =
-    if (vid == src.id) EdgeDirection.Out else { assert(vid == dst.id); EdgeDirection.In }
+  def vertexAttr(vid: Vid): VD =
+    if (srcId == vid) srcAttr else { assert(dstId == vid); dstAttr }
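+
+  // Sketch: for a triplet t on edge (u, v), t.vertexAttr(u) == t.srcAttr and
+  // t.otherVertexAttr(u) == t.dstAttr.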
 
 }
diff --git a/graph/src/main/scala/org/apache/spark/graph/Graph.scala b/graph/src/main/scala/org/apache/spark/graph/Graph.scala
index 09a1af63a6713f3b6cc94553c59e03ae770ed39a..342151173a63674a3ae294ac9768f0e1bc3d207d 100644
--- a/graph/src/main/scala/org/apache/spark/graph/Graph.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/Graph.scala
@@ -2,6 +2,7 @@ package org.apache.spark.graph
 
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.util.ClosureCleaner
 
 
 
@@ -21,13 +22,13 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
   /**
    * Get the vertices and their data.
    *
+   * @note vertex ids are unique. 
    * @return An RDD containing the vertices in this graph
    *
    * @see Vertex for the vertex type.
    *
-   * @todo should vertices return tuples instead of vertex objects?
    */
-  def vertices: RDD[Vertex[VD]]
+  val vertices: RDD[(Vid,VD)]
 
   /**
    * Get the Edges and their data as an RDD.  The entries in the RDD contain
@@ -42,7 +43,7 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
    * @todo Should edges return 3 tuples instead of Edge objects?  In this case
    * we could rename EdgeTriplet to Edge?
    */
-  def edges: RDD[Edge[ED]]
+  val edges: RDD[Edge[ED]]
 
   /**
    * Get the edges with the vertex data associated with the adjacent pair of
@@ -62,7 +63,7 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
    * @see edges() If only the edge data and adjacent vertex ids are required.
    *
    */
-  def triplets: RDD[EdgeTriplet[VD, ED]]
+  val triplets: RDD[EdgeTriplet[VD, ED]]
 
   /**
    * Return a graph that is cached when first created. This is used to pin a
@@ -73,6 +74,14 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
    */
   def cache(): Graph[VD, ED]
 
+
+  /**
+   * Compute statistics describing the graph representation.
+   */
+  def statistics: Map[String, Any]
+
+
+
   /**
    * Construct a new graph where each vertex value has been transformed by the
    * map function.
@@ -91,17 +100,17 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
    * val rawGraph: Graph[(), ()] = Graph.textFile("hdfs://file")
    * val root = 42
    * var bfsGraph = rawGraph
-   *   .mapVertices[Int](v => if(v.id == 0) 0 else Math.MaxValue)
+   *   .mapVertices[Int]((vid, data) => if(vid == root) 0 else Math.MaxValue)
    * }}}
    *
    */
-  def mapVertices[VD2: ClassManifest](map: Vertex[VD] => VD2): Graph[VD2, ED]
+  def mapVertices[VD2: ClassManifest](map: (Vid, VD) => VD2): Graph[VD2, ED]
 
   /**
    * Construct a new graph where each the value of each edge is transformed by
    * the map operation.  This function is not passed the vertex value for the
    * vertices adjacent to the edge.  If vertex values are desired use the
-   * mapEdgesWithVertices function.
+   * mapTriplets function.
    *
    * @note This graph is not changed and that the new graph has the same
    * structure.  As a consequence the underlying index structures can be
@@ -134,7 +143,7 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
    * on the attributes associated with each vertex.
    * {{{
    * val rawGraph: Graph[Int, Int] = someLoadFunction()
-   * val graph = rawGraph.mapEdgesWithVertices[Int]( edge =>
+   * val graph = rawGraph.mapTriplets[Int]( edge =>
    *   edge.src.data - edge.dst.data)
    * }}}
    *
@@ -143,14 +152,6 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
     map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2]
 
 
-  /**
-   * Remove edges conntecting vertices that are not in the graph.
-   *
-   * @todo remove this function and ensure that for a graph G=(V,E):
-   *     if (u,v) in E then u in V and v in V 
-   */
-  def correctEdges(): Graph[VD, ED]
-
   /**
    * Construct a new graph with all the edges reversed.  If this graph contains
    * an edge from a to b then the returned graph contains an edge from b to a.
@@ -177,117 +178,93 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
    * @return the subgraph containing only the vertices and edges that satisfy the
    * predicates. 
    */
-  def subgraph(epred: EdgeTriplet[VD,ED] => Boolean = (_ => true), 
-    vpred: Vertex[VD] => Boolean = (_ => true) ): Graph[VD, ED]
-
+  def subgraph(epred: EdgeTriplet[VD,ED] => Boolean = (x => true), 
+    vpred: (Vid, VD) => Boolean = ((v,d) => true) ): Graph[VD, ED]
 
-  // /**
-  //  * Combine the attrributes of edges connecting the same vertices.   
-  //  *
-  //  * @todo Do we want to support this function
-  //  */
-  // def combineEdges(reduce: (ED, ED) => ED): Graph[VD, ED]
 
 
   /**
-   * This function is used to compute a statistic for the neighborhood of each
-   * vertex.
+   * groupEdgeTriplets is used to merge multiple edges that have the
+   * same source and destination vertex into a single edge. The user-supplied
+   * function is applied to each directed pair of vertices (u, v) and has
+   * access to all EdgeTriplets
    *
-   * This is one of the core functions in the Graph API in that enables
-   * neighborhood level computation.  For example this function can be used to
-   * count neighbors satisfying a predicate or implement PageRank.
+   * {e: for all e in E where e.srcId = u and e.dstId = v}
    *
-   * @note The returned RDD may contain fewer entries than their are vertices
-   * in the graph.  This is because some vertices may not have neighbors or the
-   * map function may return None for all neighbors.
-   *
-   * @param mapFunc the function applied to each edge adjacent to each vertex.
-   * The mapFunc can optionally return None in which case it does not
-   * contribute to the final sum.
-   * @param mergeFunc the function used to merge the results of each map
-   * operation.
-   * @param direction the direction of edges to consider (e.g., In, Out, Both).
-   * @tparam VD2 The returned type of the aggregation operation.
-   *
-   * @return A Spark.RDD containing tuples of vertex identifiers and thee
-   * resulting value.  Note that the returned RDD may contain fewer vertices
-   * than in the original graph since some vertices may not have neighbors or
-   * the map function could return None for all neighbors.
-   *
-   * @example We can use this function to compute the average follower age for
-   * each user
-   * {{{
-   * val graph: Graph[Int,Int] = loadGraph()
-   * val averageFollowerAge: RDD[(Int, Int)] =
-   *   graph.aggregateNeighbors[(Int,Double)](
-   *     (vid, edge) => (edge.otherVertex(vid).data, 1),
-   *     (a, b) => (a._1 + b._1, a._2 + b._2),
-   *     EdgeDirection.In)
-   *     .mapValues{ case (sum,followers) => sum.toDouble / followers}
-   * }}}
+   * This function is identical to [[org.apache.spark.graph.Graph.groupEdges]]
+   * except that it provides the user-supplied function with an iterator over
+   * EdgeTriplets, which contain the vertex data, whereas groupEdges provides
+   * an iterator over Edges, which contain only the vertex IDs.
+   *
+   * @tparam ED2 the type of the resulting edge data after grouping
+   *
+   * @param f the user supplied function to merge multiple EdgeTriplets
+   * into a single ED2 object
+   *
+   * @return Graph[VD,ED2] The resulting graph with a single Edge for each
+   * source, dest vertex pair.
    *
    */
-  def aggregateNeighbors[A: ClassManifest](
-      mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
-      mergeFunc: (A, A) => A,
-      direction: EdgeDirection)
-    : Graph[(VD, Option[A]), ED]
+  def groupEdgeTriplets[ED2: ClassManifest](f: Iterator[EdgeTriplet[VD,ED]] => ED2 ): Graph[VD,ED2]
+
 
   /**
-   * This function is used to compute a statistic for the neighborhood of each
-   * vertex and returns a value for all vertices (including those without
-   * neighbors).
+   * This function merges multiple edges between two vertices into a single
+   * Edge. See [[org.apache.spark.graph.Graph.groupEdgeTriplets]] for more detail.
    *
-   * This is one of the core functions in the Graph API in that enables
-   * neighborhood level computation. For example this function can be used to
-   * count neighbors satisfying a predicate or implement PageRank.
+   * @tparam ED2 the type of the resulting edge data after grouping.
+   *
+   * @param f the user supplied function to merge multiple Edges
+   * into a single ED2 object.
    *
-   * @note Because the a default value is provided all vertices will have a
-   * corresponding entry in the returned RDD.
-   *
-   * @param mapFunc the function applied to each edge adjacent to each vertex.
-   * The mapFunc can optionally return None in which case it does not
-   * contribute to the final sum.
-   * @param reduceFunc the function used to merge the results of each map
-   * operation.
-   * @param default the default value to use for each vertex if it has no
-   * neighbors or the map function repeatedly evaluates to none
-   * @param direction the direction of edges to consider (e.g., In, Out, Both).
-   * @tparam VD2 The returned type of the aggregation operation.
-   *
-   * @return A Spark.RDD containing tuples of vertex identifiers and
-   * their resulting value.  There will be exactly one entry for ever vertex in
-   * the original graph.
-   *
-   * @example We can use this function to compute the average follower age
-   * for each user
+   * @return Graph[VD,ED2] The resulting graph with a single Edge for each
+   * source, dest vertex pair.
+   */
+  def groupEdges[ED2: ClassManifest](f: Iterator[Edge[ED]] => ED2 ): Graph[VD,ED2]
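+
+  // Usage sketch (hypothetical graph with Double edge weights): collapse
+  // parallel edges by summing their weights, e.g.
+  //   val merged = weightedGraph.groupEdges(edges => edges.map(_.attr).sum)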
+
+
+  /**
+   * The mapReduceTriplets function is used to compute statistics about
+   * the neighboring edges and vertices of each vertex.  The user-supplied
+   * `mapFunc` is invoked on each edge of the graph, generating zero or
+   * more "messages" to be "sent" to either vertex of the edge.
+   * The `reduceFunc` is then used to combine the messages
+   * destined for each vertex.
+   *
+   * @tparam A the type of "message" to be sent to each vertex
+   *
+   * @param mapFunc the user-defined map function which returns 0 or
+   * more messages to neighboring vertices.
+   * @param reduceFunc the user-defined reduce function which should be
+   * commutative and associative and is used to combine the output of
+   * the map phase.
+   * 
+   * @example We can use this function to compute the inDegree of each
+   * vertex
    * {{{
-   * val graph: Graph[Int,Int] = loadGraph()
-   * val averageFollowerAge: RDD[(Int, Int)] =
-   *   graph.aggregateNeighbors[(Int,Double)](
-   *     (vid, edge) => (edge.otherVertex(vid).data, 1),
-   *     (a, b) => (a._1 + b._1, a._2 + b._2),
-   *     -1,
-   *     EdgeDirection.In)
-   *     .mapValues{ case (sum,followers) => sum.toDouble / followers}
+   * val rawGraph: Graph[(),()] = Graph.textFile("twittergraph")
+   * val inDeg: RDD[(Vid, Int)] = 
+   *   mapReduceTriplets[Int](et => Array((et.dstId, 1)), _ + _)
    * }}}
    *
-   * @todo Should this return a graph with the new vertex values?
-   *
+   * @note By expressing computation at the edge level we achieve maximum 
+   * parallelism.  This is one of the core functions in the Graph API in that it enables
+   * neighborhood-level computation. For example, this function can be used to
+   * count neighbors satisfying a predicate or implement PageRank.
+   * 
    */
-  def aggregateNeighbors[A: ClassManifest](
-      mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
-      reduceFunc: (A, A) => A,
-      default: A, // Should this be a function or a value?
-      direction: EdgeDirection)
-    : Graph[(VD, Option[A]), ED]
+  def mapReduceTriplets[A: ClassManifest](
+      mapFunc: EdgeTriplet[VD, ED] => Array[(Vid, A)],
+      reduceFunc: (A, A) => A)
+    : RDD[(Vid, A)] 
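+
+  // Analogous sketch: out-degree can be computed by sending a 1 to the source
+  // of every edge instead of the destination:
+  //   val outDeg: RDD[(Vid, Int)] =
+  //     mapReduceTriplets[Int](et => Array((et.srcId, 1)), _ + _)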
 
 
   /**
   * Join the vertices with an RDD and then apply a function from the
-   * vertex and RDD entry to a new vertex value and type.  The input table should
-   * contain at most one entry for each vertex.  If no entry is provided the
-   * map function is invoked passing none.
+   * vertex and RDD entry to a new vertex value and type.  
+   * The input table should contain at most one entry for each vertex.  
+   * If no entry is provided, the map function is invoked with `None`.
    *
    * @tparam U the type of entry in the table of updates
    * @tparam VD2 the new vertex value type
@@ -303,60 +280,17 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
    * vertex record
    * {{{
    * val rawGraph: Graph[(),()] = Graph.textFile("webgraph")
-   * val outDeg: RDD[(Int, Int)] = rawGraph.outDegrees()
-   * val graph = rawGraph.leftJoinVertices[Int,Int](outDeg,
-   *   (v, deg) => deg.getOrElse(0) )
+   * val outDeg: RDD[(Vid, Int)] = rawGraph.outDegrees()
+   * val graph = rawGraph.outerJoinVertices(outDeg) { 
+   *   (vid, data, optDeg) => optDeg.getOrElse(0)
+   * }
    * }}}
    *
-   * @todo Should this function be curried to enable type inference?  For
-   * example
-   * {{{
-   * graph.leftJoinVertices(tbl)( (v, row) => row.getOrElse(0) )
-   * }}}
-   * @todo Is leftJoinVertices the right name?
    */
-  def leftJoinVertices[U: ClassManifest, VD2: ClassManifest](
-      table: RDD[(Vid, U)],
-      mapFunc: (Vertex[VD], Option[U]) => VD2)
+  def outerJoinVertices[U: ClassManifest, VD2: ClassManifest](table: RDD[(Vid, U)])
+      (mapFunc: (Vid, VD, Option[U]) => VD2)
     : Graph[VD2, ED]
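+
+  // Note (sketch, standing in for the removed joinVertices below): keep the old
+  // value when the table has no entry for a vertex, e.g.
+  //   graph.outerJoinVertices(updates) { (vid, data, opt) =>
+  //     opt.map(u => updateFunc(vid, data, u)).getOrElse(data)
+  //   }
+  // where `updates` and `updateFunc` are hypothetical.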
 
-  /**
-   * Join the vertices with an RDD and then apply a function from the the
-   * vertex and RDD entry to a new vertex value.  The input table should
-   * contain at most one entry for each vertex.  If no entry is provided the
-   * map function is skipped and the old value is used.
-   *
-   * @tparam U the type of entry in the table of updates
-   * @param table the table to join with the vertices in the graph.  The table
-   * should contain at most one entry for each vertex.
-   * @param mapFunc the function used to compute the new vertex values.  The
-   * map function is invoked only for vertices with a corresponding entry in
-   * the table otherwise the old vertex value is used.
-   *
-   * @note for small tables this function can be much more efficient than
-   * leftJoinVertices
-   *
-   * @example This function is used to update the vertices with new values
-   * based on external data.  For example we could add the out degree to each
-   * vertex record
-   * {{{
-   * val rawGraph: Graph[Int,()] = Graph.textFile("webgraph")
-   *   .mapVertices(v => 0)
-   * val outDeg: RDD[(Int, Int)] = rawGraph.outDegrees()
-   * val graph = rawGraph.leftJoinVertices[Int,Int](outDeg,
-   *   (v, deg) => deg )
-   * }}}
-   *
-   * @todo Should this function be curried to enable type inference?  For
-   * example
-   * {{{
-   * graph.joinVertices(tbl)( (v, row) => row )
-   * }}}
-   */
-  def joinVertices[U: ClassManifest](
-      table: RDD[(Vid, U)],
-      mapFunc: (Vertex[VD], U) => VD)
-    : Graph[VD, ED]
 
   // Save a copy of the GraphOps object so there is always one unique GraphOps object
   // for a given Graph object, and thus the lazy vals in GraphOps would work as intended.
@@ -364,11 +298,24 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
 }
 
 
+/**
+ * The Graph singleton object contains basic routines for creating graphs.
+ */
 object Graph {
 
   import org.apache.spark.graph.impl._
   import org.apache.spark.SparkContext._
 
+  /**
+   * Construct a graph from a list of Edges. 
+   *
+   * @param rawEdges a collection of edges in (src,dst) form.
+   * @param uniqueEdges if true, identical edges are combined into a single edge
+   * whose attribute is the number of duplicates; otherwise duplicate edges are
+   * kept as separate edges, each with attribute 1.
+   *
+   */
   def apply(rawEdges: RDD[(Vid, Vid)], uniqueEdges: Boolean = true): Graph[Int, Int] = {
     // Reduce to unique edges.
     val edges: RDD[Edge[Int]] =
@@ -378,16 +325,17 @@ object Graph {
         rawEdges.map { case (s, t) => Edge(s, t, 1) }
       }
     // Determine unique vertices
-    val vertices: RDD[Vertex[Int]] = edges.flatMap{ case Edge(s, t, cnt) => Array((s, 1), (t, 1)) }
-      .reduceByKey(_ + _)
-      .map{ case (id, deg) => Vertex(id, deg) }
+    /** @todo Should this reduceByKey operation be indexed? */ 
+    val vertices: RDD[(Vid, Int)] = 
+      edges.flatMap{ case Edge(s, t, cnt) => Array((s, 1), (t, 1)) }.reduceByKey(_ + _)
+ 
     // Return graph
-    new GraphImpl(vertices, edges)
+    GraphImpl(vertices, edges)
   }
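+
+  // Usage sketch (hypothetical SparkContext `sc` and Long vertex ids):
+  //   val rawEdges = sc.parallelize(Seq((1L, 2L), (2L, 3L), (1L, 2L)))
+  //   val g = Graph(rawEdges)  // the duplicate (1,2) edge collapses into one
+  //                            // edge with attribute 2 since uniqueEdges = true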
 
   def apply[VD: ClassManifest, ED: ClassManifest](
-      vertices: RDD[Vertex[VD]], edges: RDD[Edge[ED]]): Graph[VD, ED] = {
-    new GraphImpl(vertices, edges)
+      vertices: RDD[(Vid,VD)], edges: RDD[Edge[ED]]): Graph[VD, ED] = {
+    GraphImpl(vertices, edges)
   }
 
   implicit def graphToGraphOps[VD: ClassManifest, ED: ClassManifest](g: Graph[VD, ED]) = g.ops
diff --git a/graph/src/main/scala/org/apache/spark/graph/GraphKryoRegistrator.scala b/graph/src/main/scala/org/apache/spark/graph/GraphKryoRegistrator.scala
index 13a22f9051e0d46e7fff006cec8fd6eb9f3ed86b..29ea38ec67fdfed29c313733dab2ab597183124e 100644
--- a/graph/src/main/scala/org/apache/spark/graph/GraphKryoRegistrator.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/GraphKryoRegistrator.scala
@@ -2,23 +2,20 @@ package org.apache.spark.graph
 
 import com.esotericsoftware.kryo.Kryo
 
+import org.apache.spark.graph.impl.MessageToPartition
 import org.apache.spark.serializer.KryoRegistrator
-
+import org.apache.spark.graph.impl._
 
 class GraphKryoRegistrator extends KryoRegistrator {
 
   def registerClasses(kryo: Kryo) {
-    //kryo.register(classOf[(Int, Float, Float)])
-    registerClass[Int, Int, Int](kryo)
+    kryo.register(classOf[Edge[Object]])
+    kryo.register(classOf[MutableTuple2[Object, Object]])
+    kryo.register(classOf[MessageToPartition[Object]])
+    kryo.register(classOf[(Vid, Object)])
+    kryo.register(classOf[EdgePartition[Object]])
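+    // Registering these internal types gives them compact integer ids so Kryo
+    // does not have to write their fully-qualified class names with every instance.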
 
     // This avoids a large number of hash table lookups.
     kryo.setReferences(false)
   }
-
-  private def registerClass[VD: Manifest, ED: Manifest, VD2: Manifest](kryo: Kryo) {
-    kryo.register(classOf[Vertex[VD]])
-    kryo.register(classOf[Edge[ED]])
-    kryo.register(classOf[MutableTuple2[VD, VD2]])
-    kryo.register(classOf[(Vid, VD2)])
-  }
 }
diff --git a/graph/src/main/scala/org/apache/spark/graph/GraphLab.scala b/graph/src/main/scala/org/apache/spark/graph/GraphLab.scala
index 01f24a13024c7344fc1050baf84a6ba815d5c48f..2f2a624592de12e41fdc2fd4b82b91b8ec87f3fb 100644
--- a/graph/src/main/scala/org/apache/spark/graph/GraphLab.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/GraphLab.scala
@@ -4,7 +4,7 @@ import scala.collection.JavaConversions._
 import org.apache.spark.rdd.RDD
 
 /**
- * This object implement the graphlab gather-apply-scatter api.
+ * This object implements the GraphLab gather-apply-scatter API.
  */
 object GraphLab {
 
@@ -36,7 +36,7 @@ object GraphLab {
   def iterate[VD: ClassManifest, ED: ClassManifest, A: ClassManifest](graph: Graph[VD, ED])(
     gatherFunc: (Vid, EdgeTriplet[VD, ED]) => A,
     mergeFunc: (A, A) => A,
-    applyFunc: (Vertex[VD], Option[A]) => VD,
+    applyFunc: (Vid, VD, Option[A]) => VD,
     scatterFunc: (Vid, EdgeTriplet[VD, ED]) => Boolean,
     numIter: Int = Integer.MAX_VALUE,
     gatherDirection: EdgeDirection = EdgeDirection.In,
@@ -45,18 +45,18 @@ object GraphLab {
 
     // Add an active attribute to all vertices to track convergence.
     var activeGraph: Graph[(Boolean, VD), ED] = graph.mapVertices {
-      case Vertex(id, data) => (true, data)
+      case (id, data) => (true, data)
     }.cache()
 
     // The gather function wrapper strips the active attribute and
     // only invokes the gather function on active vertices
     def gather(vid: Vid, e: EdgeTriplet[(Boolean, VD), ED]): Option[A] = {
-      if (e.vertex(vid).data._1) {
-        val edge = new EdgeTriplet[VD,ED]
-        edge.src = Vertex(e.src.id, e.src.data._2)
-        edge.dst = Vertex(e.dst.id, e.dst.data._2)
-        edge.data = e.data
-        Some(gatherFunc(vid, edge))
+      if (e.vertexAttr(vid)._1) {
+        val edgeTriplet = new EdgeTriplet[VD,ED]
+        edgeTriplet.set(e)
+        edgeTriplet.srcAttr = e.srcAttr._2
+        edgeTriplet.dstAttr = e.dstAttr._2
+        Some(gatherFunc(vid, edgeTriplet))
       } else {
         None
       }
@@ -64,34 +64,31 @@ object GraphLab {
 
     // The apply function wrapper strips the vertex of the active attribute
     // and only invokes the apply function on active vertices
-    def apply(v: Vertex[((Boolean, VD), Option[A])]): (Boolean, VD) = {
-      val ((active, vData), accum) = v.data
-      if (active) (true, applyFunc(Vertex(v.id, vData), accum))
+    def apply(vid: Vid, data: (Boolean, VD), accum: Option[A]): (Boolean, VD) = {
+      val (active, vData) = data
+      if (active) (true, applyFunc(vid, vData, accum))
       else (false, vData)
     }
 
     // The scatter function wrapper strips the vertex of the active attribute
     // and only invokes the scatter function on active vertices
     def scatter(rawVid: Vid, e: EdgeTriplet[(Boolean, VD), ED]): Option[Boolean] = {
-      val vid = e.otherVertex(rawVid).id
-      if (e.vertex(vid).data._1) {
-        val edge = new EdgeTriplet[VD,ED]
-        edge.src = Vertex(e.src.id, e.src.data._2)
-        edge.dst = Vertex(e.dst.id, e.dst.data._2)
-        edge.data = e.data
-//        val src = Vertex(e.src.id, e.src.data._2)
-//        val dst = Vertex(e.dst.id, e.dst.data._2)
-//        val edge = new EdgeTriplet[VD,ED](src, dst, e.data)
-        Some(scatterFunc(vid, edge))
+      val vid = e.otherVertexId(rawVid)
+      if (e.vertexAttr(vid)._1) {
+        val edgeTriplet = new EdgeTriplet[VD,ED]
+        edgeTriplet.set(e)
+        edgeTriplet.srcAttr = e.srcAttr._2
+        edgeTriplet.dstAttr = e.dstAttr._2
+        Some(scatterFunc(vid, edgeTriplet))
       } else {
         None
       }
     }
 
     // Used to set the active status of vertices for the next round
-    def applyActive(v: Vertex[((Boolean, VD), Option[Boolean])]): (Boolean, VD) = {
-      val ((prevActive, vData), newActive) = v.data
-      (newActive.getOrElse(false), vData)
+    def applyActive(vid: Vid, data: (Boolean, VD), newActive: Boolean): (Boolean, VD) = {
+      val (prevActive, vData) = data
+      (newActive, vData)
     }
 
     // Main Loop ---------------------------------------------------------------------
@@ -99,29 +96,32 @@ object GraphLab {
     var numActive = activeGraph.numVertices
     while (i < numIter && numActive > 0) {
 
-      val gathered: Graph[((Boolean, VD), Option[A]), ED] =
+      // Gather
+      val gathered: RDD[(Vid, A)] =
         activeGraph.aggregateNeighbors(gather, mergeFunc, gatherDirection)
 
-      val applied: Graph[(Boolean, VD), ED] = gathered.mapVertices(apply).cache()
+      // Apply 
+      activeGraph = activeGraph.outerJoinVertices(gathered)(apply).cache()
 
-      activeGraph = applied.cache()
+      
 
       // Scatter is basically a gather in the opposite direction so we reverse the edge direction
       // activeGraph: Graph[(Boolean, VD), ED]
-      val scattered: Graph[((Boolean, VD), Option[Boolean]), ED] =
+      val scattered: RDD[(Vid, Boolean)] = 
         activeGraph.aggregateNeighbors(scatter, _ || _, scatterDirection.reverse)
-      val newActiveGraph: Graph[(Boolean, VD), ED] =
-        scattered.mapVertices(applyActive)
 
-      activeGraph = newActiveGraph.cache()
+      activeGraph = activeGraph.joinVertices(scattered)(applyActive).cache()
 
-      numActive = activeGraph.vertices.map(v => if (v.data._1) 1 else 0).reduce(_ + _)
+      // Calculate the number of active vertices
+      numActive = activeGraph.vertices.map{ 
+        case (vid, data) => if (data._1) 1 else 0
+        }.reduce(_ + _)
       println("Number active vertices: " + numActive)
       i += 1
     }
 
     // Remove the active attribute from the vertex data before returning the graph
-    activeGraph.mapVertices(v => v.data._2)
+    activeGraph.mapVertices{case (vid, data) => data._2 }
   }
 }
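To make the wrapper mechanics above concrete, here is a minimal, hedged sketch of a PageRank-style computation written against the new `(Vid, VD, Option[A])` apply signature; the input `graph: Graph[Double, Double]` (rank as the vertex attribute, weight as the edge attribute) is assumed to exist:
{{{
val ranks = GraphLab.iterate(graph)(
  (vid, et) => et.srcAttr * et.attr,                  // gather: weighted rank from the source
  (a: Double, b: Double) => a + b,                    // merge: sum the contributions
  (vid, rank, acc: Option[Double]) =>
    0.15 + 0.85 * acc.getOrElse(0.0),                 // apply: PageRank-style update
  (vid, et) => true,                                  // scatter: keep every neighbor active
  numIter = 10)
}}}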
 
diff --git a/graph/src/main/scala/org/apache/spark/graph/GraphLoader.scala b/graph/src/main/scala/org/apache/spark/graph/GraphLoader.scala
index 4d7ca1268d36ef1a08ccdd0afd6797170b096693..76f69edf0e22fb60a75ea79924b4086de195dda2 100644
--- a/graph/src/main/scala/org/apache/spark/graph/GraphLoader.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/GraphLoader.scala
@@ -9,7 +9,18 @@ import org.apache.spark.graph.impl.GraphImpl
 object GraphLoader {
 
   /**
-   * Load an edge list from file initializing the Graph RDD
+   * Load an edge list from a file, initializing the Graph
+   *
+   * @tparam ED the type of the edge data of the resulting Graph
+   *
+   * @param sc the SparkContext used to construct RDDs
+   * @param path the path to the text file containing the edge list
+   * @param edgeParser a function that takes an array of strings and
+   * returns an ED object
+   * @param minEdgePartitions the number of partitions for the
+   * Edge RDD
+   *
+   * @todo remove minVertexPartitions arg
    */
   def textFile[ED: ClassManifest](
       sc: SparkContext,
@@ -20,7 +31,7 @@ object GraphLoader {
     : GraphImpl[Int, ED] = {
 
     // Parse the edge data table
-    val edges = sc.textFile(path).flatMap { line =>
+    val edges = sc.textFile(path, minEdgePartitions).flatMap { line =>
       if (!line.isEmpty && line(0) != '#') {
         val lineArray = line.split("\\s+")
         if(lineArray.length < 2) {
@@ -38,17 +49,13 @@ object GraphLoader {
     }.cache()
 
     val graph = fromEdges(edges)
-    // println("Loaded graph:" +
-    //   "\n\t#edges:    " + graph.numEdges +
-    //   "\n\t#vertices: " + graph.numVertices)
-
     graph
   }
 
-  def fromEdges[ED: ClassManifest](edges: RDD[Edge[ED]]): GraphImpl[Int, ED] = {
-    val vertices = edges.flatMap { edge => List((edge.src, 1), (edge.dst, 1)) }
+  private def fromEdges[ED: ClassManifest](edges: RDD[Edge[ED]]): GraphImpl[Int, ED] = {
+    val vertices = edges.flatMap { edge => List((edge.srcId, 1), (edge.dstId, 1)) }
       .reduceByKey(_ + _)
-      .map{ case (vid, degree) => Vertex(vid, degree) }
-    new GraphImpl[Int, ED](vertices, edges)
+      .map{ case (vid, degree) => (vid, degree) }
+    GraphImpl(vertices, edges)
   }
 }
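A small, hedged usage sketch of the loader above; the path is hypothetical and the `edgeParser` / `minEdgePartitions` parameter names are taken from the Scaladoc:
{{{
val graph = GraphLoader.textFile(sc, "hdfs:///data/edges.tsv",
  edgeParser = (lineArray: Array[String]) => 1,   // every edge gets weight 1
  minEdgePartitions = 128)
}}}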
diff --git a/graph/src/main/scala/org/apache/spark/graph/GraphOps.scala b/graph/src/main/scala/org/apache/spark/graph/GraphOps.scala
index 9e8cc0a6d52a7ec7d9ef37b5d289463977b37b38..5e8f082fdad8aef9b9661f5d1f5ead42f1d1e0a2 100644
--- a/graph/src/main/scala/org/apache/spark/graph/GraphOps.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/GraphOps.scala
@@ -1,13 +1,17 @@
 package org.apache.spark.graph
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.SparkContext._
+import org.apache.spark.util.ClosureCleaner
 
 
-class GraphOps[VD: ClassManifest, ED: ClassManifest](g: Graph[VD, ED]) {
+class GraphOps[VD: ClassManifest, ED: ClassManifest](graph: Graph[VD, ED]) {
 
-  lazy val numEdges: Long = g.edges.count()
 
-  lazy val numVertices: Long = g.vertices.count()
+
+  lazy val numEdges: Long = graph.edges.count()
+
+  lazy val numVertices: Long = graph.vertices.count()
 
   lazy val inDegrees: RDD[(Vid, Int)] = degreesRDD(EdgeDirection.In)
 
@@ -15,23 +19,143 @@ class GraphOps[VD: ClassManifest, ED: ClassManifest](g: Graph[VD, ED]) {
 
   lazy val degrees: RDD[(Vid, Int)] = degreesRDD(EdgeDirection.Both)
 
+
+  /**
+   * This function computes a statistic for the neighborhood of each
+   * vertex.
+   *
+   * @note Vertices for which the map function never produces a value (for
+   * example, vertices with no neighbors in the chosen direction) do not
+   * appear in the returned RDD.
+   *
+   * @param mapFunc the function applied to each edge adjacent to each vertex.
+   * The mapFunc can optionally return None in which case it does not
+   * contribute to the final sum.
+   * @param reduceFunc the function used to merge the results of each map
+   * operation.
+   * @param dir the direction of edges to consider (e.g., In, Out, Both).
+   * @tparam A the type of the value being aggregated.
+   *
+   * @return an RDD containing tuples of vertex identifiers and their
+   * resulting value.  Only vertices that receive at least one contribution
+   * appear in the result.
+   *
+   * @example We can use this function to compute the average follower age
+   * for each user
+   * {{{
+   * val graph: Graph[Int,Int] = loadGraph()
+   * val averageFollowerAge: RDD[(Vid, Double)] =
+   *   graph.aggregateNeighbors[(Int, Int)](
+   *     (vid, edge) => Some((edge.srcAttr, 1)),
+   *     (a, b) => (a._1 + b._1, a._2 + b._2),
+   *     EdgeDirection.In)
+   *     .mapValues{ case (sum, followers) => sum.toDouble / followers }
+   * }}}
+   *
+   * @todo Should this return a graph with the new vertex values?
+   *
+   */
+  def aggregateNeighbors[A: ClassManifest](
+      mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
+      reduceFunc: (A, A) => A,
+      dir: EdgeDirection)
+    : RDD[(Vid, A)] = {
+
+    ClosureCleaner.clean(mapFunc)
+    ClosureCleaner.clean(reduceFunc)
+
+    // Define a new map function over edge triplets 
+    val mf = (et: EdgeTriplet[VD,ED]) => {
+      // Compute the message to the dst vertex
+      val dst = 
+        if (dir == EdgeDirection.In || dir == EdgeDirection.Both) {
+          mapFunc(et.dstId, et)
+        } else { Option.empty[A] }
+      // Compute the message to the source vertex
+      val src = 
+        if (dir == EdgeDirection.Out || dir == EdgeDirection.Both) {
+          mapFunc(et.srcId, et)
+        } else { Option.empty[A] }
+      // construct the return array
+      (src, dst) match {
+        case (None, None) => Array.empty[(Vid, A)]
+        case (Some(srcA),None) => Array((et.srcId, srcA))
+        case (None, Some(dstA)) => Array((et.dstId, dstA))
+        case (Some(srcA), Some(dstA)) => 
+          Array((et.srcId, srcA), (et.dstId, dstA))
+      }
+    }
+
+    ClosureCleaner.clean(mf)
+    graph.mapReduceTriplets(mf, reduceFunc)
+  } // end of aggregateNeighbors
+
+
   def collectNeighborIds(edgeDirection: EdgeDirection) : RDD[(Vid, Array[Vid])] = {
-    val graph: Graph[(VD, Option[Array[Vid]]), ED] = g.aggregateNeighbors(
-      (vid, edge) => Some(Array(edge.otherVertex(vid).id)),
+    val nbrs = graph.aggregateNeighbors[Array[Vid]](
+      (vid, edge) => Some(Array(edge.otherVertexId(vid))),
       (a, b) => a ++ b,
       edgeDirection)
-    graph.vertices.map(v => {
-      val (_, neighborIds) = v.data
-      (v.id, neighborIds.getOrElse(Array()))
-    })
+
+    graph.vertices.leftOuterJoin(nbrs).mapValues{
+      case (_, Some(nbrs)) => nbrs
+      case (_, None) => Array.empty[Vid]
+    }
   }
 
+
   private def degreesRDD(edgeDirection: EdgeDirection): RDD[(Vid, Int)] = {
-    val degreeGraph: Graph[(VD, Option[Int]), ED] =
-      g.aggregateNeighbors((vid, edge) => Some(1), _+_, edgeDirection)
-    degreeGraph.vertices.map(v => {
-      val (_, degree) = v.data
-      (v.id, degree.getOrElse(0))
-    })
+    graph.aggregateNeighbors((vid, edge) => Some(1), _+_, edgeDirection)
   }
+
+
+  /**
+   * Join the vertices with an RDD and then apply a function from the
+   * vertex and RDD entry to a new vertex value.  The input table should
+   * contain at most one entry for each vertex.  If no entry is provided the
+   * map function is skipped and the old value is used.
+   *
+   * @tparam U the type of entry in the table of updates
+   * @param table the table to join with the vertices in the graph.  The table
+   * should contain at most one entry for each vertex.
+   * @param mapFunc the function used to compute the new vertex values.  The
+   * map function is invoked only for vertices with a corresponding entry in
+   * the table; otherwise the old vertex value is used.
+   *
+   * @note for small tables this function can be much more efficient than
+   * leftJoinVertices
+   *
+   * @example This function is used to update the vertices with new values
+   * based on external data.  For example, we could add the out-degree to each
+   * vertex record:
+   * {{{
+   * val rawGraph: Graph[Int, Unit] = Graph.textFile("webgraph")
+   *   .mapVertices((vid, data) => 0)
+   * val outDeg: RDD[(Vid, Int)] = rawGraph.outDegrees
+   * val graph = rawGraph.joinVertices(outDeg)(
+   *   (vid, data, deg) => deg)
+   * }}}
+   *
+   * @todo Should this function be curried to enable type inference?  For
+   * example
+   * {{{
+   * graph.joinVertices(tbl)( (v, row) => row )
+   * }}}
+   */
+  def joinVertices[U: ClassManifest](table: RDD[(Vid, U)])(mapFunc: (Vid, VD, U) => VD)
+    : Graph[VD, ED] = {
+    ClosureCleaner.clean(mapFunc)
+    val uf = (id: Vid, data: VD, o: Option[U]) => {
+      o match {
+        case Some(u) => mapFunc(id, data, u)
+        case None => data
+      }
+    }
+    ClosureCleaner.clean(uf)
+    graph.outerJoinVertices(table)(uf)
+  }
+
 }
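A brief sketch tying the neighborhood operators above together (assuming an existing `graph: Graph[VD, ED]` and the `graphToGraphOps` implicit conversion in scope):
{{{
// Degree of every vertex, counting both edge directions.
val deg: RDD[(Vid, Int)] = graph.degrees
// Identifiers of each vertex's out-neighbors.
val nbrs: RDD[(Vid, Array[Vid])] = graph.collectNeighborIds(EdgeDirection.Out)
}}}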
diff --git a/graph/src/main/scala/org/apache/spark/graph/Pregel.scala b/graph/src/main/scala/org/apache/spark/graph/Pregel.scala
index 09bcc67c8ced9c384cc5fe759856f4c8f087ca1c..7ad6fda2a4570e4ca43378ae4db626660578babd 100644
--- a/graph/src/main/scala/org/apache/spark/graph/Pregel.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/Pregel.scala
@@ -3,10 +3,39 @@ package org.apache.spark.graph
 import org.apache.spark.rdd.RDD
 
 
+/**
+ * This object implements the Pregel bulk-synchronous
+ * message-passing API.
+ */
 object Pregel {
 
+
+  /**
+   * Execute the Pregel program.
+   *
+   * @tparam VD the vertex data type
+   * @tparam ED the edge data type
+   * @tparam A the Pregel message type
+   *
+   * @param vprog a user supplied function that acts as the vertex program for
+   *              the Pregel computation. It takes the vertex ID of the vertex it is running on,
+   *              the accompanying data for that vertex, and the incoming data and returns the
+   *              new vertex value.
+   * @param sendMsg a user supplied function that takes the current vertex ID and an EdgeTriplet
+   *                between the vertex and one of its neighbors and produces a message to send
+   *                to that neighbor.
+   * @param mergeMsg a user supplied function that takes two incoming messages of type A and merges
+   *                 them into a single message of type A. ''This function must be commutative and
+   *                 associative.''
+   * @param initialMsg the message each vertex will receive at the beginning of the
+   *                   first iteration.
+   * @param numIter the number of iterations to run this computation for.
+   *
+   * @return the resulting graph at the end of the computation
+   *
+   */
   def iterate[VD: ClassManifest, ED: ClassManifest, A: ClassManifest](graph: Graph[VD, ED])(
-      vprog: (Vertex[VD], A) => VD,
+      vprog: (Vid, VD, A) => VD,
       sendMsg: (Vid, EdgeTriplet[VD, ED]) => Option[A],
       mergeMsg: (A, A) => A,
       initialMsg: A,
@@ -17,27 +46,20 @@ object Pregel {
     //var g = graph.cache()
     var i = 0
 
-    def mapF(vid: Vid, edge: EdgeTriplet[VD,ED]) = sendMsg(edge.otherVertex(vid).id, edge)
-
-    def runProg(vertexWithMsgs: Vertex[(VD, Option[A])]): VD = {
-      val (vData, msg) = vertexWithMsgs.data
-      val v = Vertex(vertexWithMsgs.id, vData)
-      msg match {
-        case Some(m) => vprog(v, m)
-        case None => v.data
-      }
-    }
+    def mapF(vid: Vid, edge: EdgeTriplet[VD,ED]) = sendMsg(edge.otherVertexId(vid), edge)
 
-    var graphWithMsgs: Graph[(VD, Option[A]), ED] =
-      g.mapVertices(v => (v.data, Some(initialMsg)))
+    // Receive the first set of messages
+    g = g.mapVertices( (vid, vdata) => vprog(vid, vdata, initialMsg))
 
     while (i < numIter) {
-      val newGraph: Graph[VD, ED] = graphWithMsgs.mapVertices(runProg).cache()
-      graphWithMsgs = newGraph.aggregateNeighbors(mapF, mergeMsg, EdgeDirection.In)
+      // compute the messages
+      val messages = g.aggregateNeighbors(mapF, mergeMsg, EdgeDirection.In)
+      // receive the messages
+      g = g.joinVertices(messages)(vprog)
+      // count the iteration
       i += 1
     }
-    graphWithMsgs.mapVertices(vertexWithMsgs => vertexWithMsgs.data match {
-      case (vData, _) => vData
-    })
+    // Return the final graph
+    g
   }
 }
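As a concrete, hedged illustration of the loop above, the following sketch propagates the largest vertex value through a graph whose vertex attribute is a Double; the input `graph` is assumed to exist, and messages flow along the edge direction, matching the `EdgeDirection.In` used in the implementation:
{{{
val maxGraph = Pregel.iterate(graph)(
  (vid, attr, msg: Double) => math.max(attr, msg),   // vprog: keep the larger value
  (vid, et) => Some(et.srcAttr),                     // sendMsg: forward the source's value
  (a: Double, b: Double) => math.max(a, b),          // mergeMsg
  Double.MinValue,                                   // initialMsg
  20)                                                // numIter
}}}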
diff --git a/graph/src/main/scala/org/apache/spark/graph/Vertex.scala b/graph/src/main/scala/org/apache/spark/graph/Vertex.scala
deleted file mode 100644
index c8671b7f13024be6e5019b4b6f0d5b7ff9f3ae08..0000000000000000000000000000000000000000
--- a/graph/src/main/scala/org/apache/spark/graph/Vertex.scala
+++ /dev/null
@@ -1,15 +0,0 @@
-package org.apache.spark.graph
-
-/**
- * A graph vertex consists of a vertex id and attribute.
- *
- * @tparam VD the type of the vertex attribute.
- */
-case class Vertex[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) VD] (
-  var id: Vid = 0,
-  var data: VD = nullValue[VD]) {
-
-  def this(tuple: (Vid, VD)) = this(tuple._1, tuple._2)
-
-  def tuple = (id, data)
-}
diff --git a/graph/src/main/scala/org/apache/spark/graph/impl/EdgePartition.scala b/graph/src/main/scala/org/apache/spark/graph/impl/EdgePartition.scala
index 3d218f27b1850febbf1ec5de8004e978d886b1e7..dbfccde8b91d5830daae16e23dff70a3e37ed48e 100644
--- a/graph/src/main/scala/org/apache/spark/graph/impl/EdgePartition.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/impl/EdgePartition.scala
@@ -1,39 +1,49 @@
 package org.apache.spark.graph.impl
 
 import scala.collection.mutable.ArrayBuilder
-
-import it.unimi.dsi.fastutil.ints.IntArrayList
-
 import org.apache.spark.graph._
 
 
 /**
  * A partition of edges in 3 large columnar arrays.
  */
-private[graph]
-class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED: ClassManifest] {
+class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED: ClassManifest](
+  val srcIds: Array[Vid],
+  val dstIds: Array[Vid],
+  val data: Array[ED]
+  ){
 
-  private var _data: Array[ED] = _
-  private var _dataBuilder = ArrayBuilder.make[ED]
+  // private var _data: Array[ED] = _
+  // private var _dataBuilder = ArrayBuilder.make[ED]
 
-  val srcIds = new VertexArrayList
-  val dstIds = new VertexArrayList
+  // var srcIds = new VertexArrayList
+  // var dstIds = new VertexArrayList
 
-  def data: Array[ED] = _data
+  def reverse: EdgePartition[ED] = new EdgePartition(dstIds, srcIds, data)
 
-  /** Add a new edge to the partition. */
-  def add(src: Vid, dst: Vid, d: ED) {
-    srcIds.add(src)
-    dstIds.add(dst)
-    _dataBuilder += d
+  def map[ED2: ClassManifest](f: Edge[ED] => ED2): EdgePartition[ED2] = {
+    val newData = new Array[ED2](data.size)
+    val edge = new Edge[ED]()
+    for(i <- 0 until data.size){
+      edge.srcId  = srcIds(i)
+      edge.dstId  = dstIds(i)
+      edge.attr = data(i)
+      newData(i) = f(edge) 
+    }
+    new EdgePartition(srcIds, dstIds, newData)
   }
 
-  def trim() {
-    srcIds.trim()
-    dstIds.trim()
-    _data = _dataBuilder.result()
+  def foreach(f: Edge[ED] => Unit)  {
+    val edge = new Edge[ED]
+    for(i <- 0 until data.size){
+      edge.srcId  = srcIds(i)
+      edge.dstId  = dstIds(i)
+      edge.attr = data(i)
+      f(edge) 
+    }
   }
 
+
   def size: Int = srcIds.size
 
   def iterator = new Iterator[Edge[ED]] {
@@ -43,11 +53,13 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
     override def hasNext: Boolean = pos < EdgePartition.this.size
 
     override def next(): Edge[ED] = {
-      edge.src = srcIds.get(pos)
-      edge.dst = dstIds.get(pos)
-      edge.data = _data(pos)
+      edge.srcId = srcIds(pos)
+      edge.dstId = dstIds(pos)
+      edge.attr = data(pos)
       pos += 1
       edge
     }
   }
 }
+
+
diff --git a/graph/src/main/scala/org/apache/spark/graph/impl/EdgePartitionBuilder.scala b/graph/src/main/scala/org/apache/spark/graph/impl/EdgePartitionBuilder.scala
new file mode 100644
index 0000000000000000000000000000000000000000..cc3a443fa2bb8f6232a267694f48e55118bc6c0a
--- /dev/null
+++ b/graph/src/main/scala/org/apache/spark/graph/impl/EdgePartitionBuilder.scala
@@ -0,0 +1,31 @@
+package org.apache.spark.graph.impl
+
+import scala.collection.mutable.ArrayBuilder
+import org.apache.spark.graph._
+
+
+//private[graph]
+class EdgePartitionBuilder[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) 
+ED: ClassManifest]{
+  val srcIds = new VertexArrayList
+  val dstIds = new VertexArrayList
+  var dataBuilder = ArrayBuilder.make[ED]
+
+
+  /** Add a new edge to the partition. */
+  def add(src: Vid, dst: Vid, d: ED) {
+    srcIds.add(src)
+    dstIds.add(dst)
+    dataBuilder += d
+  }
+
+  def toEdgePartition: EdgePartition[ED] = {
+    new EdgePartition(srcIds.toLongArray(), dstIds.toLongArray(), dataBuilder.result())
+  }
+  
+
+}
+
+
+
+
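A tiny sketch of how the builder and the columnar EdgePartition above fit together (the vertex ids and attribute values are made up):
{{{
val builder = new EdgePartitionBuilder[Double]
builder.add(1L, 2L, 0.5)
builder.add(1L, 3L, 1.5)
val part: EdgePartition[Double] = builder.toEdgePartition
part.foreach(e => println(e.srcId + " -> " + e.dstId + " : " + e.attr))
val reversed = part.reverse   // swaps the src and dst columns, reusing the data array
}}}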
diff --git a/graph/src/main/scala/org/apache/spark/graph/impl/EdgeTripletRDD.scala b/graph/src/main/scala/org/apache/spark/graph/impl/EdgeTripletRDD.scala
deleted file mode 100644
index 18d5d2b5aae70ecf174b6df54d1891ba4be203b2..0000000000000000000000000000000000000000
--- a/graph/src/main/scala/org/apache/spark/graph/impl/EdgeTripletRDD.scala
+++ /dev/null
@@ -1,87 +0,0 @@
-package org.apache.spark.graph.impl
-
-import org.apache.spark.Aggregator
-import org.apache.spark.Partition
-import org.apache.spark.SparkEnv
-import org.apache.spark.TaskContext
-import org.apache.spark.rdd.RDD
-import org.apache.spark.Dependency
-import org.apache.spark.OneToOneDependency
-import org.apache.spark.ShuffleDependency
-import org.apache.spark.SparkContext._
-import org.apache.spark.graph._
-
-
-private[graph]
-class EdgeTripletPartition(idx: Int, val vPart: Partition, val ePart: Partition)
-  extends Partition {
-  override val index: Int = idx
-  override def hashCode(): Int = idx
-}
-
-
-/**
- * A RDD that brings together edge data with its associated vertex data.
- */
-private[graph]
-class EdgeTripletRDD[VD: ClassManifest, ED: ClassManifest](
-    vTableReplicated: RDD[(Vid, VD)],
-    eTable: RDD[(Pid, EdgePartition[ED])])
-  extends RDD[(VertexHashMap[VD], Iterator[EdgeTriplet[VD, ED]])](eTable.context, Nil) {
-
-  println(vTableReplicated.partitioner.get.numPartitions)
-  println(eTable.partitioner.get.numPartitions)
-
-  assert(vTableReplicated.partitioner == eTable.partitioner)
-
-  override def getDependencies: List[Dependency[_]] = {
-    List(new OneToOneDependency(eTable), new OneToOneDependency(vTableReplicated))
-  }
-
-  override def getPartitions = Array.tabulate[Partition](eTable.partitions.size) {
-    i => new EdgeTripletPartition(i, eTable.partitions(i), vTableReplicated.partitions(i))
-  }
-
-  override val partitioner = eTable.partitioner
-
-  override def getPreferredLocations(s: Partition) =
-    eTable.preferredLocations(s.asInstanceOf[EdgeTripletPartition].ePart)
-
-  override def compute(s: Partition, context: TaskContext)
-    : Iterator[(VertexHashMap[VD], Iterator[EdgeTriplet[VD, ED]])] = {
-
-    val split = s.asInstanceOf[EdgeTripletPartition]
-
-    // Fetch the vertices and put them in a hashmap.
-    // TODO: use primitive hashmaps for primitive VD types.
-    val vmap = new VertexHashMap[VD]//(1000000)
-    vTableReplicated.iterator(split.vPart, context).foreach { v => vmap.put(v._1, v._2) }
-
-    val (pid, edgePartition) = eTable.iterator(split.ePart, context).next()
-      .asInstanceOf[(Pid, EdgePartition[ED])]
-
-    // Return an iterator that looks up the hash map to find matching vertices for each edge.
-    val iter = new Iterator[EdgeTriplet[VD, ED]] {
-      private var pos = 0
-      private val e = new EdgeTriplet[VD, ED]
-      e.src = new Vertex[VD]
-      e.dst = new Vertex[VD]
-
-      override def hasNext: Boolean = pos < edgePartition.size
-      override def next() = {
-        e.src.id = edgePartition.srcIds.getLong(pos)
-        // assert(vmap.containsKey(e.src.id))
-        e.src.data = vmap.get(e.src.id)
-
-        e.dst.id = edgePartition.dstIds.getLong(pos)
-        // assert(vmap.containsKey(e.dst.id))
-        e.dst.data = vmap.get(e.dst.id)
-
-        e.data = edgePartition.data(pos)
-        pos += 1
-        e
-      }
-    }
-    Iterator((vmap, iter))
-  }
-}
diff --git a/graph/src/main/scala/org/apache/spark/graph/impl/GraphImpl.scala b/graph/src/main/scala/org/apache/spark/graph/impl/GraphImpl.scala
index e397293a3d63d623556b471e372f1280d0a18c32..e7a708e895b283d2d630b85cdf170b7b1e68b7e7 100644
--- a/graph/src/main/scala/org/apache/spark/graph/impl/GraphImpl.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/impl/GraphImpl.scala
@@ -2,323 +2,543 @@ package org.apache.spark.graph.impl
 
 import scala.collection.JavaConversions._
 
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.ArrayBuilder
+import scala.collection.mutable.BitSet
+
+
 import org.apache.spark.SparkContext._
 import org.apache.spark.Partitioner
 import org.apache.spark.HashPartitioner 
 import org.apache.spark.util.ClosureCleaner
 
+import org.apache.spark.rdd
 import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.IndexedRDD
+import org.apache.spark.rdd.RDDIndex
+
 
 import org.apache.spark.graph._
 import org.apache.spark.graph.impl.GraphImpl._
+import org.apache.spark.graph.impl.MessageToPartitionRDDFunctions._
 
+/**
+ * The Iterator type returned when constructing edge triplets
+ */
+class EdgeTripletIterator[VD: ClassManifest, ED: ClassManifest](
+  val vidToIndex: VertexIdToIndexMap,
+  val vertexArray: Array[VD],
+  val edgePartition: EdgePartition[ED]) extends Iterator[EdgeTriplet[VD, ED]] {
+
+  private var pos = 0
+  private val et = new EdgeTriplet[VD, ED]
+  
+  override def hasNext: Boolean = pos < edgePartition.size
+  override def next() = {
+    et.srcId = edgePartition.srcIds(pos)
+    // assert(vmap.containsKey(e.src.id))
+    et.srcAttr = vertexArray(vidToIndex(et.srcId))
+    et.dstId = edgePartition.dstIds(pos)
+    // assert(vmap.containsKey(e.dst.id))
+    et.dstAttr = vertexArray(vidToIndex(et.dstId))
+    et.attr = edgePartition.data(pos)
+    pos += 1
+    et
+  }
 
+  override def toList: List[EdgeTriplet[VD, ED]] = {
+    val lb = new mutable.ListBuffer[EdgeTriplet[VD,ED]]
+    for (i <- (0 until edgePartition.size)) {
+      // Allocate a fresh triplet for each edge so the returned list does not
+      // alias a single mutable object.
+      val currentEdge = new EdgeTriplet[VD, ED]
+      currentEdge.srcId = edgePartition.srcIds(i)
+      // assert(vmap.containsKey(e.src.id))
+      currentEdge.srcAttr = vertexArray(vidToIndex(currentEdge.srcId))
+      currentEdge.dstId = edgePartition.dstIds(i)
+      // assert(vmap.containsKey(e.dst.id))
+      currentEdge.dstAttr = vertexArray(vidToIndex(currentEdge.dstId))
+      currentEdge.attr = edgePartition.data(i)
+      lb += currentEdge
+    }
+    lb.toList
+  }
+} // end of Edge Triplet Iterator
 
 
 
+object EdgeTripletBuilder {
+  def makeTriplets[VD: ClassManifest, ED: ClassManifest]( 
+    localVidMap: IndexedRDD[Pid, VertexIdToIndexMap],
+    vTableReplicatedValues: IndexedRDD[Pid, Array[VD]],
+    eTable: IndexedRDD[Pid, EdgePartition[ED]]): RDD[EdgeTriplet[VD, ED]] = {
+    val iterFun = (iter: Iterator[(Pid, ((VertexIdToIndexMap, Array[VD]), EdgePartition[ED]))]) => {
+      val (pid, ((vidToIndex, vertexArray), edgePartition)) = iter.next()
+      assert(iter.hasNext == false)
+      new EdgeTripletIterator(vidToIndex, vertexArray, edgePartition)
+    }
+    ClosureCleaner.clean(iterFun) 
+    localVidMap.zipJoin(vTableReplicatedValues).zipJoin(eTable)
+      .mapPartitions( iterFun ) // end of map partition
+  }
+}
+
+
+//   {
+//     val iterFun = (iter: Iterator[(Pid, ((VertexIdToIndexMap, Array[VD]), EdgePartition[ED]))]) => {
+//       val (pid, ((vidToIndex, vertexArray), edgePartition)) = iter.next()
+//       assert(iter.hasNext == false)
+//       // Return an iterator that looks up the hash map to find matching 
+//       // vertices for each edge.
+//       new EdgeTripletIterator(vidToIndex, vertexArray, edgePartition)
+//     }
+//     ClosureCleaner.clean(iterFun) 
+//     localVidMap.zipJoin(vTableReplicatedValues).zipJoinRDD(eTable)
+//       .mapPartitions( iterFun ) // end of map partition
+//   }
+// }
+
+
 /**
  * A Graph RDD that supports computation on graphs.
  */
 class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
-    val numVertexPartitions: Int,
-    val numEdgePartitions: Int,
-    _rawVertices: RDD[Vertex[VD]],
-    _rawEdges: RDD[Edge[ED]],
-    _rawVTable: RDD[(Vid, (VD, Array[Pid]))],
-    _rawETable: RDD[(Pid, EdgePartition[ED])])
+    @transient val vTable: IndexedRDD[Vid, VD],
+    @transient val vid2pid: IndexedRDD[Vid, Array[Pid]],
+    @transient val localVidMap: IndexedRDD[Pid, VertexIdToIndexMap],
+    @transient val eTable: IndexedRDD[Pid, EdgePartition[ED]])
   extends Graph[VD, ED] {
 
-  def this(vertices: RDD[Vertex[VD]], edges: RDD[Edge[ED]]) = {
-    this(vertices.partitions.size, edges.partitions.size, vertices, edges, null, null)
-  }
+//  def this() = this(null,null,null)
 
-  def withPartitioner(numVertexPartitions: Int, numEdgePartitions: Int): Graph[VD, ED] = {
-    if (_cached) {
-      new GraphImpl(numVertexPartitions, numEdgePartitions, null, null, _rawVTable, _rawETable)
-        .cache()
-    } else {
-      new GraphImpl(numVertexPartitions, numEdgePartitions, _rawVertices, _rawEdges, null, null)
-    }
-  }
 
-  def withVertexPartitioner(numVertexPartitions: Int) = {
-    withPartitioner(numVertexPartitions, numEdgePartitions)
-  }
+  /**
+   * (localVidMap: IndexedRDD[Pid, VertexIdToIndexMap]) describes the layout
+   * of the replicated vertex data. Within each edge partition, it holds a map
+   * from vertex ID to the index where that vertex's attribute is stored. This
+   * index refers to an array in the same partition in vTableReplicatedValues.
+   *
+   * (vTableReplicatedValues: IndexedRDD[Pid, Array[VD]]) holds the vertex data
+   * and is arranged as described above.
+   */
+  @transient val vTableReplicatedValues =
+    createVTableReplicated(vTable, vid2pid, localVidMap)
+
+
+  /** Return a RDD of vertices. */
+  @transient override val vertices: RDD[(Vid, VD)] = vTable
+
 
-  def withEdgePartitioner(numEdgePartitions: Int) = {
-    withPartitioner(numVertexPartitions, numEdgePartitions)
+  /** Return a RDD of edges. */
+  @transient override val edges: RDD[Edge[ED]] = {
+    eTable.mapPartitions { iter => iter.next()._2.iterator }
   }
 
-  protected var _cached = false
+
+  /** Return a RDD that brings edges with its source and destination vertices together. */
+  @transient override val triplets: RDD[EdgeTriplet[VD, ED]] =
+    EdgeTripletBuilder.makeTriplets(localVidMap, vTableReplicatedValues, eTable)
+
+
+  // {
+  //   val iterFun = (iter: Iterator[(Pid, (VertexHashMap[VD], EdgePartition[ED]))]) => {
+  //     val (pid, (vmap, edgePartition)) = iter.next()
+  //     //assert(iter.hasNext == false)
+  //     // Return an iterator that looks up the hash map to find matching 
+  //     // vertices for each edge.
+  //     new EdgeTripletIterator(vmap, edgePartition)
+  //   }
+  //   ClosureCleaner.clean(iterFun) 
+  //   vTableReplicated.join(eTable).mapPartitions( iterFun ) // end of map partition
+  // }
+
+
+
 
   override def cache(): Graph[VD, ED] = {
     eTable.cache()
+    vid2pid.cache()
     vTable.cache()
-    _cached = true
     this
   }
 
-  override def reverse: Graph[VD, ED] = {
-    newGraph(vertices, edges.map{ case Edge(s, t, e) => Edge(t, s, e) })
-  }
 
-  /** Return a RDD of vertices. */
-  override def vertices: RDD[Vertex[VD]] = {
-    if (!_cached && _rawVertices != null) {
-      _rawVertices
-    } else {
-      vTable.map { case(vid, (data, pids)) => new Vertex(vid, data) }
-    }
+  override def statistics: Map[String, Any] = {
+    val numVertices = this.numVertices
+    val numEdges = this.numEdges
+    val replicationRatio = 
+      vid2pid.map(kv => kv._2.size).sum / vTable.count
+    val loadArray = 
+      eTable.map{ case (pid, epart) => epart.data.size }.collect.map(x => x.toDouble / numEdges)
+    val minLoad = loadArray.min
+    val maxLoad = loadArray.max
+    Map(
+      "Num Vertices" -> numVertices, "Num Edges" -> numEdges,
+      "Replication" -> replicationRatio, "Load Array" -> loadArray, 
+      "Min Load" -> minLoad, "Max Load" -> maxLoad) 
   }
 
-  /** Return a RDD of edges. */
-  override def edges: RDD[Edge[ED]] = {
-    if (!_cached && _rawEdges != null) {
-      _rawEdges
-    } else {
-      eTable.mapPartitions { iter => iter.next()._2.iterator }
-    }
-  }
 
-  /** Return a RDD that brings edges with its source and destination vertices together. */
-  override def triplets: RDD[EdgeTriplet[VD, ED]] = {
-    new EdgeTripletRDD(vTableReplicated, eTable).mapPartitions { part => part.next()._2 }
+  override def reverse: Graph[VD, ED] = {
+    val etable = eTable.mapValues( _.reverse ).asInstanceOf[IndexedRDD[Pid, EdgePartition[ED]]] 
+    new GraphImpl(vTable, vid2pid, localVidMap, etable)
   }
 
-  override def mapVertices[VD2: ClassManifest](f: Vertex[VD] => VD2): Graph[VD2, ED] = {
-    newGraph(vertices.map(v => Vertex(v.id, f(v))), edges)
+
+  override def mapVertices[VD2: ClassManifest](f: (Vid, VD) => VD2): Graph[VD2, ED] = {
+    val newVTable = vTable.mapValuesWithKeys((vid, data) => f(vid, data))
+      .asInstanceOf[IndexedRDD[Vid, VD2]]
+    new GraphImpl(newVTable, vid2pid, localVidMap, eTable)
   }
 
   override def mapEdges[ED2: ClassManifest](f: Edge[ED] => ED2): Graph[VD, ED2] = {
-    newGraph(vertices, edges.map(e => Edge(e.src, e.dst, f(e))))
+    val newETable = eTable.mapValues(eBlock => eBlock.map(f))
+      .asInstanceOf[IndexedRDD[Pid, EdgePartition[ED2]]]
+    new GraphImpl(vTable, vid2pid, localVidMap, newETable)
   }
 
+
   override def mapTriplets[ED2: ClassManifest](f: EdgeTriplet[VD, ED] => ED2):
     Graph[VD, ED2] = {
-    newGraph(vertices, triplets.map(e => Edge(e.src.id, e.dst.id, f(e))))
+    val newETable = eTable.zipJoin(localVidMap).zipJoin(vTableReplicatedValues).mapValues{ 
+      case ((edgePartition, vidToIndex), vertexArray) =>
+        val et = new EdgeTriplet[VD, ED]
+        edgePartition.map{e =>
+          et.set(e)
+          et.srcAttr = vertexArray(vidToIndex(e.srcId))
+          et.dstAttr = vertexArray(vidToIndex(e.dstId))
+          f(et)
+        }
+    }.asInstanceOf[IndexedRDD[Pid, EdgePartition[ED2]]]
+    new GraphImpl(vTable, vid2pid, localVidMap, newETable)
   }
 
-  override def correctEdges(): Graph[VD, ED] = {
-    val sc = vertices.context
-    val vset = sc.broadcast(vertices.map(_.id).collect().toSet)
-    val newEdges = edges.filter(e => vset.value.contains(e.src) && vset.value.contains(e.dst))
-    Graph(vertices, newEdges)
-  }
+  // override def correctEdges(): Graph[VD, ED] = {
+  //   val sc = vertices.context
+  //   val vset = sc.broadcast(vertices.map(_.id).collect().toSet)
+  //   val newEdges = edges.filter(e => vset.value.contains(e.src) && vset.value.contains(e.dst))
+  //   Graph(vertices, newEdges)
+  // }
+
 
+  override def subgraph(epred: EdgeTriplet[VD,ED] => Boolean = (x => true), 
+    vpred: (Vid, VD) => Boolean = ((a,b) => true) ): Graph[VD, ED] = {
+
+    /** @todo The following code behaves deterministically on each
+     * vertex predicate but uses additional space.  Should we switch to
+     * this version?
+     */
+    // val predGraph = mapVertices(v => (v.data, vpred(v)))
+    // val newETable = predGraph.triplets.filter(t => 
+    //   if(v.src.data._2 && v.dst.data._2) {
+    //     val src = Vertex(t.src.id, t.src.data._1)
+    //     val dst = Vertex(t.dst.id, t.dst.data._1)
+    //     epred(new EdgeTriplet[VD, ED](src, dst, t.data))
+    //   } else { false })
+
+    // val newVTable = predGraph.vertices.filter(v => v.data._1)
+    //   .map(v => (v.id, v.data._1)).indexed()
+
+    // Reuse the partitioner (but not the index) from this graph
+    val newVTable = vertices.filter(v => vpred(v._1, v._2)).indexed(vTable.index.partitioner)
 
-  override def subgraph(epred: EdgeTriplet[VD,ED] => Boolean = (_ => true), 
-    vpred: Vertex[VD] => Boolean = (_ => true) ): Graph[VD, ED] = {
 
-    // Restrict the set of vertices to those that satisfy the vertex predicate
-    val newVertices = vertices.filter(vpred)
     // Restrict the set of edges to those that satisfy the vertex and the edge predicate.
-    val newEdges = triplets.filter(t => vpred(t.src) && vpred(t.dst) && epred(t))
-      .map( t => Edge(t.src.id, t.dst.id, t.data) )
+    val newETable = createETable(
+      triplets.filter(
+        t => vpred( t.srcId, t.srcAttr ) && vpred( t.dstId, t.dstAttr ) && epred(t)
+        )
+        .map( t => Edge(t.srcId, t.dstId, t.attr) ),
+      eTable.index.partitioner.numPartitions
+      )
+
+    // Construct the Vid2Pid map. Here we assume that the filter operation 
+    // behaves deterministically.  
+    // @todo reindex the vertex and edge tables 
+    val newVid2Pid = createVid2Pid(newETable, newVTable.index)
+    val newVidMap = createLocalVidMap(newETable)
+
+    new GraphImpl(newVTable, newVid2Pid, newVidMap, newETable)
+  }
+
 
-    new GraphImpl(newVertices, newEdges)
+  override def groupEdgeTriplets[ED2: ClassManifest](
+    f: Iterator[EdgeTriplet[VD,ED]] => ED2 ): Graph[VD,ED2] = {
+      val newEdges: RDD[Edge[ED2]] = triplets.mapPartitions { partIter =>
+        partIter
+        // TODO(crankshaw) toList requires that the entire edge partition
+        // can fit in memory right now.
+        .toList
+        // groups all ETs in this partition that have the same src and dst
+        // Because all ETs with the same src and dst will live on the same
+        // partition due to the EdgePartitioner, this guarantees that these
+        // ET groups will be complete.
+        .groupBy { t: EdgeTriplet[VD, ED] =>  (t.srcId, t.dstId) }
+        .mapValues { ts: List[EdgeTriplet[VD, ED]] => f(ts.toIterator) }
+        .toList
+        .toIterator
+        .map { case ((src, dst), data) => Edge(src, dst, data) }
+      }
+
+      //TODO(crankshaw) eliminate the need to call createETable
+      val newETable = createETable(newEdges, 
+        eTable.index.partitioner.numPartitions)
+      new GraphImpl(vTable, vid2pid, localVidMap, newETable)
+  }
+
+
+  override def groupEdges[ED2: ClassManifest](f: Iterator[Edge[ED]] => ED2 ):
+    Graph[VD,ED2] = {
+
+      val newEdges: RDD[Edge[ED2]] = edges.mapPartitions { partIter =>
+        partIter.toList
+        .groupBy { e: Edge[ED] => (e.srcId, e.dstId) }
+        .mapValues { ts => f(ts.toIterator) }
+        .toList
+        .toIterator
+        .map { case ((src, dst), data) => Edge(src, dst, data) }
+      }
+      // TODO(crankshaw) eliminate the need to call createETable
+      val newETable = createETable(newEdges, 
+        eTable.index.partitioner.numPartitions)
+
+      new GraphImpl(vTable, vid2pid, localVidMap, newETable)
   }
 
+
+
   //////////////////////////////////////////////////////////////////////////////////////////////////
   // Lower level transformation methods
   //////////////////////////////////////////////////////////////////////////////////////////////////
 
-  override def aggregateNeighbors[A: ClassManifest](
-      mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
-      reduceFunc: (A, A) => A,
-      default: A,
-      gatherDirection: EdgeDirection)
-    : Graph[(VD, Option[A]), ED] = {
+  override def mapReduceTriplets[A: ClassManifest](
+      mapFunc: EdgeTriplet[VD, ED] => Array[(Vid, A)],
+      reduceFunc: (A, A) => A)
+    : RDD[(Vid, A)] = {
 
     ClosureCleaner.clean(mapFunc)
     ClosureCleaner.clean(reduceFunc)
 
-    val newVTable = vTableReplicated.mapPartitions({ part =>
-        part.map { v => (v._1, MutableTuple2(v._2, Option.empty[A])) }
-      }, preservesPartitioning = true)
-
-    val newVertices: RDD[(Vid, A)] =
-      new EdgeTripletRDD[MutableTuple2[VD, Option[A]], ED](newVTable, eTable)
-        .mapPartitions { part =>
-          val (vmap, edges) = part.next()
-          val edgeSansAcc = new EdgeTriplet[VD, ED]()
-          edgeSansAcc.src = new Vertex[VD]
-          edgeSansAcc.dst = new Vertex[VD]
-          edges.foreach { e: EdgeTriplet[MutableTuple2[VD, Option[A]], ED] =>
-            edgeSansAcc.data = e.data
-            edgeSansAcc.src.data = e.src.data._1
-            edgeSansAcc.dst.data = e.dst.data._1
-            edgeSansAcc.src.id = e.src.id
-            edgeSansAcc.dst.id = e.dst.id
-            if (gatherDirection == EdgeDirection.In || gatherDirection == EdgeDirection.Both) {
-              e.dst.data._2 =
-                if (e.dst.data._2.isEmpty) {
-                  mapFunc(edgeSansAcc.dst.id, edgeSansAcc)
-                } else {
-                  val tmp = mapFunc(edgeSansAcc.dst.id, edgeSansAcc)
-                  if (!tmp.isEmpty) Some(reduceFunc(e.dst.data._2.get, tmp.get)) else e.dst.data._2
-                }
+    // Map and preaggregate 
+    val preAgg = localVidMap.zipJoin(vTableReplicatedValues).zipJoin(eTable).flatMap{
+      case (pid, ((vidToIndex, vertexArray), edgePartition)) => 
+        // We can reuse the vidToIndex map for aggregation here as well.
+        /** @todo Since this has the downside of not allowing "messages" to arbitrary
+         * vertices we should consider just using a fresh map.
+         */
+        val msgArray = new Array[A](vertexArray.size)
+        val msgBS = new BitSet(vertexArray.size)
+        // Iterate over the partition
+        val et = new EdgeTriplet[VD, ED]
+        edgePartition.foreach{e => 
+          et.set(e)
+          et.srcAttr = vertexArray(vidToIndex(e.srcId))
+          et.dstAttr = vertexArray(vidToIndex(e.dstId))
+          mapFunc(et).foreach{ case (vid, msg) =>
+            // verify that the vid is valid
+            assert(vid == et.srcId || vid == et.dstId)
+            val ind = vidToIndex(vid)
+            // Populate the aggregator map
+            if(msgBS(ind)) {
+              msgArray(ind) = reduceFunc(msgArray(ind), msg)
+            } else { 
+              msgArray(ind) = msg
+              msgBS(ind) = true
             }
-            if (gatherDirection == EdgeDirection.Out || gatherDirection == EdgeDirection.Both) {
-              e.dst.data._2 =
-                if (e.dst.data._2.isEmpty) {
-                  mapFunc(edgeSansAcc.src.id, edgeSansAcc)
-                } else {
-                  val tmp = mapFunc(edgeSansAcc.src.id, edgeSansAcc)
-                  if (!tmp.isEmpty) Some(reduceFunc(e.src.data._2.get, tmp.get)) else e.src.data._2
-                }
-            }
-          }
-          vmap.long2ObjectEntrySet().fastIterator().filter(!_.getValue()._2.isEmpty).map{ entry =>
-            (entry.getLongKey(), entry.getValue()._2)
           }
         }
-        .map{ case (vid, aOpt) => (vid, aOpt.get) }
-        .combineByKey((v: A) => v, reduceFunc, null, vertexPartitioner, false)
+        // Return the aggregate map
+        vidToIndex.long2IntEntrySet().fastIterator()
+        // Remove the entries that did not receive a message
+        .filter{ entry => msgBS(entry.getValue()) }
+        // Construct the actual pairs
+        .map{ entry => 
+          val vid = entry.getLongKey()
+          val ind = entry.getValue()
+          val msg = msgArray(ind)
+          (vid, msg)
+        }
+      }.partitionBy(vTable.index.rdd.partitioner.get)
+    // do the final reduction reusing the index map
+    IndexedRDD(preAgg, vTable.index, reduceFunc)
+  }
+
 
-    this.leftJoinVertices(newVertices, (v: Vertex[VD], a: Option[A]) => (v.data, a))
+  override def outerJoinVertices[U: ClassManifest, VD2: ClassManifest]
+    (updates: RDD[(Vid, U)])(updateF: (Vid, VD, Option[U]) => VD2)
+    : Graph[VD2, ED] = {
+    ClosureCleaner.clean(updateF)
+    val newVTable = vTable.leftJoin(updates).mapValuesWithKeys(
+      (vid, vu) => updateF(vid, vu._1, vu._2) )
+    new GraphImpl(newVTable, vid2pid, localVidMap, eTable)
   }
 
-  /**
-   * Same as aggregateNeighbors but map function can return none and there is no default value.
-   * As a consequence, the resulting table may be much smaller than the set of vertices.
-   */
-  override def aggregateNeighbors[A: ClassManifest](
-    mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
-    reduceFunc: (A, A) => A,
-    gatherDirection: EdgeDirection): Graph[(VD, Option[A]), ED] = {
 
-    ClosureCleaner.clean(mapFunc)
-    ClosureCleaner.clean(reduceFunc)
+} // end of class GraphImpl
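A minimal, hedged sketch of the two primitives that now anchor this class, mapReduceTriplets and outerJoinVertices (assuming an existing `graph: Graph[Double, Double]`): each edge sends the source attribute to its destination, the per-vertex sums are reduced, and the result is joined back into the graph.
{{{
val sums: RDD[(Vid, Double)] = graph.mapReduceTriplets(
  et => Array((et.dstId, et.srcAttr)),     // message: source value to the destination
  (a: Double, b: Double) => a + b)         // reduce: sum the messages per vertex
val updated: Graph[Double, Double] =
  graph.outerJoinVertices(sums) { (vid, attr, sumOpt) => sumOpt.getOrElse(attr) }
}}}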
 
-    val newVTable = vTableReplicated.mapPartitions({ part =>
-        part.map { v => (v._1, MutableTuple2(v._2, Option.empty[A])) }
-      }, preservesPartitioning = true)
-
-    val newVertices: RDD[(Vid, A)] =
-      new EdgeTripletRDD[MutableTuple2[VD, Option[A]], ED](newVTable, eTable)
-        .mapPartitions { part =>
-          val (vmap, edges) = part.next()
-          val edgeSansAcc = new EdgeTriplet[VD, ED]()
-          edgeSansAcc.src = new Vertex[VD]
-          edgeSansAcc.dst = new Vertex[VD]
-          edges.foreach { e: EdgeTriplet[MutableTuple2[VD, Option[A]], ED] =>
-            edgeSansAcc.data = e.data
-            edgeSansAcc.src.data = e.src.data._1
-            edgeSansAcc.dst.data = e.dst.data._1
-            edgeSansAcc.src.id = e.src.id
-            edgeSansAcc.dst.id = e.dst.id
-            if (gatherDirection == EdgeDirection.In || gatherDirection == EdgeDirection.Both) {
-              e.dst.data._2 =
-                if (e.dst.data._2.isEmpty) {
-                  mapFunc(edgeSansAcc.dst.id, edgeSansAcc)
-                } else {
-                  val tmp = mapFunc(edgeSansAcc.dst.id, edgeSansAcc)
-                  if (!tmp.isEmpty) Some(reduceFunc(e.dst.data._2.get, tmp.get)) else e.dst.data._2
-                }
-            }
-            if (gatherDirection == EdgeDirection.Out || gatherDirection == EdgeDirection.Both) {
-              e.src.data._2 =
-                if (e.src.data._2.isEmpty) {
-                  mapFunc(edgeSansAcc.src.id, edgeSansAcc)
-                } else {
-                  val tmp = mapFunc(edgeSansAcc.src.id, edgeSansAcc)
-                  if (!tmp.isEmpty) Some(reduceFunc(e.src.data._2.get, tmp.get)) else e.src.data._2
-                }
-            }
-          }
-          vmap.long2ObjectEntrySet().fastIterator().filter(!_.getValue()._2.isEmpty).map{ entry =>
-            (entry.getLongKey(), entry.getValue()._2)
-          }
-        }
-        .map{ case (vid, aOpt) => (vid, aOpt.get) }
-        .combineByKey((v: A) => v, reduceFunc, null, vertexPartitioner, false)
 
-    this.leftJoinVertices(newVertices, (v: Vertex[VD], a: Option[A]) => (v.data, a))
-  }
 
-  override def leftJoinVertices[U: ClassManifest, VD2: ClassManifest](
-      updates: RDD[(Vid, U)],
-      updateF: (Vertex[VD], Option[U]) => VD2)
-    : Graph[VD2, ED] = {
 
-    ClosureCleaner.clean(updateF)
 
-    val newVTable = vTable.leftOuterJoin(updates).mapPartitions({ iter =>
-      iter.map { case (vid, ((vdata, pids), update)) =>
-        val newVdata = updateF(Vertex(vid, vdata), update)
-        (vid, (newVdata, pids))
-      }
-    }, preservesPartitioning = true).cache()
 
-    new GraphImpl(newVTable.partitions.length, eTable.partitions.length, null, null, newVTable, eTable)
-  }
 
-  override def joinVertices[U: ClassManifest](
-      updates: RDD[(Vid, U)],
-      updateF: (Vertex[VD], U) => VD)
-    : Graph[VD, ED] = {
 
-    ClosureCleaner.clean(updateF)
 
-    val newVTable = vTable.leftOuterJoin(updates).mapPartitions({ iter =>
-      iter.map { case (vid, ((vdata, pids), update)) =>
-        if (update.isDefined) {
-          val newVdata = updateF(Vertex(vid, vdata), update.get)
-          (vid, (newVdata, pids))
-        } else {
-          (vid, (vdata, pids))
-        }
-      }
-    }, preservesPartitioning = true).cache()
 
-    new GraphImpl(newVTable.partitions.length, eTable.partitions.length, null, null, newVTable, eTable)
+
+
+
+
+
+
+object GraphImpl {
+
+  def apply[VD: ClassManifest, ED: ClassManifest](
+    vertices: RDD[(Vid, VD)], edges: RDD[Edge[ED]]): 
+  GraphImpl[VD,ED] = {
+
+    apply(vertices, edges, 
+      vertices.context.defaultParallelism, edges.context.defaultParallelism)
   }
 
 
-  //////////////////////////////////////////////////////////////////////////////////////////////////
-  // Internals hidden from callers
-  //////////////////////////////////////////////////////////////////////////////////////////////////
+  def apply[VD: ClassManifest, ED: ClassManifest](
+    vertices: RDD[(Vid, VD)], edges: RDD[Edge[ED]],
+    numVPart: Int, numEPart: Int): GraphImpl[VD,ED] = {
+
+    val vtable = vertices.indexed(numVPart)
+    val etable = createETable(edges, numEPart)
+    val vid2pid = createVid2Pid(etable, vtable.index)
+    val localVidMap = createLocalVidMap(etable)
+    new GraphImpl(vtable, vid2pid, localVidMap, etable)
+  }
 
-  // TODO: Support non-hash partitioning schemes.
-  protected val vertexPartitioner = new HashPartitioner(numVertexPartitions)
-  protected val edgePartitioner = new HashPartitioner(numEdgePartitions)
 
-  /** Create a new graph but keep the current partitioning scheme. */
-  protected def newGraph[VD2: ClassManifest, ED2: ClassManifest](
-    vertices: RDD[Vertex[VD2]], edges: RDD[Edge[ED2]]): Graph[VD2, ED2] = {
-    (new GraphImpl[VD2, ED2](vertices, edges)).withPartitioner(numVertexPartitions, numEdgePartitions)
+
+  /**
+   * Create the edge table RDD, which is much more efficient for Java heap storage than the
+   * normal edges data structure (RDD[(Vid, Vid, ED)]).
+   *
+   * The edge table contains multiple partitions, and each partition contains only one RDD
+   * key-value pair: the key is the partition id, and the value is an EdgePartition object
+   * containing all the edges in a partition.
+   */
+  protected def createETable[ED: ClassManifest](
+    edges: RDD[Edge[ED]], numPartitions: Int)
+    : IndexedRDD[Pid, EdgePartition[ED]] = {
+      val ceilSqrt: Pid = math.ceil(math.sqrt(numPartitions)).toInt 
+    edges
+      .map { e =>
+        // Random partitioning based on the source vertex id.
+        // val part: Pid = edgePartitionFunction1D(e.srcId, e.dstId, numPartitions)
+        // val part: Pid = edgePartitionFunction2D(e.srcId, e.dstId, numPartitions, ceilSqrt)
+        val part: Pid = randomVertexCut(e.srcId, e.dstId, numPartitions)
+        //val part: Pid = canonicalEdgePartitionFunction2D(e.srcId, e.dstId, numPartitions, ceilSqrt)
+
+        // Should we be using 3-tuple or an optimized class
+        MessageToPartition(part, (e.srcId, e.dstId, e.attr))
+      }
+      .partitionBy(new HashPartitioner(numPartitions))
+      .mapPartitionsWithIndex({ (pid, iter) =>
+        val builder = new EdgePartitionBuilder[ED]
+        iter.foreach { message =>
+          val data = message.data
+          builder.add(data._1, data._2, data._3)
+        }
+        val edgePartition = builder.toEdgePartition
+        Iterator((pid, edgePartition))
+      }, preservesPartitioning = true).indexed()
   }
 
-  protected lazy val eTable: RDD[(Pid, EdgePartition[ED])] = {
-    if (_rawETable == null) {
-      createETable(_rawEdges, numEdgePartitions)
-    } else {
-      _rawETable
+
+  protected def createVid2Pid[ED: ClassManifest](
+    eTable: IndexedRDD[Pid, EdgePartition[ED]],
+    vTableIndex: RDDIndex[Vid]): IndexedRDD[Vid, Array[Pid]] = {
+    val preAgg = eTable.mapPartitions { iter =>
+      val (pid, edgePartition) = iter.next()
+      val vSet = new VertexSet
+      edgePartition.foreach(e => {vSet.add(e.srcId); vSet.add(e.dstId)})
+      vSet.iterator.map { vid => (vid.toLong, pid) }
     }
+    IndexedRDD[Vid, Pid, ArrayBuffer[Pid]](preAgg, vTableIndex, 
+      (p: Pid) => ArrayBuffer(p),
+      (ab: ArrayBuffer[Pid], p:Pid) => {ab.append(p); ab},
+      (a: ArrayBuffer[Pid], b: ArrayBuffer[Pid]) => a ++ b)
+      .mapValues(a => a.toArray).asInstanceOf[IndexedRDD[Vid, Array[Pid]]]
   }
 
-  protected lazy val vTable: RDD[(Vid, (VD, Array[Pid]))] = {
-    if (_rawVTable == null) {
-      createVTable(_rawVertices, eTable, numVertexPartitions)
-    } else {
-      _rawVTable
+
+  protected def createLocalVidMap[ED: ClassManifest](
+    eTable: IndexedRDD[Pid, EdgePartition[ED]]): IndexedRDD[Pid, VertexIdToIndexMap] = {
+    eTable.mapValues{ epart =>
+      val vidToIndex = new VertexIdToIndexMap()
+      var i = 0
+      epart.foreach{ e => 
+        if(!vidToIndex.contains(e.srcId)) {
+          vidToIndex.put(e.srcId, i)
+          i += 1
+        }
+        if(!vidToIndex.contains(e.dstId)) {
+          vidToIndex.put(e.dstId, i)
+          i += 1
+        }
+      }
+      vidToIndex
     }
   }
 
-  protected lazy val vTableReplicated: RDD[(Vid, VD)] = {
-    // Join vid2pid and vTable, generate a shuffle dependency on the joined result, and get
-    // the shuffle id so we can use it on the slave.
-    vTable
-      .flatMap { case (vid, (vdata, pids)) => pids.iterator.map { pid => (pid, (vid, vdata)) } }
-      .partitionBy(edgePartitioner)
-      .mapPartitions(
-        { part => part.map { case(pid, (vid, vdata)) => (vid, vdata) } },
-        preservesPartitioning = true)
-  }
-}
 
+  protected def createVTableReplicated[VD: ClassManifest](
+      vTable: IndexedRDD[Vid, VD], 
+      vid2pid: IndexedRDD[Vid, Array[Pid]],
+      replicationMap: IndexedRDD[Pid, VertexIdToIndexMap]): 
+    IndexedRDD[Pid, Array[VD]] = {
+    // Join vid2pid and vTable, generate a shuffle dependency on the joined 
+    // result, and get the shuffle id so we can use it on the slave.
+    val msgsByPartition = vTable.zipJoin(vid2pid)
+      .flatMap { case (vid, (vdata, pids)) =>
+        pids.iterator.map { pid => MessageToPartition(pid, (vid, vdata)) }
+      }
+      .partitionBy(replicationMap.partitioner.get).cache()
+   
+    val newValuesRDD = replicationMap.valuesRDD.zipPartitions(msgsByPartition){ 
+      (mapIter, msgsIter) =>
+      val (IndexedSeq(vidToIndex), bs) = mapIter.next()
+      assert(!mapIter.hasNext)
+      // Populate the vertex array using the vidToIndex map
+      val vertexArray = new Array[VD](vidToIndex.size)
+      for (msg <- msgsIter) {
+        val ind = vidToIndex(msg.data._1)
+        vertexArray(ind) = msg.data._2
+      }
+      Iterator((IndexedSeq(vertexArray), bs))
+    }
 
-object GraphImpl {
+    new IndexedRDD(replicationMap.index, newValuesRDD)
+
+    // @todo assert edge table has partitioner
+
+    // val localVidMap: IndexedRDD[Pid, VertexIdToIndexMap] =
+    //   msgsByPartition.mapPartitionsWithIndex( (pid, iter) => {
+    //     val vidToIndex = new VertexIdToIndexMap
+    //     var i = 0
+    //     for (msg <- iter) {
+    //       vidToIndex.put(msg.data._1, i)
+    //       i += 1
+    //     }
+    //     Array((pid, vidToIndex)).iterator
+    //   }, preservesPartitioning = true).indexed(eTable.index)
+
+    // val vTableReplicatedValues: IndexedRDD[Pid, Array[VD]] =
+    //   msgsByPartition.mapPartitionsWithIndex( (pid, iter) => {
+    //     val vertexArray = ArrayBuilder.make[VD]
+    //     for (msg <- iter) {
+    //       vertexArray += msg.data._2
+    //     }
+    //     Array((pid, vertexArray.result)).iterator
+    //   }, preservesPartitioning = true).indexed(eTable.index)
+
+    // (localVidMap, vTableReplicatedValues)
+  }
 
 
   protected def edgePartitionFunction1D(src: Vid, dst: Vid, numParts: Pid): Pid = {
@@ -361,7 +581,7 @@ object GraphImpl {
    * Notice that P0 has many edges and as a consequence this 
    * partitioning would lead to poor work balance.  To improve
    * balance we first multiply each vertex id by a large prime 
-   * to effectively suffle the vertex locations. 
+   * to effectively shuffle the vertex locations. 
    *
    * One of the limitations of this approach is that the number of
   * machines must be a perfect square.  We partially address
@@ -382,66 +602,28 @@ object GraphImpl {
 
 
   /**
-   * Create the edge table RDD, which is much more efficient for Java heap storage than the
-   * normal edges data structure (RDD[(Vid, Vid, ED)]).
-   *
-   * The edge table contains multiple partitions, and each partition contains only one RDD
-   * key-value pair: the key is the partition id, and the value is an EdgePartition object
-   * containing all the edges in a partition.
+   * Assign edges to an arbitrary machine corresponding to a
+   * random vertex cut.
    */
-  protected def createETable[ED: ClassManifest](edges: RDD[Edge[ED]], numPartitions: Int)
-    : RDD[(Pid, EdgePartition[ED])] = {
-      val ceilSqrt: Pid = math.ceil(math.sqrt(numPartitions)).toInt 
-
-    edges
-      .map { e =>
-        // Random partitioning based on the source vertex id.
-        // val part: Pid = edgePartitionFunction1D(e.src, e.dst, numPartitions)
-        val part: Pid = edgePartitionFunction2D(e.src, e.dst, numPartitions, ceilSqrt)
-
-        // Should we be using 3-tuple or an optimized class
-        (part, (e.src, e.dst, e.data))
-        //  (math.abs(e.src) % numPartitions, (e.src, e.dst, e.data))
-       
-      }
-      .partitionBy(new HashPartitioner(numPartitions))
-      .mapPartitionsWithIndex({ (pid, iter) =>
-        val edgePartition = new EdgePartition[ED]
-        iter.foreach { case (_, (src, dst, data)) => edgePartition.add(src, dst, data) }
-        edgePartition.trim()
-        Iterator((pid, edgePartition))
-      }, preservesPartitioning = true)
+  protected def randomVertexCut(src: Vid, dst: Vid, numParts: Pid): Pid = {
+    math.abs((src, dst).hashCode()) % numParts
   }
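+  // Illustrative note: the assignment is deterministic for a given (src, dst)
+  // pair, but the reverse edge (dst, src) generally hashes to a different
+  // partition, unlike canonicalEdgePartitionFunction2D below.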
 
-  protected def createVTable[VD: ClassManifest, ED: ClassManifest](
-      vertices: RDD[Vertex[VD]],
-      eTable: RDD[(Pid, EdgePartition[ED])],
-      numPartitions: Int)
-    : RDD[(Vid, (VD, Array[Pid]))] = {
-    val partitioner = new HashPartitioner(numPartitions)
 
-    // A key-value RDD. The key is a vertex id, and the value is a list of
-    // partitions that contains edges referencing the vertex.
-    val vid2pid : RDD[(Vid, Seq[Pid])] = eTable.mapPartitions { iter =>
-      val (pid, edgePartition) = iter.next()
-      val vSet = new VertexSet
-      var i = 0
-      while (i < edgePartition.srcIds.size) {
-        vSet.add(edgePartition.srcIds.getLong(i))
-        vSet.add(edgePartition.dstIds.getLong(i))
-        i += 1
-      }
-      vSet.iterator.map { vid => (vid.toLong, pid) }
-    }.groupByKey(partitioner)
-
-    vertices
-      .map { v => (v.id, v.data) }
-      .partitionBy(partitioner)
-      .leftOuterJoin(vid2pid)
-      .mapValues {
-        case (vdata, None)       => (vdata, Array.empty[Pid])
-        case (vdata, Some(pids)) => (vdata, pids.toArray)
-      }
+  /**
+   * @todo This will only assign edges to the upper triangle
+   * of the 2D processor grid.
+   */
+  protected def canonicalEdgePartitionFunction2D(srcOrig: Vid, dstOrig: Vid, 
+    numParts: Pid, ceilSqrtNumParts: Pid): Pid = {
+    val mixingPrime: Vid = 1125899906842597L 
+    // Partitions by canonical edge direction
+    val src = math.min(srcOrig, dstOrig)
+    val dst = math.max(srcOrig, dstOrig)
+    val col: Pid = ((math.abs(src) * mixingPrime) % ceilSqrtNumParts).toInt
+    val row: Pid = ((math.abs(dst) * mixingPrime) % ceilSqrtNumParts).toInt
+    (col * ceilSqrtNumParts + row) % numParts
   }
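+  // Illustrative note: because the endpoints are canonicalized with min/max,
+  // canonicalEdgePartitionFunction2D(a, b, n, s) always equals
+  // canonicalEdgePartitionFunction2D(b, a, n, s), so both directions of an
+  // edge land in the same partition.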
-}
+
+} // end of object GraphImpl
 
diff --git a/graph/src/main/scala/org/apache/spark/graph/impl/MessageToPartition.scala b/graph/src/main/scala/org/apache/spark/graph/impl/MessageToPartition.scala
new file mode 100644
index 0000000000000000000000000000000000000000..b7bbf257a4a5692163d5e53c07473ca35d3dab9b
--- /dev/null
+++ b/graph/src/main/scala/org/apache/spark/graph/impl/MessageToPartition.scala
@@ -0,0 +1,49 @@
+package org.apache.spark.graph.impl
+
+import org.apache.spark.Partitioner
+import org.apache.spark.graph.Pid
+import org.apache.spark.rdd.{ShuffledRDD, RDD}
+
+
+/**
+ * A message used to send a specific value to a partition.
+ * @param partition index of the target partition.
+ * @param data value to send
+ */
+class MessageToPartition[@specialized(Int, Long, Double, Char, Boolean/*, AnyRef*/) T](
+    @transient var partition: Pid,
+    var data: T)
+  extends Product2[Pid, T] {
+
+  override def _1 = partition
+
+  override def _2 = data
+
+  override def canEqual(that: Any): Boolean = that.isInstanceOf[MessageToPartition[_]]
+}
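+// Note: extending Product2[Pid, T] lets the shuffle machinery treat the target
+// partition id as the key and the payload as the value (see partitionBy in
+// MessageToPartitionRDDFunctions below).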
+
+/**
+ * Companion object for MessageToPartition.
+ */
+object MessageToPartition {
+  def apply[T](partition: Pid, value: T) = new MessageToPartition(partition, value)
+}
+
+
+class MessageToPartitionRDDFunctions[T: ClassManifest](self: RDD[MessageToPartition[T]]) {
+
+  /**
+   * Return a copy of the RDD partitioned using the specified partitioner.
+   */
+  def partitionBy(partitioner: Partitioner): RDD[MessageToPartition[T]] = {
+    new ShuffledRDD[Pid, T, MessageToPartition[T]](self, partitioner)
+  }
+
+}
+
+
+object MessageToPartitionRDDFunctions {
+  implicit def rdd2PartitionRDDFunctions[T: ClassManifest](rdd: RDD[MessageToPartition[T]]) = {
+    new MessageToPartitionRDDFunctions(rdd)
+  }
+}
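+
+// Usage sketch (illustrative, names assumed):
+//   import MessageToPartitionRDDFunctions._
+//   val routed = msgs.partitionBy(somePartitioner)
+// where msgs: RDD[MessageToPartition[(Vid, VD)]] and somePartitioner are
+// supplied by the caller; the shuffle uses the embedded partition id as the key.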
diff --git a/graph/src/main/scala/org/apache/spark/graph/package.scala b/graph/src/main/scala/org/apache/spark/graph/package.scala
index 474ace520f44052642a97846bb74dad4c088b0d5..4627c3566ca192f675c4129cb23a2ff0398f727b 100644
--- a/graph/src/main/scala/org/apache/spark/graph/package.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/package.scala
@@ -8,6 +8,8 @@ package object graph {
   type VertexHashMap[T] = it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap[T]
   type VertexSet = it.unimi.dsi.fastutil.longs.LongOpenHashSet
   type VertexArrayList = it.unimi.dsi.fastutil.longs.LongArrayList
+  // @todo replace with rxin's fast hashmap
+  type VertexIdToIndexMap = it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap
 
   /**
    * Return the default null-like value for a data type T.
diff --git a/graph/src/main/scala/org/apache/spark/graph/util/GraphGenerators.scala b/graph/src/main/scala/org/apache/spark/graph/util/GraphGenerators.scala
new file mode 100644
index 0000000000000000000000000000000000000000..061cce99b6e3d70f9b5e13668f102c60a095d4ed
--- /dev/null
+++ b/graph/src/main/scala/org/apache/spark/graph/util/GraphGenerators.scala
@@ -0,0 +1,250 @@
+package org.apache.spark.graph.util
+
+import scala.util.Random
+import scala.math._
+import scala.annotation.tailrec
+//import scala.collection.mutable
+
+
+import org.apache.spark._
+import org.apache.spark.serializer._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.graph._
+import org.apache.spark.graph.Graph
+import org.apache.spark.graph.Edge
+import org.apache.spark.graph.impl.GraphImpl
+
+/**
+ * @todo(crankshaw) clean up and modularize this code
+ */
+object GraphGenerators {
+
+  val RMATa = 0.45
+  val RMATb = 0.15
+  val RMATc = 0.15
+  val RMATd = 0.25
+
+  def main(args: Array[String]) {
+
+
+    val serializer = "org.apache.spark.serializer.KryoSerializer"
+    System.setProperty("spark.serializer", serializer)
+    //System.setProperty("spark.shuffle.compress", "false")
+    System.setProperty("spark.kryo.registrator", "spark.graph.GraphKryoRegistrator")
+    val host = "local[4]"
+    val sc = new SparkContext(host, "Lognormal graph generator")
+
+    val lnGraph = logNormalGraph(sc, 10000)
+
+    val rmat = rmatGraph(sc, 1000, 3000)
+
+    //for (v <- lnGraph.vertices) {
+    //  println(v.id + ":\t" + v.data)
+    //}
+
+    val times = 100000
+    //val nums = (1 to times).flatMap { n => List(sampleLogNormal(4.0, 1.3, times)) }.toList
+    //val avg = nums.sum / nums.length
+    //val sumSquares = nums.foldLeft(0.0) {(total, next) =>
+    //  (total + math.pow((next - avg), 2)) }
+    //val stdev = math.sqrt(sumSquares/(nums.length - 1))
+
+    //println("avg: " + avg + "+-" + stdev)
+
+
+    //for (i <- 1 to 1000) {
+    //  println(sampleLogNormal(4.0, 1.3, 1000))
+    //}
+
+    sc.stop()
+
+  }
+
+
+  // Right now this just generates a set of edges where
+  // the edge data is the weight (currently always 1).
+  def logNormalGraph(sc: SparkContext, numVertices: Int): GraphImpl[Int, Int] = {
+    // based on Pregel settings
+    val mu = 4
+    val sigma = 1.3
+    //val vertsAndEdges = (0 until numVertices).flatMap { src => {
+
+    val vertices: RDD[(Vid, Int)] = sc.parallelize(0 until numVertices).map{
+      src => (src, sampleLogNormal(mu, sigma, numVertices))
+    }
+
+    val edges = vertices.flatMap{ 
+      v => generateRandomEdges(v._1.toInt, v._2, numVertices) 
+    }
+    
+    GraphImpl(vertices, edges)
+    //println("Vertices:")
+    //for (v <- vertices) {
+    //  println(v.id)
+    //}
+
+    //println("Edges")
+    //for (e <- edges) {
+    //  println(e.src, e.dst, e.data)
+    //}
+
+  }
+
+
+  def generateRandomEdges(src: Int, numEdges: Int, maxVid: Int): Array[Edge[Int]] = {
+    val rand = new Random()
+    var dsts: Set[Int] = Set()
+    while (dsts.size < numEdges) {
+      val nextDst = rand.nextInt(maxVid)
+      if (nextDst != src) {
+        dsts += nextDst
+      }
+    }
+    dsts.map {dst => Edge[Int](src, dst, 1) }.toArray
+  }
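+  // Illustrative example: generateRandomEdges(0, 3, 100) returns three edges
+  // with source 0, three distinct destinations drawn uniformly from
+  // [0, 100) excluding 0, and edge data 1.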
+
+
+  /**
+   * Randomly samples from a log normal distribution whose
+   * corresponding normal distribution has the given mean and
+   * standard deviation. A sample is drawn as X = exp(mu + sigma*Z)
+   * where mu and sigma are the mean and standard deviation of the
+   * underlying normal distribution and Z ~ N(0, 1). The resulting
+   * log normal distribution has mean m = e^(mu + sigma^2/2) and
+   * standard deviation s = sqrt[(e^(sigma^2) - 1) e^(2*mu + sigma^2)].
+   *
+   * @param mu the mean of the underlying normal distribution
+   * @param sigma the standard deviation of the underlying normal distribution
+   * @param maxVal upper bound on the sample; values are redrawn until exp(mu + sigma*Z) < maxVal
+   */
+  def sampleLogNormal(mu: Double, sigma: Double, maxVal: Int): Int = {
+    val rand = new Random()
+    val m = math.exp(mu+(sigma*sigma)/2.0)
+    val s = math.sqrt((math.exp(sigma*sigma) - 1) * math.exp(2*mu + sigma*sigma))
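+    // m and s are the mean and standard deviation of the resulting log normal
+    // distribution; they are computed for reference but the sample below is
+    // drawn directly as exp(mu + sigma*Z).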
+    // Z ~ N(0, 1)
+    var X: Double = maxVal
+
+    while (X >= maxVal) {
+      val Z = rand.nextGaussian()
+      //X = math.exp((m + s*Z))
+      X = math.exp((mu + sigma*Z))
+    }
+    math.round(X.toFloat)
+  }
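+  // Illustrative example: sampleLogNormal(4.0, 1.3, 100) repeatedly draws
+  // Z ~ N(0, 1) until exp(4.0 + 1.3 * Z) < 100 and returns that value rounded
+  // to the nearest Int (rejection sampling against the upper bound).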
+
+
+
+  def rmatGraph(sc: SparkContext, requestedNumVertices: Int, numEdges: Int): GraphImpl[Int, Int] = {
+    // let N = requestedNumVertices
+    // the number of vertices is 2^n where n=ceil(log2[N])
+    // This ensures that the 4 quadrants are the same size at all recursion levels
+    val numVertices = math.round(math.pow(2.0, math.ceil(math.log(requestedNumVertices)/math.log(2.0)))).toInt
+    var edges: Set[Edge[Int]] = Set()
+    while (edges.size < numEdges) {
+      if (edges.size % 100 == 0) {
+        println(edges.size + " edges")
+      }
+      edges += addEdge(numVertices)
+
+    }
+    val graph = outDegreeFromEdges(sc.parallelize(edges.toList))
+    graph
+
+  }
+
+  def outDegreeFromEdges[ED: ClassManifest](edges: RDD[Edge[ED]]): GraphImpl[Int, ED] = {
+    
+    val vertices = edges.flatMap { edge => List((edge.srcId, 1)) }
+      .reduceByKey(_ + _)
+      .map{ case (vid, degree) => (vid, degree) }
+    GraphImpl(vertices, edges)
+  }
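+  // Note: the resulting vertex data is the out-degree of each source vertex;
+  // vertices that only appear as destinations get no explicit entry here.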
+
+  /**
+   * @param numVertices Specifies the total number of vertices in the graph (used to get
+   * the dimensions of the adjacency matrix
+   */
+  def addEdge(numVertices: Int): Edge[Int] = {
+    //val (src, dst) = chooseCell(numVertices/2.0, numVertices/2.0, numVertices/2.0)
+    val v = math.round(numVertices.toFloat/2.0).toInt
+
+    val (src, dst) = chooseCell(v, v, v)
+    Edge[Int](src, dst, 1)
+  }
+
+
+  /**
+   * This method recursively subdivides the adjacency matrix into quadrants
+   * until it picks a single cell. The naming conventions here match
+   * those of the R-MAT paper. The number of nodes in the graph is a power of 2.
+   * The adjacency matrix looks like:
+   *
+   *          dst ->
+   * (x,y) ***************  _
+   *       |      |      |  |
+   *       |  a   |  b   |  |
+   *  src  |      |      |  |
+   *   |   ***************  | T
+   *  \|/  |      |      |  |
+   *       |   c  |   d  |  |
+   *       |      |      |  |
+   *       ***************  -
+   *        
+   * where this represents the subquadrant of the adj matrix currently being
+   * subdivided. (x,y) represent the upper left hand corner of the subquadrant,
+   * and T represents the side length (guaranteed to be a power of 2).
+   *
+   * After choosing the next level subquadrant, we get the resulting sets
+   * of parameters:
+   *    quad = a, x'=x, y'=y, T'=T/2
+   *    quad = b, x'=x+T/2, y'=y, T'=T/2
+   *    quad = c, x'=x, y'=y+T/2, T'=T/2
+   *    quad = d, x'=x+T/2, y'=y+T/2, T'=T/2
+   *
+   * @param x the x coordinate of the upper left corner of the current subquadrant
+   * @param y the y coordinate of the upper left corner of the current subquadrant
+   * @param t the side length of the current subquadrant
+   */
+  @tailrec
+  def chooseCell(x: Int, y: Int, t: Int): (Int, Int) = {
+    if (t <= 1)
+      (x,y)
+    else {
+      val newT = math.round(t.toFloat/2.0).toInt
+      pickQuadrant(RMATa, RMATb, RMATc, RMATd) match {
+        case 0 => chooseCell(x, y, newT)
+        case 1 => chooseCell(x+newT, y, newT)
+        case 2 => chooseCell(x, y+newT, newT)
+        case 3 => chooseCell(x+newT, y+newT, newT)
+      }
+    }
+  }
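+  // Note: each recursive step halves the side length t and shifts the corner
+  // (x, y) into the chosen quadrant, so the recursion reaches a single cell
+  // after roughly log2(t) steps.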
+
+  // TODO(crankshaw) turn result into an enum (or case class for pattern matching)
+  def pickQuadrant(a: Double, b: Double, c: Double, d: Double): Int = {
+    // Use a tolerance rather than an exact floating point equality check.
+    if (math.abs(a + b + c + d - 1.0) > 1e-6) {
+      throw new IllegalArgumentException(
+        "R-MAT probability parameters sum to " + (a + b + c + d) + ", should sum to 1.0")
+    }
+    val rand = new Random()
+    val result = rand.nextDouble()
+    result match {
+      case x if x < a => 0 // 0 corresponds to quadrant a
+      case x if (x >= a && x < a+b) => 1 // 1 corresponds to b
+      case x if (x >= a+b && x < a+b+c) => 2 // 2 corresponds to c
+      case _ => 3 // 3 corresponds to d
+    }
+  }
+
+}
diff --git a/graph/src/test/scala/org/apache/spark/graph/GraphSuite.scala b/graph/src/test/scala/org/apache/spark/graph/GraphSuite.scala
index aa885de957939f9727f545f706db30584392c056..145be3c126a3845f91f801364d908a78e2034f0d 100644
--- a/graph/src/test/scala/org/apache/spark/graph/GraphSuite.scala
+++ b/graph/src/test/scala/org/apache/spark/graph/GraphSuite.scala
@@ -10,6 +10,9 @@ class GraphSuite extends FunSuite with LocalSparkContext {
 
 //  val sc = new SparkContext("local[4]", "test")
 
+  System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+  System.setProperty("spark.kryo.registrator", "org.apache.spark.graph.GraphKryoRegistrator")
+
   test("Graph Creation") {
     withSpark(new SparkContext("local", "test")) { sc =>
       val rawEdges = (0L to 100L).zip((1L to 99L) :+ 0L)
@@ -26,20 +29,20 @@ class GraphSuite extends FunSuite with LocalSparkContext {
       val indegrees = star.aggregateNeighbors(
         (vid, edge) => Some(1),
         (a: Int, b: Int) => a + b,
-        EdgeDirection.In).vertices.map(v => (v.id, v.data._2.getOrElse(0)))
-      assert(indegrees.collect().toSet === Set((0, 0), (1, 1), (2, 1), (3, 1)))
+        EdgeDirection.In)// .map((vid, attr) => (vid, attr._2.getOrElse(0)))
+      assert(indegrees.collect().toSet === Set((1, 1), (2, 1), (3, 1))) // (0, 0),
 
       val outdegrees = star.aggregateNeighbors(
         (vid, edge) => Some(1),
         (a: Int, b: Int) => a + b,
-        EdgeDirection.Out).vertices.map(v => (v.id, v.data._2.getOrElse(0)))
-      assert(outdegrees.collect().toSet === Set((0, 3), (1, 0), (2, 0), (3, 0)))
+        EdgeDirection.Out) //.map((vid, attr) => (vid, attr._2.getOrElse(0)))
+      assert(outdegrees.collect().toSet === Set((0, 3))) //, (1, 0), (2, 0), (3, 0)))
 
       val noVertexValues = star.aggregateNeighbors[Int](
         (vid: Vid, edge: EdgeTriplet[Int, Int]) => None,
         (a: Int, b: Int) => throw new Exception("reduceFunc called unexpectedly"),
-        EdgeDirection.In).vertices.map(v => (v.id, v.data._2))
-      assert(noVertexValues.collect().toSet === Set((0, None), (1, None), (2, None), (3, None)))
+        EdgeDirection.In)//.map((vid, attr) => (vid, attr))
+      assert(noVertexValues.collect().toSet === Set.empty[(Vid, Int)] ) // ((0, None), (1, None), (2, None), (3, None)))
     }
   }