Commit 04c2d6a6 authored by Matei Zaharia
stuff

parent 0fb691dd
package spark
import java.net.URL
import java.io.EOFException
import java.io.ObjectInputStream
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
@serializable
sealed trait CoGroupSplitDep
case class NarrowCoGroupSplitDep(rdd: RDD[_], split: Split) extends CoGroupSplitDep
case class ShuffleCoGroupSplitDep(shuffleId: Int) extends CoGroupSplitDep

@serializable
class CoGroupSplit(idx: Int, val deps: Seq[CoGroupSplitDep])
extends Split {
  override val index = idx
  override def hashCode(): Int = idx
}
@serializable
class CoGroupAggregator[K] extends Aggregator[K, Any, ArrayBuffer[Any]] (
  { x => ArrayBuffer(x) },   // createCombiner: start a buffer from the first value
  { (b, x) => b += x },      // mergeValue: append a value to an existing buffer
  { (b1, b2) => b1 ++ b2 }   // mergeCombiners: concatenate two buffers
)
class CoGroupedRDD[K](rdds: Seq[RDD[(K, _)]], part: Partitioner[K])
extends RDD[(K, Seq[Seq[_]])](rdds.head.context) {
  val aggr = new CoGroupAggregator[K]

  // Use a narrow (one-to-one) dependency for parents already partitioned
  // the same way, and a shuffle dependency otherwise
  override val dependencies = {
    val deps = new ArrayBuffer[Dependency[_]]
    for ((rdd, index) <- rdds.zipWithIndex) {
      if (rdd.partitioner == Some(part)) {
        deps += new OneToOneDependency(rdd)
      } else {
        deps += new ShuffleDependency[K, Any, ArrayBuffer[Any]](
          context.newShuffleId, rdd, aggr, part)
      }
    }
    deps.toList
  }

  @transient val splits_ : Array[Split] = {
    val array = new Array[Split](part.numPartitions)
    for (i <- 0 until array.size) {
      // Record, per parent, whether this split is read directly or via shuffle
      array(i) = new CoGroupSplit(i, rdds.zipWithIndex.map { case (r, j) =>
        dependencies(j) match {
          case s: ShuffleDependency[_, _, _] =>
            new ShuffleCoGroupSplitDep(s.shuffleId): CoGroupSplitDep
          case _ =>
            new NarrowCoGroupSplitDep(r, r.splits(i)): CoGroupSplitDep
        }
      }.toList)
    }
    array
  }

  override def splits = splits_

  override val partitioner: Option[Partitioner[_]] = Some(part.asInstanceOf[Partitioner[_]])

  override def preferredLocations(s: Split) = Nil
  override def compute(s: Split): Iterator[(K, Seq[Seq[_]])] = {
    val split = s.asInstanceOf[CoGroupSplit]
    val map = new HashMap[K, Seq[ArrayBuffer[Any]]]
    // One value buffer per parent RDD for each key
    def getSeq(k: K): Seq[ArrayBuffer[Any]] = {
      map.getOrElseUpdate(k, Array.fill(rdds.size)(new ArrayBuffer[Any]))
    }
    for ((dep, depNum) <- split.deps.zipWithIndex) dep match {
      case NarrowCoGroupSplitDep(rdd, itsSplit) => {
        // Read them from the parent directly
        for ((k: K, v) <- rdd.iterator(itsSplit)) {
          getSeq(k)(depNum) += v
        }
      }
      case ShuffleCoGroupSplitDep(shuffleId) => {
        // Read map outputs of shuffle, grouping fetches by server URI
        val splitsByUri = new HashMap[String, ArrayBuffer[Int]]
        val serverUris = MapOutputTracker.getServerUris(shuffleId)
        for ((serverUri, index) <- serverUris.zipWithIndex) {
          splitsByUri.getOrElseUpdate(serverUri, ArrayBuffer()) += index
        }
        for ((serverUri, inputIds) <- Utils.shuffle(splitsByUri)) {
          for (i <- inputIds) {
            // Fetch this reduce partition (split.index, not the dependency
            // number) of map output i from the given server
            val url = "%s/shuffle/%d/%d/%d".format(serverUri, shuffleId, i, split.index)
            val inputStream = new ObjectInputStream(new URL(url).openStream())
            try {
              while (true) {
                val (k, vs) = inputStream.readObject().asInstanceOf[(K, Seq[Any])]
                val mySeq = getSeq(k)
                for (v <- vs)
                  mySeq(depNum) += v
              }
            } catch {
              case e: EOFException => {} // end of stream: done with this map output
            } finally {
              inputStream.close()
            }
          }
        }
      }
    }
    map.iterator
  }
}
\ No newline at end of file
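As an aside for readers of this commit, here is a minimal self-contained sketch of the semantics CoGroupedRDD implements, written against plain Scala collections rather than the RDD API; localCoGroup and the sample data are illustrative names, not part of this commit:

// Illustrative only: for each key, produce one Seq of values per input,
// mirroring CoGroupedRDD's (K, Seq[Seq[_]]) output shape.
def localCoGroup[K](inputs: Seq[Seq[(K, Any)]]): Map[K, Seq[Seq[Any]]] = {
  val keys = inputs.flatMap(_.map(_._1)).toSet
  keys.map { k =>
    k -> inputs.map(in => in.collect { case (`k`, v) => v })
  }.toMap
}

val a = Seq(("x", 1), ("y", 2), ("x", 3))
val b = Seq(("x", "u"), ("z", "w"))
localCoGroup(Seq(a, b))
// roughly: Map(x -> List(List(1, 3), List(u)), y -> List(List(2), List()),
//              z -> List(List(), List(w)))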
...@@ -246,4 +246,46 @@ extends RDD[Array[T]](prev.context) {
  def numCores = self.context.numCores

  def collectAsMap(): Map[K, V] = HashMap(self.collect(): _*)
  def mapValues[U](f: V => U): RDD[(K, U)] = {
    val cleanF = self.context.clean(f)
    new MappedValuesRDD(self, cleanF)
  }
  /*
  def groupWith[W](other: RDD[(K, W)]): RDD[(K, (Seq[V], Seq[W]))] = {
    if (self.partitioner != None) {
      val part = self.partitioner.get
      if (other.partitioner != None && other.partitioner.get == part) {
        // Can do a partition-wise cogroup
        return new PartitionWiseGroupedRDD(self, other)
      }
    }
    val vs: RDD[(K, Either[V, W])] = self.map { case (k, v) => (k, Left(v)) }
    val ws: RDD[(K, Either[V, W])] = other.map { case (k, w) => (k, Right(w)) }
    (vs ++ ws).groupByKey(numSplits).flatMap {
      case (k, seq) => {
        val vbuf = new ArrayBuffer[V]
        val wbuf = new ArrayBuffer[W]
        seq.foreach {
          case Left(v) => vbuf += v
          case Right(w) => wbuf += w
        }
        for (v <- vbuf; w <- wbuf) yield (k, (v, w))
      }
    }
  }
  */
}
class MappedValuesRDD[K, V, U](
  prev: RDD[(K, V)], f: V => U)
extends RDD[(K, U)](prev.context) {
  override def splits = prev.splits
  override def preferredLocations(split: Split) = prev.preferredLocations(split)
  override val dependencies = List(new OneToOneDependency(prev))
  override def compute(split: Split) = prev.iterator(split).map { case (k, v) => (k, f(v)) }
  override val partitioner = prev.partitioner
}
\ No newline at end of file
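Two notes on the hunk above. mapValues can preserve the parent's partitioner because it never changes keys, which is exactly what MappedValuesRDD's partitioner override records. And the commented-out groupWith draft relies on tagging each side with Either before grouping; a hedged local sketch of that trick on plain collections (names and data here are illustrative, not part of the commit):

// Illustrative only: tag each side, union, group by key, then split back out.
val vs = Seq(("a", 1), ("b", 2)).map { case (k, v) => (k, Left(v)) }
val ws = Seq(("a", "x"), ("a", "y")).map { case (k, w) => (k, Right(w)) }
val grouped = (vs ++ ws).groupBy(_._1).map { case (k, pairs) =>
  val tagged = pairs.map(_._2)
  k -> (tagged.collect { case Left(v) => v },
        tagged.collect { case Right(w) => w })
}
// roughly: Map(a -> (List(1), List(x, y)), b -> (List(2), List()))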
...@@ -16,7 +16,8 @@ class ShuffledRDD[K, V, C](
    aggregator: Aggregator[K, V, C],
    part : Partitioner[K])
  extends RDD[(K, C)](parent.context) {
  //override val partitioner = Some(part)
  override val partitioner: Option[Partitioner[_]] = Some(part.asInstanceOf[Partitioner[_]])
  @transient val splits_ =
    Array.tabulate[Split](part.numPartitions)(i => new ShuffledRDDSplit(i))
...
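ShuffledRDD, like CoGroupedRDD above, combines values through an Aggregator[K, V, C] built from three functions (createCombiner, mergeValue, mergeCombiners). A hedged local sketch of that combine pattern follows; combineLocally and the sample data are illustrative names, not part of this codebase:

import scala.collection.mutable.ArrayBuffer

// Illustrative only: per-key combining with the three-function pattern.
def combineLocally[K, V, C](
    pairs: Seq[(K, V)],
    createCombiner: V => C,
    mergeValue: (C, V) => C): Map[K, C] =
  pairs.foldLeft(Map.empty[K, C]) { case (acc, (k, v)) =>
    acc.updated(k, acc.get(k) match {
      case Some(c) => mergeValue(c, v)    // key seen before: fold the value in
      case None    => createCombiner(v)   // first value for this key
    })
  }

combineLocally[String, Int, ArrayBuffer[Int]](
  Seq(("a", 1), ("b", 2), ("a", 3)),
  v => ArrayBuffer(v),
  (b, v) => b += v)
// roughly: Map(a -> ArrayBuffer(1, 3), b -> ArrayBuffer(2))

mergeCombiners (not exercised above) would merge two such per-key buffers when combining outputs produced by different map tasks.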