From 34eccedbf5781b8723abdc1e6fd3d98e14056999 Mon Sep 17 00:00:00 2001
From: root <root@domU-12-31-39-05-4C-21.compute-1.internal>
Date: Sun, 3 Oct 2010 05:06:06 +0000
Subject: [PATCH] Fixed a rather bad bug in HDFS files that has been in for a
 while: caching was not working because Split objects did not have a
 consistent toString value

---
 src/scala/spark/HdfsFile.scala | 1 +
 src/scala/spark/RDD.scala      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/scala/spark/HdfsFile.scala b/src/scala/spark/HdfsFile.scala
index 595386fceb..886272a8ed 100644
--- a/src/scala/spark/HdfsFile.scala
+++ b/src/scala/spark/HdfsFile.scala
@@ -14,6 +14,7 @@ import org.apache.hadoop.mapred.Reporter
 @serializable class HdfsSplit(@transient s: InputSplit)
 extends Split {
   val inputSplit = new SerializableWritable[InputSplit](s)
+  override def toString = inputSplit.toString
 }
 
 class HdfsTextFile(sc: SparkContext, path: String)
diff --git a/src/scala/spark/RDD.scala b/src/scala/spark/RDD.scala
index 4d0c8c6711..181f7e8b03 100644
--- a/src/scala/spark/RDD.scala
+++ b/src/scala/spark/RDD.scala
@@ -198,6 +198,7 @@ extends RDD[T](prev.sparkContext) with Logging {
   override def iterator(split: Split): Iterator[T] = {
     val key = id + "::" + split.toString
+    logInfo("CachedRDD split key is " + key)
     val cache = CachedRDD.cache
     val loading = CachedRDD.loading
     val cachedVal = cache.get(key)
--
GitLab
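
Note on the bug this patch fixes: CachedRDD builds its cache key as id + "::" + split.toString, so the key is only stable if toString is value-based. Without the override added here, HdfsSplit presumably fell back to java.lang.Object's toString (class name plus identity hash code), which differs for every deserialized copy of the same logical split, so lookups could never hit the cache. The following is a minimal, self-contained Scala sketch of that failure mode; BrokenSplit, FixedSplit, ToStringDemo, and cacheKey are hypothetical stand-ins, not the actual Spark classes.

    // Sketch only: stand-ins for Split/HdfsSplit, not Spark's real classes.
    trait Split extends Serializable

    // No toString override: inherits Object.toString (class@identityHash),
    // so two copies of the same logical split stringify differently.
    class BrokenSplit(val path: String, val offset: Long) extends Split

    class FixedSplit(val path: String, val offset: Long) extends Split {
      // Value-based toString, analogous to the patch delegating to
      // inputSplit.toString in HdfsSplit.
      override def toString = path + ":" + offset
    }

    object ToStringDemo {
      // Same key scheme as CachedRDD.iterator in the patch.
      def cacheKey(rddId: Int, split: Split): String =
        rddId + "::" + split.toString

      def main(args: Array[String]): Unit = {
        val b1 = new BrokenSplit("/data/part-0", 0L)
        val b2 = new BrokenSplit("/data/part-0", 0L) // same logical split
        // e.g. "1::BrokenSplit@1b6d3586" vs "1::BrokenSplit@4554617c"
        println(cacheKey(1, b1) == cacheKey(1, b2))  // false: cache always misses

        val f1 = new FixedSplit("/data/part-0", 0L)
        val f2 = new FixedSplit("/data/part-0", 0L)
        println(cacheKey(1, f1) == cacheKey(1, f2))  // true: lookup can hit
      }
    }

The logInfo line added to RDD.scala makes this observable at runtime: if caching silently misses, the logged keys for what should be the same split will visibly differ.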