Commit 8d7b77bc authored by Matei Zaharia

Some doc and usability improvements:

- Added a StorageLevels class for easy access to StorageLevel constants
  in Java
- Added doc comments on Function classes in Java
- Updated Accumulator and HadoopWriter docs slightly
parent 682b2d93
Showing 90 additions and 24 deletions
@@ -6,16 +6,17 @@ import scala.collection.mutable.Map
 import scala.collection.generic.Growable
 
 /**
- * A datatype that can be accumulated, i.e. has an commutative and associative +.
+ * A datatype that can be accumulated, i.e. has an commutative and associative "add" operation,
+ * but where the result type, `R`, may be different from the element type being added, `T`.
  *
- * You must define how to add data, and how to merge two of these together. For some datatypes, these might be
- * the same operation (eg., a counter). In that case, you might want to use [[spark.AccumulatorParam]]. They won't
- * always be the same, though -- eg., imagine you are accumulating a set. You will add items to the set, and you
- * will union two sets together.
+ * You must define how to add data, and how to merge two of these together. For some datatypes,
+ * such as a counter, these might be the same operation. In that case, you can use the simpler
+ * [[spark.Accumulator]]. They won't always be the same, though -- e.g., imagine you are
+ * accumulating a set. You will add items to the set, and you will union two sets together.
  *
  * @param initialValue initial value of accumulator
- * @param param helper object defining how to add elements of type `T`
- * @tparam R the full accumulated data
+ * @param param helper object defining how to add elements of type `R` and `T`
+ * @tparam R the full accumulated data (result type)
  * @tparam T partial data that can be added in
  */
 class Accumulable[R, T] (
@@ -44,6 +45,10 @@ class Accumulable[R, T] (
    * @param term the other Accumulable that will get merged with this
    */
   def ++= (term: R) { value_ = param.addInPlace(value_, term)}
+
+  /**
+   * Access the accumulator's current value; only allowed on master.
+   */
   def value = {
     if (!deserialized) value_
     else throw new UnsupportedOperationException("Can't read accumulator value in task")
@@ -60,6 +65,9 @@ class Accumulable[R, T] (
    */
   def localValue = value_
 
+  /**
+   * Set the accumulator's value; only allowed on master.
+   */
   def value_= (r: R) {
     if (!deserialized) value_ = r
     else throw new UnsupportedOperationException("Can't assign accumulator value in task")
@@ -77,28 +85,37 @@ class Accumulable[R, T] (
 }
 
 /**
- * Helper object defining how to accumulate values of a particular type.
+ * Helper object defining how to accumulate values of a particular type. An implicit
+ * AccumulableParam needs to be available when you create Accumulables of a specific type.
  *
- * @tparam R the full accumulated data
+ * @tparam R the full accumulated data (result type)
  * @tparam T partial data that can be added in
  */
 trait AccumulableParam[R, T] extends Serializable {
   /**
-   * Add additional data to the accumulator value.
+   * Add additional data to the accumulator value. Is allowed to modify and return `r`
+   * for efficiency (to avoid allocating objects).
+   *
    * @param r the current value of the accumulator
    * @param t the data to be added to the accumulator
    * @return the new value of the accumulator
    */
-  def addAccumulator(r: R, t: T) : R
+  def addAccumulator(r: R, t: T): R
 
   /**
-   * Merge two accumulated values together
+   * Merge two accumulated values together. Is allowed to modify and return the first value
+   * for efficiency (to avoid allocating objects).
+   *
    * @param r1 one set of accumulated data
   * @param r2 another set of accumulated data
    * @return both data sets merged together
    */
   def addInPlace(r1: R, r2: R): R
+
+  /**
+   * Return the "zero" (identity) value for an accumulator type, given its initial value. For
+   * example, if R was a vector of N dimensions, this would return a vector of N zeroes.
+   */
   def zero(initialValue: R): R
 }
 
@@ -106,12 +123,12 @@ private[spark]
 class GrowableAccumulableParam[R <% Growable[T] with TraversableOnce[T] with Serializable, T]
   extends AccumulableParam[R,T] {
 
-  def addAccumulator(growable: R, elem: T) : R = {
+  def addAccumulator(growable: R, elem: T): R = {
    growable += elem
     growable
   }
 
-  def addInPlace(t1: R, t2: R) : R = {
+  def addInPlace(t1: R, t2: R): R = {
     t1 ++= t2
     t1
   }
@@ -134,17 +151,18 @@ class GrowableAccumulableParam[R <% Growable[T] with TraversableOnce[T] with Ser
  * @param param helper object defining how to add elements of type `T`
  * @tparam T result type
  */
-class Accumulator[T](
-  @transient initialValue: T,
-  param: AccumulatorParam[T]) extends Accumulable[T,T](initialValue, param)
+class Accumulator[T](@transient initialValue: T, param: AccumulatorParam[T])
+  extends Accumulable[T,T](initialValue, param)
 
 /**
  * A simpler version of [[spark.AccumulableParam]] where the only datatype you can add in is the same type
- * as the accumulated value
+ * as the accumulated value. An implicit AccumulatorParam object needs to be available when you create
+ * Accumulators of a specific type.
+ *
  * @tparam T type of value to accumulate
  */
 trait AccumulatorParam[T] extends AccumulableParam[T, T] {
-  def addAccumulator(t1: T, t2: T) : T = {
+  def addAccumulator(t1: T, t2: T): T = {
     addInPlace(t1, t2)
   }
 }

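To make the AccumulableParam contract above concrete, here is a minimal, self-contained Java sketch of the set-accumulation example the doc comment mentions. It only mirrors the shape of the three methods (zero, addAccumulator, addInPlace); the class name SetAccumulableSketch is hypothetical and it does not extend the actual spark.AccumulableParam trait.

import java.util.HashSet;
import java.util.Set;

// Hypothetical illustration of the AccumulableParam contract, with R = Set<String>
// (the full accumulated data) and T = String (the partial data added in).
public class SetAccumulableSketch {
  // zero: the identity value for the accumulator, derived from its initial value.
  public Set<String> zero(Set<String> initialValue) {
    return new HashSet<String>();
  }

  // addAccumulator: fold one element into the running result; allowed to modify
  // and return its first argument to avoid allocating a new set.
  public Set<String> addAccumulator(Set<String> r, String t) {
    r.add(t);
    return r;
  }

  // addInPlace: merge two partial results; again allowed to modify and return r1.
  public Set<String> addInPlace(Set<String> r1, Set<String> r2) {
    r1.addAll(r2);
    return r1;
  }

  public static void main(String[] args) {
    SetAccumulableSketch param = new SetAccumulableSketch();
    Set<String> partial1 = param.addAccumulator(param.zero(new HashSet<String>()), "a");
    Set<String> partial2 = param.addAccumulator(param.zero(new HashSet<String>()), "b");
    System.out.println(param.addInPlace(partial1, partial2)); // prints [a, b] in some order
  }
}
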
@@ -16,7 +16,7 @@ import spark.Logging
 import spark.SerializableWritable
 
 /**
- * An internal helper class that saves an RDD using a Hadoop OutputFormat. This is only public
+ * Internal helper class that saves an RDD using a Hadoop OutputFormat. This is only public
  * because we need to access this class from the `spark` package to use some package-private Hadoop
  * functions, but this class should not be used directly by users.
  *

package spark.api.java;
import spark.storage.StorageLevel;
/**
* Expose some commonly useful storage level constants.
*/
public class StorageLevels {
public static final StorageLevel NONE = new StorageLevel(false, false, false, 1);
public static final StorageLevel DISK_ONLY = new StorageLevel(true, false, false, 1);
public static final StorageLevel DISK_ONLY_2 = new StorageLevel(true, false, false, 2);
public static final StorageLevel MEMORY_ONLY = new StorageLevel(false, true, true, 1);
public static final StorageLevel MEMORY_ONLY_2 = new StorageLevel(false, true, true, 2);
public static final StorageLevel MEMORY_ONLY_SER = new StorageLevel(false, true, false, 1);
public static final StorageLevel MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, 2);
public static final StorageLevel MEMORY_AND_DISK = new StorageLevel(true, true, true, 1);
public static final StorageLevel MEMORY_AND_DISK_2 = new StorageLevel(true, true, true, 2);
public static final StorageLevel MEMORY_AND_DISK_SER = new StorageLevel(true, true, false, 1);
public static final StorageLevel MEMORY_AND_DISK_SER_2 = new StorageLevel(true, true, false, 2);
}
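The constants above are intended to be passed to an RDD's persist call from Java. The following is a hedged usage sketch only: it assumes JavaRDD exposes persist(StorageLevel) mirroring the Scala RDD API, that JavaSparkContext takes a (master, jobName) constructor, and the input path is hypothetical.

import spark.api.java.JavaRDD;
import spark.api.java.JavaSparkContext;
import spark.api.java.StorageLevels;

public class StorageLevelsExample {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "StorageLevelsExample");
    JavaRDD<String> lines = sc.textFile("input.txt"); // hypothetical input path

    // Keep the RDD deserialized in memory, spilling partitions to disk if it does not fit.
    lines.persist(StorageLevels.MEMORY_AND_DISK);

    long first = lines.count();   // computed from the input
    long second = lines.count();  // should reuse the persisted copy
    System.out.println(first + " lines (counted twice: " + second + ")");
  }
}
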
@@ -5,6 +5,9 @@ import scala.runtime.AbstractFunction1;
 import java.io.Serializable;
 
+/**
+ * A function that returns zero or more records of type Double from each input record.
+ */
 // DoubleFlatMapFunction does not extend FlatMapFunction because flatMap is
 // overloaded for both FlatMapFunction and DoubleFlatMapFunction.
 public abstract class DoubleFlatMapFunction<T> extends AbstractFunction1<T, Iterable<Double>>

@@ -5,6 +5,9 @@ import scala.runtime.AbstractFunction1;
 import java.io.Serializable;
 
+/**
+ * A function that returns Doubles, and can be used to construct DoubleRDDs.
+ */
 // DoubleFunction does not extend Function because some UDF functions, like map,
 // are overloaded for both Function and DoubleFunction.
 public abstract class DoubleFunction<T> extends WrappedFunction1<T, Double>

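As an illustration of why DoubleFunction exists, mapping with it is what lets the Java API hand back a DoubleRDD with numeric operations. This is a hedged sketch that assumes the map overload and a JavaDoubleRDD.sum() method implied by the comment above; the class name, variable names, and input path are hypothetical.

import spark.api.java.JavaDoubleRDD;
import spark.api.java.JavaRDD;
import spark.api.java.JavaSparkContext;
import spark.api.java.function.DoubleFunction;

public class DoubleFunctionExample {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "DoubleFunctionExample");
    JavaRDD<String> lines = sc.textFile("numbers.txt"); // hypothetical input, one number per line

    // DoubleFunction<T>: T => Double; this map overload constructs a JavaDoubleRDD.
    JavaDoubleRDD values = lines.map(
      new DoubleFunction<String>() {
        public Double call(String line) {
          return Double.parseDouble(line.trim());
        }
      });

    System.out.println("sum = " + values.sum());
  }
}
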
 package spark.api.java.function
 
+/**
+ * A function that returns zero or more output records from each input record.
+ */
 abstract class FlatMapFunction[T, R] extends Function[T, java.lang.Iterable[R]] {
   @throws(classOf[Exception])
   def call(x: T) : java.lang.Iterable[R]

@@ -8,8 +8,9 @@ import java.io.Serializable;
 
 /**
- * Base class for functions whose return types do not have special RDDs; DoubleFunction is
- * handled separately, to allow DoubleRDDs to be constructed when mapping RDDs to doubles.
+ * Base class for functions whose return types do not create special RDDs. PairFunction and
+ * DoubleFunction are handled separately, to allow PairRDDs and DoubleRDDs to be constructed
+ * when mapping RDDs of other types.
  */
 public abstract class Function<T, R> extends WrappedFunction1<T, R> implements Serializable {
   public abstract R call(T t) throws Exception;

@@ -6,6 +6,9 @@ import scala.runtime.AbstractFunction2;
 import java.io.Serializable;
 
+/**
+ * A two-argument function that takes arguments of type T1 and T2 and returns an R.
+ */
 public abstract class Function2<T1, T2, R> extends WrappedFunction2<T1, T2, R>
   implements Serializable {

@@ -7,6 +7,10 @@ import scala.runtime.AbstractFunction1;
 import java.io.Serializable;
 
+/**
+ * A function that returns zero or more key-value pair records from each input record. The
+ * key-value pairs are represented as scala.Tuple2 objects.
+ */
 // PairFlatMapFunction does not extend FlatMapFunction because flatMap is
 // overloaded for both FlatMapFunction and PairFlatMapFunction.
 public abstract class PairFlatMapFunction<T, K, V>

@@ -7,6 +7,9 @@ import scala.runtime.AbstractFunction1;
 import java.io.Serializable;
 
+/**
+ * A function that returns key-value pairs (Tuple2<K, V>), and can be used to construct PairRDDs.
+ */
 // PairFunction does not extend Function because some UDF functions, like map,
 // are overloaded for both Function and PairFunction.
 public abstract class PairFunction<T, K, V>

 package spark.api.java.function
 
+/**
+ * A function with no return value.
+ */
 // This allows Java users to write void methods without having to return Unit.
 abstract class VoidFunction[T] extends Serializable {
   @throws(classOf[Exception])

@@ -72,6 +72,11 @@ class has a single abstract method, `call()`, that must be implemented.
 <tr><td>Function2&lt;T1, T2, R&gt;</td><td>T1, T2 =&gt; R (function of two arguments)</td></tr>
 </table>
 
+## Storage Levels
+
+RDD [storage level](scala-programming-guide.html#rdd-persistence) constants, such as `MEMORY_AND_DISK`, are
+declared in the [spark.api.java.StorageLevels](api/core/index.html#spark.api.java.StorageLevels) class.
+
 # Other Features

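To show how the newly documented function classes fit together from Java, here is a hedged word-count-style sketch. It assumes the Java API shape implied by the comments above: map is overloaded to accept a PairFunction and return a JavaPairRDD, and JavaPairRDD offers reduceByKey taking a Function2. The context name, class name, and input path are illustrative only.

import scala.Tuple2;
import spark.api.java.JavaPairRDD;
import spark.api.java.JavaRDD;
import spark.api.java.JavaSparkContext;
import spark.api.java.function.Function2;
import spark.api.java.function.PairFunction;

public class FunctionClassesExample {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "FunctionClassesExample");
    JavaRDD<String> words = sc.textFile("words.txt"); // hypothetical input, one word per line

    // PairFunction<T, K, V>: turns each word into a (word, 1) pair, building a JavaPairRDD.
    JavaPairRDD<String, Integer> ones = words.map(
      new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String w) {
          return new Tuple2<String, Integer>(w, 1);
        }
      });

    // Function2<T1, T2, R>: a two-argument function, used here to sum the counts per key.
    JavaPairRDD<String, Integer> counts = ones.reduceByKey(
      new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer a, Integer b) {
          return a + b;
        }
      });

    for (Tuple2<String, Integer> pair : counts.collect()) {
      System.out.println(pair._1() + ": " + pair._2());
    }
  }
}
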
 package spark.examples;
 
-import scala.util.Random;
 import spark.api.java.JavaRDD;
 import spark.api.java.JavaSparkContext;
 import spark.api.java.function.Function;
@@ -9,6 +8,7 @@ import spark.api.java.function.Function2;
 import java.io.Serializable;
 import java.util.Arrays;
 import java.util.StringTokenizer;
+import java.util.Random;
 
 public class JavaHdfsLR {

 package spark.examples;
 
 import scala.Tuple2;
-import scala.util.Random;
 import spark.api.java.JavaPairRDD;
 import spark.api.java.JavaSparkContext;
 import spark.api.java.function.PairFunction;
@@ -9,6 +8,7 @@ import spark.api.java.function.PairFunction;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Random;
 import java.util.Set;
 
 /**