Skip to content
Snippets Groups Projects
Commit 8ae14369 authored by Matei Zaharia's avatar Matei Zaharia
Browse files

Merge pull request #722 from JoshRosen/spark-825

Fix bug: DoubleRDDFunctions.sampleStdev() computed non-sample stdev()
parents 15fb3948 f649dabb
No related branches found
No related tags found
No related merge requests found
......@@ -54,7 +54,13 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
* Compute the sample standard deviation of this RDD's elements (which corrects for bias in
* estimating the standard deviation by dividing by N-1 instead of N).
*/
def sampleStdev(): Double = stats().stdev
def sampleStdev(): Double = stats().sampleStdev
/**
* Compute the sample variance of this RDD's elements (which corrects for bias in
* estimating the variance by dividing by N-1 instead of N).
*/
def sampleVariance(): Double = stats().sampleVariance
/** (Experimental) Approximate operation to return the mean within a timeout. */
def meanApprox(timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble] = {
......
......@@ -115,33 +115,48 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) extends JavaRDDLike[Double, Jav
// Double RDD functions
/** Return the sum of the elements in this RDD. */
/** Add up the elements in this RDD. */
def sum(): Double = srdd.sum()
/** Return a [[spark.StatCounter]] describing the elements in this RDD. */
/**
* Return a [[spark.util.StatCounter]] object that captures the mean, variance and count
* of the RDD's elements in one operation.
*/
def stats(): StatCounter = srdd.stats()
/** Return the mean of the elements in this RDD. */
/** Compute the mean of this RDD's elements. */
def mean(): Double = srdd.mean()
/** Return the variance of the elements in this RDD. */
/** Compute the variance of this RDD's elements. */
def variance(): Double = srdd.variance()
/** Return the standard deviation of the elements in this RDD. */
/** Compute the standard deviation of this RDD's elements. */
def stdev(): Double = srdd.stdev()
/**
* Compute the sample standard deviation of this RDD's elements (which corrects for bias in
* estimating the standard deviation by dividing by N-1 instead of N).
*/
def sampleStdev(): Double = srdd.sampleStdev()
/**
* Compute the sample variance of this RDD's elements (which corrects for bias in
* estimating the variance by dividing by N-1 instead of N).
*/
def sampleVariance(): Double = srdd.sampleVariance()
/** Return the approximate mean of the elements in this RDD. */
def meanApprox(timeout: Long, confidence: Double): PartialResult[BoundedDouble] =
srdd.meanApprox(timeout, confidence)
/** Return the approximate mean of the elements in this RDD. */
/** (Experimental) Approximate operation to return the mean within a timeout. */
def meanApprox(timeout: Long): PartialResult[BoundedDouble] = srdd.meanApprox(timeout)
/** Return the approximate sum of the elements in this RDD. */
/** (Experimental) Approximate operation to return the sum within a timeout. */
def sumApprox(timeout: Long, confidence: Double): PartialResult[BoundedDouble] =
srdd.sumApprox(timeout, confidence)
/** Return the approximate sum of the elements in this RDD. */
/** (Experimental) Approximate operation to return the sum within a timeout. */
def sumApprox(timeout: Long): PartialResult[BoundedDouble] = srdd.sumApprox(timeout)
}
......
......@@ -326,7 +326,9 @@ public class JavaAPISuite implements Serializable {
Assert.assertEquals(20/6.0, rdd.mean(), 0.01);
Assert.assertEquals(20/6.0, rdd.mean(), 0.01);
Assert.assertEquals(6.22222, rdd.variance(), 0.01);
Assert.assertEquals(7.46667, rdd.sampleVariance(), 0.01);
Assert.assertEquals(2.49444, rdd.stdev(), 0.01);
Assert.assertEquals(2.73252, rdd.sampleStdev(), 0.01);
Double first = rdd.first();
List<Double> take = rdd.take(5);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment