Skip to content
Snippets Groups Projects
Commit 103d8cce authored by Bimal Tandel, committed by Xiangrui Meng
Browse files

[SPARK-8921] [MLLIB] Add @since tags to mllib.stat

Author: Bimal Tandel <bimal@bimal-MBP.local>

Closes #7730 from BimalTandel/branch_spark_8921 and squashes the following commits:

3ea230a [Bimal Tandel] Spark 8921 add @since tags
parent 86505962
No related branches found
No related tags found
No related merge requests found
...@@ -37,6 +37,7 @@ import org.apache.spark.rdd.RDD ...@@ -37,6 +37,7 @@ import org.apache.spark.rdd.RDD
* .setBandwidth(3.0) * .setBandwidth(3.0)
* val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) * val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
* }}} * }}}
* @since 1.4.0
*/ */
@Experimental @Experimental
class KernelDensity extends Serializable { class KernelDensity extends Serializable {
...@@ -51,6 +52,7 @@ class KernelDensity extends Serializable { ...@@ -51,6 +52,7 @@ class KernelDensity extends Serializable {
/** /**
* Sets the bandwidth (standard deviation) of the Gaussian kernel (default: `1.0`). * Sets the bandwidth (standard deviation) of the Gaussian kernel (default: `1.0`).
* @since 1.4.0
*/ */
def setBandwidth(bandwidth: Double): this.type = { def setBandwidth(bandwidth: Double): this.type = {
require(bandwidth > 0, s"Bandwidth must be positive, but got $bandwidth.") require(bandwidth > 0, s"Bandwidth must be positive, but got $bandwidth.")
...@@ -60,6 +62,7 @@ class KernelDensity extends Serializable { ...@@ -60,6 +62,7 @@ class KernelDensity extends Serializable {
/** /**
* Sets the sample to use for density estimation. * Sets the sample to use for density estimation.
* @since 1.4.0
*/ */
def setSample(sample: RDD[Double]): this.type = { def setSample(sample: RDD[Double]): this.type = {
this.sample = sample this.sample = sample
...@@ -68,6 +71,7 @@ class KernelDensity extends Serializable { ...@@ -68,6 +71,7 @@ class KernelDensity extends Serializable {
/** /**
* Sets the sample to use for density estimation (for Java users). * Sets the sample to use for density estimation (for Java users).
* @since 1.4.0
*/ */
def setSample(sample: JavaRDD[java.lang.Double]): this.type = { def setSample(sample: JavaRDD[java.lang.Double]): this.type = {
this.sample = sample.rdd.asInstanceOf[RDD[Double]] this.sample = sample.rdd.asInstanceOf[RDD[Double]]
...@@ -76,6 +80,7 @@ class KernelDensity extends Serializable { ...@@ -76,6 +80,7 @@ class KernelDensity extends Serializable {
/** /**
* Estimates probability density function at the given array of points. * Estimates probability density function at the given array of points.
* @since 1.4.0
*/ */
def estimate(points: Array[Double]): Array[Double] = { def estimate(points: Array[Double]): Array[Double] = {
val sample = this.sample val sample = this.sample
......
...@@ -33,6 +33,7 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector} ...@@ -33,6 +33,7 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector}
* Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]] * Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]
* Zero elements (including explicit zero values) are skipped when calling add(), * Zero elements (including explicit zero values) are skipped when calling add(),
* to have time complexity O(nnz) instead of O(n) for each column. * to have time complexity O(nnz) instead of O(n) for each column.
* @since 1.1.0
*/ */
@DeveloperApi @DeveloperApi
class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with Serializable { class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with Serializable {
...@@ -52,6 +53,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S ...@@ -52,6 +53,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
* *
* @param sample The sample in dense/sparse vector format to be added into this summarizer. * @param sample The sample in dense/sparse vector format to be added into this summarizer.
* @return This MultivariateOnlineSummarizer object. * @return This MultivariateOnlineSummarizer object.
* @since 1.1.0
*/ */
def add(sample: Vector): this.type = { def add(sample: Vector): this.type = {
if (n == 0) { if (n == 0) {
...@@ -107,6 +109,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S ...@@ -107,6 +109,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
* *
* @param other The other MultivariateOnlineSummarizer to be merged. * @param other The other MultivariateOnlineSummarizer to be merged.
* @return This MultivariateOnlineSummarizer object. * @return This MultivariateOnlineSummarizer object.
* @since 1.1.0
*/ */
def merge(other: MultivariateOnlineSummarizer): this.type = { def merge(other: MultivariateOnlineSummarizer): this.type = {
if (this.totalCnt != 0 && other.totalCnt != 0) { if (this.totalCnt != 0 && other.totalCnt != 0) {
...@@ -149,6 +152,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S ...@@ -149,6 +152,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
this this
} }
/**
* @since 1.1.0
*/
override def mean: Vector = { override def mean: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.") require(totalCnt > 0, s"Nothing has been added to this summarizer.")
...@@ -161,6 +167,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S ...@@ -161,6 +167,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(realMean) Vectors.dense(realMean)
} }
/**
* @since 1.1.0
*/
override def variance: Vector = { override def variance: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.") require(totalCnt > 0, s"Nothing has been added to this summarizer.")
...@@ -183,14 +192,23 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S ...@@ -183,14 +192,23 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(realVariance) Vectors.dense(realVariance)
} }
/**
* @since 1.1.0
*/
override def count: Long = totalCnt override def count: Long = totalCnt
/**
* @since 1.1.0
*/
override def numNonzeros: Vector = { override def numNonzeros: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.") require(totalCnt > 0, s"Nothing has been added to this summarizer.")
Vectors.dense(nnz) Vectors.dense(nnz)
} }
/**
* @since 1.1.0
*/
override def max: Vector = { override def max: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.") require(totalCnt > 0, s"Nothing has been added to this summarizer.")
...@@ -202,6 +220,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S ...@@ -202,6 +220,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(currMax) Vectors.dense(currMax)
} }
/**
* @since 1.1.0
*/
override def min: Vector = { override def min: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.") require(totalCnt > 0, s"Nothing has been added to this summarizer.")
...@@ -213,6 +234,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S ...@@ -213,6 +234,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(currMin) Vectors.dense(currMin)
} }
/**
* @since 1.2.0
*/
override def normL2: Vector = { override def normL2: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.") require(totalCnt > 0, s"Nothing has been added to this summarizer.")
...@@ -227,6 +251,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S ...@@ -227,6 +251,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(realMagnitude) Vectors.dense(realMagnitude)
} }
/**
* @since 1.2.0
*/
override def normL1: Vector = { override def normL1: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.") require(totalCnt > 0, s"Nothing has been added to this summarizer.")
......
...@@ -21,46 +21,55 @@ import org.apache.spark.mllib.linalg.Vector ...@@ -21,46 +21,55 @@ import org.apache.spark.mllib.linalg.Vector
/** /**
* Trait for multivariate statistical summary of a data matrix. * Trait for multivariate statistical summary of a data matrix.
* @since 1.0.0
*/ */
trait MultivariateStatisticalSummary { trait MultivariateStatisticalSummary {
/** /**
* Sample mean vector. * Sample mean vector.
* @since 1.0.0
*/ */
def mean: Vector def mean: Vector
/** /**
* Sample variance vector. Should return a zero vector if the sample size is 1. * Sample variance vector. Should return a zero vector if the sample size is 1.
* @since 1.0.0
*/ */
def variance: Vector def variance: Vector
/** /**
* Sample size. * Sample size.
* @since 1.0.0
*/ */
def count: Long def count: Long
/** /**
* Number of nonzero elements (including explicitly presented zero values) in each column. * Number of nonzero elements (including explicitly presented zero values) in each column.
* @since 1.0.0
*/ */
def numNonzeros: Vector def numNonzeros: Vector
/** /**
* Maximum value of each column. * Maximum value of each column.
* @since 1.0.0
*/ */
def max: Vector def max: Vector
/** /**
* Minimum value of each column. * Minimum value of each column.
* @since 1.0.0
*/ */
def min: Vector def min: Vector
/** /**
* Euclidean magnitude of each column * Euclidean magnitude of each column
* @since 1.2.0
*/ */
def normL2: Vector def normL2: Vector
/** /**
* L1 norm of each column * L1 norm of each column
* @since 1.2.0
*/ */
def normL1: Vector def normL1: Vector
} }
...@@ -32,6 +32,7 @@ import org.apache.spark.rdd.RDD ...@@ -32,6 +32,7 @@ import org.apache.spark.rdd.RDD
/** /**
* :: Experimental :: * :: Experimental ::
* API for statistical functions in MLlib. * API for statistical functions in MLlib.
* @since 1.1.0
*/ */
@Experimental @Experimental
object Statistics { object Statistics {
...@@ -41,6 +42,7 @@ object Statistics { ...@@ -41,6 +42,7 @@ object Statistics {
* *
* @param X an RDD[Vector] for which column-wise summary statistics are to be computed. * @param X an RDD[Vector] for which column-wise summary statistics are to be computed.
* @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics. * @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics.
* @since 1.1.0
*/ */
def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = { def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = {
new RowMatrix(X).computeColumnSummaryStatistics() new RowMatrix(X).computeColumnSummaryStatistics()
...@@ -52,6 +54,7 @@ object Statistics { ...@@ -52,6 +54,7 @@ object Statistics {
* *
* @param X an RDD[Vector] for which the correlation matrix is to be computed. * @param X an RDD[Vector] for which the correlation matrix is to be computed.
* @return Pearson correlation matrix comparing columns in X. * @return Pearson correlation matrix comparing columns in X.
* @since 1.1.0
*/ */
def corr(X: RDD[Vector]): Matrix = Correlations.corrMatrix(X) def corr(X: RDD[Vector]): Matrix = Correlations.corrMatrix(X)
...@@ -68,6 +71,7 @@ object Statistics { ...@@ -68,6 +71,7 @@ object Statistics {
* @param method String specifying the method to use for computing correlation. * @param method String specifying the method to use for computing correlation.
* Supported: `pearson` (default), `spearman` * Supported: `pearson` (default), `spearman`
* @return Correlation matrix comparing columns in X. * @return Correlation matrix comparing columns in X.
* @since 1.1.0
*/ */
def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method) def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method)
...@@ -81,10 +85,14 @@ object Statistics { ...@@ -81,10 +85,14 @@ object Statistics {
* @param x RDD[Double] of the same cardinality as y. * @param x RDD[Double] of the same cardinality as y.
* @param y RDD[Double] of the same cardinality as x. * @param y RDD[Double] of the same cardinality as x.
* @return A Double containing the Pearson correlation between the two input RDD[Double]s * @return A Double containing the Pearson correlation between the two input RDD[Double]s
* @since 1.1.0
*/ */
def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y) def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y)
/** Java-friendly version of [[corr()]] */ /**
* Java-friendly version of [[corr()]]
* @since 1.4.1
*/
def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double = def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double =
corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]]) corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]])
...@@ -101,10 +109,14 @@ object Statistics { ...@@ -101,10 +109,14 @@ object Statistics {
* Supported: `pearson` (default), `spearman` * Supported: `pearson` (default), `spearman`
* @return A Double containing the correlation between the two input RDD[Double]s using the * @return A Double containing the correlation between the two input RDD[Double]s using the
* specified method. * specified method.
* @since 1.1.0
*/ */
def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method) def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
/** Java-friendly version of [[corr()]] */ /**
* Java-friendly version of [[corr()]]
* @since 1.4.1
*/
def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double = def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double =
corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]], method) corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]], method)
...@@ -121,6 +133,7 @@ object Statistics { ...@@ -121,6 +133,7 @@ object Statistics {
* `expected` is rescaled if the `expected` sum differs from the `observed` sum. * `expected` is rescaled if the `expected` sum differs from the `observed` sum.
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis. * the method used, and the null hypothesis.
* @since 1.1.0
*/ */
def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = { def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
ChiSqTest.chiSquared(observed, expected) ChiSqTest.chiSquared(observed, expected)
...@@ -135,6 +148,7 @@ object Statistics { ...@@ -135,6 +148,7 @@ object Statistics {
* @param observed Vector containing the observed categorical counts/relative frequencies. * @param observed Vector containing the observed categorical counts/relative frequencies.
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis. * the method used, and the null hypothesis.
* @since 1.1.0
*/ */
def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed) def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed)
...@@ -145,6 +159,7 @@ object Statistics { ...@@ -145,6 +159,7 @@ object Statistics {
* @param observed The contingency matrix (containing either counts or relative frequencies). * @param observed The contingency matrix (containing either counts or relative frequencies).
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis. * the method used, and the null hypothesis.
* @since 1.1.0
*/ */
def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed) def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed)
...@@ -157,6 +172,7 @@ object Statistics { ...@@ -157,6 +172,7 @@ object Statistics {
* Real-valued features will be treated as categorical for each distinct value. * Real-valued features will be treated as categorical for each distinct value.
* @return an array containing the ChiSquaredTestResult for every feature against the label. * @return an array containing the ChiSquaredTestResult for every feature against the label.
* The order of the elements in the returned array reflects the order of input features. * The order of the elements in the returned array reflects the order of input features.
* @since 1.1.0
*/ */
def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = { def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
ChiSqTest.chiSquaredFeatures(data) ChiSqTest.chiSquaredFeatures(data)
......
...@@ -32,6 +32,7 @@ import org.apache.spark.mllib.util.MLUtils ...@@ -32,6 +32,7 @@ import org.apache.spark.mllib.util.MLUtils
* *
* @param mu The mean vector of the distribution * @param mu The mean vector of the distribution
* @param sigma The covariance matrix of the distribution * @param sigma The covariance matrix of the distribution
* @since 1.3.0
*/ */
@DeveloperApi @DeveloperApi
class MultivariateGaussian ( class MultivariateGaussian (
...@@ -60,12 +61,16 @@ class MultivariateGaussian ( ...@@ -60,12 +61,16 @@ class MultivariateGaussian (
*/ */
private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants
/** Returns density of this multivariate Gaussian at given point, x */ /** Returns density of this multivariate Gaussian at given point, x
* @since 1.3.0
*/
def pdf(x: Vector): Double = { def pdf(x: Vector): Double = {
pdf(x.toBreeze) pdf(x.toBreeze)
} }
/** Returns the log-density of this multivariate Gaussian at given point, x */ /** Returns the log-density of this multivariate Gaussian at given point, x
* @since 1.3.0
*/
def logpdf(x: Vector): Double = { def logpdf(x: Vector): Double = {
logpdf(x.toBreeze) logpdf(x.toBreeze)
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment