[SPARK-8921] [MLLIB] Add @since tags to mllib.stat

Author: Bimal Tandel <bimal@bimal-MBP.local>

Closes #7730 from BimalTandel/branch_spark_8921 and squashes the following commits:

3ea230a [Bimal Tandel] Spark 8921 add @since tags

[SPARK-8921] [MLLIB] Add @since tags to mllib.stat
103d8cce · Bimal Tandel · Xiangrui Meng · 86505962 · 103d8cce · 103d8cce
Commit 103d8cce authored 9 years ago by Bimal Tandel Committed by Xiangrui Meng 9 years ago
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
@@ -37,6 +37,7 @@ import org.apache.spark.rdd.RDD
 *   .setBandwidth(3.0)
 * val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
 * }}}
+ * @since 1.4.0
 */
 @Experimental
 class KernelDensity extends Serializable {
@@ -51,6 +52,7 @@ class KernelDensity extends Serializable {
  /**
   * Sets the bandwidth (standard deviation) of the Gaussian kernel (default: `1.0`).
+   * @since 1.4.0
   */
  def setBandwidth(bandwidth: Double): this.type = {
    require(bandwidth > 0, s"Bandwidth must be positive, but got $bandwidth.")
@@ -60,6 +62,7 @@ class KernelDensity extends Serializable {
  /**
   * Sets the sample to use for density estimation.
+   * @since 1.4.0
   */
  def setSample(sample: RDD[Double]): this.type = {
    this.sample = sample
@@ -68,6 +71,7 @@ class KernelDensity extends Serializable {
  /**
   * Sets the sample to use for density estimation (for Java users).
+   * @since 1.4.0
   */
  def setSample(sample: JavaRDD[java.lang.Double]): this.type = {
    this.sample = sample.rdd.asInstanceOf[RDD[Double]]
@@ -76,6 +80,7 @@ class KernelDensity extends Serializable {
  /**
   * Estimates probability density function at the given array of points.
+   * @since 1.4.0
   */
  def estimate(points: Array[Double]): Array[Double] = {
    val sample = this.sample

--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
@@ -33,6 +33,7 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector}
 * Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]
 * Zero elements (including explicit zero values) are skipped when calling add(),
 * to have time complexity O(nnz) instead of O(n) for each column.
+ * @since 1.1.0
 */
 @DeveloperApi
 class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with Serializable {
@@ -52,6 +53,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
   *
   * @param sample The sample in dense/sparse vector format to be added into this summarizer.
   * @return This MultivariateOnlineSummarizer object.
+   * @since 1.1.0
   */
  def add(sample: Vector): this.type = {
    if (n == 0) {
@@ -107,6 +109,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
   *
   * @param other The other MultivariateOnlineSummarizer to be merged.
   * @return This MultivariateOnlineSummarizer object.
+   * @since 1.1.0
   */
  def merge(other: MultivariateOnlineSummarizer): this.type = {
    if (this.totalCnt != 0 && other.totalCnt != 0) {
@@ -149,6 +152,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
    this
  }
+  /**
+   * @since 1.1.0
+   */
  override def mean: Vector = {
    require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -161,6 +167,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
    Vectors.dense(realMean)
  }
+  /**
+   * @since 1.1.0
+   */
  override def variance: Vector = {
    require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -183,14 +192,23 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
    Vectors.dense(realVariance)
  }
+  /**
+   * @since 1.1.0
+   */
  override def count: Long = totalCnt
+  /**
+   * @since 1.1.0
+   */
  override def numNonzeros: Vector = {
    require(totalCnt > 0, s"Nothing has been added to this summarizer.")
    Vectors.dense(nnz)
  }
+  /**
+   * @since 1.1.0
+   */
  override def max: Vector = {
    require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -202,6 +220,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
    Vectors.dense(currMax)
  }
+  /**
+   * @since 1.1.0
+   */
  override def min: Vector = {
    require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -213,6 +234,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
    Vectors.dense(currMin)
  }
+  /**
+   * @since 1.2.0
+   */
  override def normL2: Vector = {
    require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -227,6 +251,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
    Vectors.dense(realMagnitude)
  }
+  /**
+   * @since 1.2.0
+   */
  override def normL1: Vector = {
    require(totalCnt > 0, s"Nothing has been added to this summarizer.")

--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala
@@ -21,46 +21,55 @@ import org.apache.spark.mllib.linalg.Vector
 /**
 * Trait for multivariate statistical summary of a data matrix.
+ * @since 1.0.0
 */
 trait MultivariateStatisticalSummary {
  /**
   * Sample mean vector.
+   * @since 1.0.0
   */
  def mean: Vector
  /**
   * Sample variance vector. Should return a zero vector if the sample size is 1.
+   * @since 1.0.0
   */
  def variance: Vector
  /**
   * Sample size.
+   * @since 1.0.0
   */
  def count: Long
  /**
   * Number of nonzero elements (including explicitly presented zero values) in each column.
+   * @since 1.0.0
   */
  def numNonzeros: Vector
  /**
   * Maximum value of each column.
+   * @since 1.0.0
   */
  def max: Vector
  /**
   * Minimum value of each column.
+   * @since 1.0.0
   */
  def min: Vector
  /**
   * Euclidean magnitude of each column
+   * @since 1.2.0
   */
  def normL2: Vector
  /**
   * L1 norm of each column
+   * @since 1.2.0
   */
  def normL1: Vector
 }
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -32,6 +32,7 @@ import org.apache.spark.rdd.RDD
 /**
 * :: Experimental ::
 * API for statistical functions in MLlib.
+ * @since 1.1.0
 */
 @Experimental
 object Statistics {
@@ -41,6 +42,7 @@ object Statistics {
   *
   * @param X an RDD[Vector] for which column-wise summary statistics are to be computed.
   * @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics.
+   * @since 1.1.0
   */
  def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = {
    new RowMatrix(X).computeColumnSummaryStatistics()
@@ -52,6 +54,7 @@ object Statistics {
   *
   * @param X an RDD[Vector] for which the correlation matrix is to be computed.
   * @return Pearson correlation matrix comparing columns in X.
+   * @since 1.1.0
   */
  def corr(X: RDD[Vector]): Matrix = Correlations.corrMatrix(X)
@@ -68,6 +71,7 @@ object Statistics {
   * @param method String specifying the method to use for computing correlation.
   *               Supported: `pearson` (default), `spearman`
   * @return Correlation matrix comparing columns in X.
+   * @since 1.1.0
   */
  def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method)
@@ -81,10 +85,14 @@ object Statistics {
   * @param x RDD[Double] of the same cardinality as y.
   * @param y RDD[Double] of the same cardinality as x.
   * @return A Double containing the Pearson correlation between the two input RDD[Double]s
+   * @since 1.1.0
   */
  def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y)
-  /** Java-friendly version of [[corr()]] */
+  /**
+   * Java-friendly version of `corr()`.
+   * @since 1.4.1
+   */
  def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double =
    corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]])
@@ -101,10 +109,14 @@ object Statistics {
   *               Supported: `pearson` (default), `spearman`
   * @return A Double containing the correlation between the two input RDD[Double]s using the
   *         specified method.
+   * @since 1.1.0
   */
  def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
-  /** Java-friendly version of [[corr()]] */
+  /**
+   * Java-friendly version of `corr()`.
+   * @since 1.4.1
+   */
  def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double =
    corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]], method)
@@ -121,6 +133,7 @@ object Statistics {
   *                 `expected` is rescaled if the `expected` sum differs from the `observed` sum.
   * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
   *         the method used, and the null hypothesis.
+   * @since 1.1.0
   */
  def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
    ChiSqTest.chiSquared(observed, expected)
@@ -135,6 +148,7 @@ object Statistics {
   * @param observed Vector containing the observed categorical counts/relative frequencies.
   * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
   *         the method used, and the null hypothesis.
+   * @since 1.1.0
   */
  def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed)
@@ -145,6 +159,7 @@ object Statistics {
   * @param observed The contingency matrix (containing either counts or relative frequencies).
   * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
   *         the method used, and the null hypothesis.
+   * @since 1.1.0
   */
  def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed)
@@ -157,6 +172,7 @@ object Statistics {
   *             Real-valued features will be treated as categorical for each distinct value.
   * @return an array containing the ChiSquaredTestResult for every feature against the label.
   *         The order of the elements in the returned array reflects the order of input features.
+   * @since 1.1.0
   */
  def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
    ChiSqTest.chiSquaredFeatures(data)

--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
@@ -32,6 +32,7 @@ import org.apache.spark.mllib.util.MLUtils
 *
 * @param mu The mean vector of the distribution
 * @param sigma The covariance matrix of the distribution
+ * @since 1.3.0
 */
 @DeveloperApi
 class MultivariateGaussian (
@@ -60,12 +61,16 @@ class MultivariateGaussian (
   */
  private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants
-  /** Returns density of this multivariate Gaussian at given point, x */
+  /** Returns density of this multivariate Gaussian at given point, x.
+   * @since 1.3.0
+   */
  def pdf(x: Vector): Double = {
    pdf(x.toBreeze)
  }
-  /** Returns the log-density of this multivariate Gaussian at given point, x */
+  /** Returns the log-density of this multivariate Gaussian at given point, x.
+   * @since 1.3.0
+   */
  def logpdf(x: Vector): Double = {
    logpdf(x.toBreeze)
  }