Commit 00ae4be9 authored by Xiangrui Meng, committed by DB Tsai

[SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util

Same as #8421 but for `mllib.pmml` and `mllib.util`.

cc dbtsai

Author: Xiangrui Meng <meng@databricks.com>

Closes #8430 from mengxr/SPARK-10239 and squashes the following commits:

a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util
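
For reference, `@Since` is MLlib's annotation for recording the Spark version in which a public API first became available, and it is rendered in the generated API docs. A minimal illustrative sketch of the pattern this patch applies (the trait name below is hypothetical and not part of the diff; `Since` itself is only usable inside the Spark source tree):

    import org.apache.spark.annotation.{DeveloperApi, Since}

    /**
     * :: DeveloperApi ::
     * Hypothetical API used only to illustrate the annotation pattern:
     * the type carries the version it first appeared in, and each public
     * member carries its own version.
     */
    @DeveloperApi
    @Since("1.4.0")
    trait ExampleExportable {

      /** Added one release after the trait itself. */
      @Since("1.5.0")
      def describe(): String
    }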
parent 92059078
Showing 9 changed files with 41 additions and 11 deletions
PMMLExportable.scala
@@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult
 import org.jpmml.model.JAXBUtil
 
 import org.apache.spark.SparkContext
-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
 import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
 
 /**
@@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
  * developed by the Data Mining Group (www.dmg.org).
  */
 @DeveloperApi
+@Since("1.4.0")
 trait PMMLExportable {
 
   /**
@@ -48,6 +49,7 @@ trait PMMLExportable {
    * Export the model to a local file in PMML format
    */
   @Experimental
+  @Since("1.4.0")
   def toPMML(localPath: String): Unit = {
     toPMML(new StreamResult(new File(localPath)))
   }
@@ -57,6 +59,7 @@ trait PMMLExportable {
    * Export the model to a directory on a distributed file system in PMML format
    */
   @Experimental
+  @Since("1.4.0")
   def toPMML(sc: SparkContext, path: String): Unit = {
     val pmml = toPMML()
     sc.parallelize(Array(pmml), 1).saveAsTextFile(path)
@@ -67,6 +70,7 @@ trait PMMLExportable {
    * Export the model to the OutputStream in PMML format
    */
   @Experimental
+  @Since("1.4.0")
   def toPMML(outputStream: OutputStream): Unit = {
     toPMML(new StreamResult(outputStream))
   }
@@ -76,6 +80,7 @@ trait PMMLExportable {
    * Export the model to a String in PMML format
    */
   @Experimental
+  @Since("1.4.0")
   def toPMML(): String = {
     val writer = new StringWriter
     toPMML(new StreamResult(writer))
...
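
As a usage illustration (not part of this patch), any MLlib model that mixes in `PMMLExportable`, such as a trained `KMeansModel`, can be exported through the methods annotated above; a hedged sketch with placeholder data and paths:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.mllib.clustering.KMeans
    import org.apache.spark.mllib.linalg.Vectors

    val sc = new SparkContext(new SparkConf().setAppName("pmml-export").setMaster("local[2]"))
    val points = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0),
      Vectors.dense(8.0, 9.0), Vectors.dense(9.0, 8.0)))

    val model = KMeans.train(points, k = 2, maxIterations = 10)

    model.toPMML("/tmp/kmeans.xml")          // local file
    model.toPMML(sc, "/tmp/kmeans-pmml")     // directory on a distributed file system
    val pmmlString: String = model.toPMML()  // in-memory String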
DataValidators.scala
@@ -17,16 +17,17 @@
 package org.apache.spark.mllib.util
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.Logging
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
 
 /**
  * :: DeveloperApi ::
  * A collection of methods used to validate data before applying ML algorithms.
  */
 @DeveloperApi
+@Since("0.8.0")
 object DataValidators extends Logging {
@@ -34,6 +35,7 @@ object DataValidators extends Logging {
    *
    * @return True if labels are all zero or one, false otherwise.
    */
+  @Since("1.0.0")
   val binaryLabelValidator: RDD[LabeledPoint] => Boolean = { data =>
     val numInvalid = data.filter(x => x.label != 1.0 && x.label != 0.0).count()
     if (numInvalid != 0) {
@@ -48,6 +50,7 @@ object DataValidators extends Logging {
    *
    * @return True if labels are all in the range of {0, 1, ..., k-1}, false otherwise.
    */
+  @Since("1.3.0")
   def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
     val numInvalid = data.filter(x =>
       x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
...
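
The two validators above are plain functions of type `RDD[LabeledPoint] => Boolean`, so they can be applied directly; a hedged sketch assuming an existing `SparkContext` named `sc`:

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.util.DataValidators

    val points = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
      LabeledPoint(1.0, Vectors.dense(3.0, 4.0))))

    val binaryOk = DataValidators.binaryLabelValidator(points)    // true: labels are all 0 or 1
    val multiOk = DataValidators.multiLabelValidator(3)(points)   // true: labels are in {0, 1, 2}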
KMeansDataGenerator.scala
@@ -19,8 +19,8 @@ package org.apache.spark.mllib.util
 import scala.util.Random
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.SparkContext
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.rdd.RDD
 
 /**
@@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD
  * cluster with scale 1 around each center.
  */
 @DeveloperApi
+@Since("0.8.0")
 object KMeansDataGenerator {
@@ -42,6 +43,7 @@ object KMeansDataGenerator {
    * @param r Scaling factor for the distribution of the initial centers
    * @param numPartitions Number of partitions of the generated RDD; default 2
    */
+  @Since("0.8.0")
   def generateKMeansRDD(
       sc: SparkContext,
       numPoints: Int,
@@ -62,6 +64,7 @@ object KMeansDataGenerator {
     }
   }
 
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length < 6) {
       // scalastyle:off println
...
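
A hedged usage sketch for `generateKMeansRDD` (argument values are placeholders; the full parameter list `(sc, numPoints, k, d, r, numPartitions)` is assumed from the docs above):

    import org.apache.spark.mllib.util.KMeansDataGenerator

    // Assuming `sc` is an existing SparkContext.
    // 1000 points in 3 dimensions around 5 centers, center scaling factor 10.0, 2 partitions.
    val kmeansData = KMeansDataGenerator.generateKMeansRDD(sc, 1000, 5, 3, 10.0, 2)
    kmeansData.take(2).foreach(p => println(p.mkString(",")))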
LinearDataGenerator.scala
@@ -22,11 +22,11 @@ import scala.util.Random
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
 
 /**
  * :: DeveloperApi ::
@@ -35,6 +35,7 @@ import org.apache.spark.mllib.regression.LabeledPoint
  * response variable `Y`.
  */
 @DeveloperApi
+@Since("0.8.0")
 object LinearDataGenerator {
@@ -46,6 +47,7 @@ object LinearDataGenerator {
    * @param seed Random seed
    * @return Java List of input.
    */
+  @Since("0.8.0")
   def generateLinearInputAsList(
       intercept: Double,
       weights: Array[Double],
@@ -68,6 +70,7 @@ object LinearDataGenerator {
    * @param eps Epsilon scaling factor.
    * @return Seq of input.
    */
+  @Since("0.8.0")
   def generateLinearInput(
       intercept: Double,
       weights: Array[Double],
@@ -92,6 +95,7 @@ object LinearDataGenerator {
    * @param eps Epsilon scaling factor.
    * @return Seq of input.
    */
+  @Since("0.8.0")
   def generateLinearInput(
       intercept: Double,
       weights: Array[Double],
@@ -132,6 +136,7 @@ object LinearDataGenerator {
    *
    * @return RDD of LabeledPoint containing sample data.
    */
+  @Since("0.8.0")
   def generateLinearRDD(
       sc: SparkContext,
       nexamples: Int,
@@ -151,6 +156,7 @@ object LinearDataGenerator {
     data
   }
 
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length < 2) {
       // scalastyle:off println
...
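
A hedged usage sketch for `generateLinearRDD`, assuming the parameter order `(sc, nexamples, nfeatures, eps, nparts, intercept)`; values are placeholders:

    import org.apache.spark.mllib.util.LinearDataGenerator

    // Assuming `sc` is an existing SparkContext.
    // 200 examples with 10 features, noise scale 0.1, 2 partitions, intercept 0.5.
    val linearData = LinearDataGenerator.generateLinearRDD(sc, 200, 10, 0.1, 2, 0.5)
    println(linearData.first())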
LogisticRegressionDataGenerator.scala
@@ -19,7 +19,7 @@ package org.apache.spark.mllib.util
 import scala.util.Random
 
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -31,6 +31,7 @@ import org.apache.spark.mllib.linalg.Vectors
  * with probability `probOne` and scales features for positive examples by `eps`.
  */
 @DeveloperApi
+@Since("0.8.0")
 object LogisticRegressionDataGenerator {
@@ -43,6 +44,7 @@ object LogisticRegressionDataGenerator {
    * @param nparts Number of partitions of the generated RDD. Default value is 2.
    * @param probOne Probability that a label is 1 (and not 0). Default value is 0.5.
    */
+  @Since("0.8.0")
   def generateLogisticRDD(
       sc: SparkContext,
       nexamples: Int,
@@ -62,6 +64,7 @@ object LogisticRegressionDataGenerator {
     data
   }
 
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length != 5) {
       // scalastyle:off println
...
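
A hedged usage sketch for `generateLogisticRDD`, assuming the parameter order `(sc, nexamples, nfeatures, eps, nparts, probOne)` per the parameter docs above; values are placeholders:

    import org.apache.spark.mllib.util.LogisticRegressionDataGenerator

    // Assuming `sc` is an existing SparkContext.
    // 200 examples with 10 features, eps 1.0, 2 partitions, P(label = 1) = 0.5.
    val logisticData = LogisticRegressionDataGenerator.generateLogisticRDD(sc, 200, 10, 1.0, 2, 0.5)
    println(logisticData.count())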
MFDataGenerator.scala
@@ -23,7 +23,7 @@ import scala.language.postfixOps
 import scala.util.Random
 
 import org.apache.spark.SparkContext
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
 import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix}
 import org.apache.spark.rdd.RDD
@@ -52,7 +52,9 @@ import org.apache.spark.rdd.RDD
  * testSampFact (Double) Percentage of training data to use as test data.
  */
 @DeveloperApi
+@Since("0.8.0")
 object MFDataGenerator {
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length < 2) {
       // scalastyle:off println
...
MLUtils.scala
@@ -36,6 +36,7 @@ import org.apache.spark.streaming.dstream.DStream
 /**
  * Helper methods to load, save and pre-process data used in ML Lib.
  */
+@Since("0.8.0")
 object MLUtils {
 
   private[mllib] lazy val EPSILON = {
@@ -168,6 +169,7 @@ object MLUtils {
    *
    * @see [[org.apache.spark.mllib.util.MLUtils#loadLibSVMFile]]
    */
+  @Since("1.0.0")
   def saveAsLibSVMFile(data: RDD[LabeledPoint], dir: String) {
     // TODO: allow to specify label precision and feature precision.
     val dataStr = data.map { case LabeledPoint(label, features) =>
...
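
The annotated `saveAsLibSVMFile` pairs with the `loadLibSVMFile` referenced in its `@see` tag; a hedged round-trip sketch assuming an existing `SparkContext` named `sc` and a placeholder output path:

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.util.MLUtils

    val examples = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(0.5, 0.0, 2.0)),
      LabeledPoint(0.0, Vectors.sparse(3, Array(1), Array(3.0)))))

    MLUtils.saveAsLibSVMFile(examples, "/tmp/libsvm-data")
    val reloaded = MLUtils.loadLibSVMFile(sc, "/tmp/libsvm-data")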
SVMDataGenerator.scala
@@ -21,11 +21,11 @@ import scala.util.Random
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
 
 /**
  * :: DeveloperApi ::
@@ -33,8 +33,10 @@ import org.apache.spark.mllib.regression.LabeledPoint
  * for the features and adds Gaussian noise with weight 0.1 to generate labels.
  */
 @DeveloperApi
+@Since("0.8.0")
 object SVMDataGenerator {
 
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length < 2) {
       // scalastyle:off println
...
modelSaveLoad.scala
@@ -24,7 +24,7 @@ import org.json4s._
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.SparkContext
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
@@ -35,6 +35,7 @@ import org.apache.spark.sql.types.{DataType, StructField, StructType}
  * This should be inherited by the class which implements model instances.
  */
 @DeveloperApi
+@Since("1.3.0")
 trait Saveable {
@@ -50,6 +51,7 @@ trait Saveable {
    * @param path Path specifying the directory in which to save this model.
    *             If the directory already exists, this method throws an exception.
    */
+  @Since("1.3.0")
   def save(sc: SparkContext, path: String): Unit
 
   /** Current version of model save/load format. */
@@ -64,6 +66,7 @@ trait Saveable {
  * This should be inherited by an object paired with the model class.
  */
 @DeveloperApi
+@Since("1.3.0")
 trait Loader[M <: Saveable] {
@@ -75,6 +78,7 @@ trait Loader[M <: Saveable] {
    * @param path Path specifying the directory to which the model was saved.
    * @return Model instance
    */
+  @Since("1.3.0")
   def load(sc: SparkContext, path: String): M
 }
...
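
`Saveable.save` and `Loader.load` are the persistence pair implemented by most MLlib models; a hedged sketch assuming an existing `SparkContext` named `sc`, a previously trained `LogisticRegressionModel` named `lrModel`, and a placeholder path that does not yet exist:

    import org.apache.spark.mllib.classification.LogisticRegressionModel

    // save() throws if the directory already exists.
    lrModel.save(sc, "/tmp/lr-model")

    // The companion object mixes in Loader, so the model can be read back later.
    val restored = LogisticRegressionModel.load(sc, "/tmp/lr-model")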